diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 8083186117f..0f88f30d42c 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -1,8 +1,9 @@ self-hosted-runner: labels: - builder + - func-tester + - func-tester-aarch64 - fuzzer-unit-tester - stress-tester - style-checker - - func-tester-aarch64 - - func-tester + - style-checker-aarch64 diff --git a/.github/workflows/backport_branches.yml b/.github/workflows/backport_branches.yml index 749c248af82..75f8a63368d 100644 --- a/.github/workflows/backport_branches.yml +++ b/.github/workflows/backport_branches.yml @@ -10,7 +10,7 @@ on: # yamllint disable-line rule:truthy - 'backport/**' jobs: DockerHubPushAarch64: - runs-on: [self-hosted, func-tester-aarch64] + runs-on: [self-hosted, style-checker-aarch64] steps: - name: Clear repository run: | diff --git a/.github/workflows/docs_check.yml b/.github/workflows/docs_check.yml index 633e654d656..d5b56bfef32 100644 --- a/.github/workflows/docs_check.yml +++ b/.github/workflows/docs_check.yml @@ -30,7 +30,7 @@ jobs: python3 run_check.py DockerHubPushAarch64: needs: CheckLabels - runs-on: [self-hosted, func-tester-aarch64] + runs-on: [self-hosted, style-checker-aarch64] steps: - name: Clear repository run: | diff --git a/.github/workflows/docs_release.yml b/.github/workflows/docs_release.yml index 54e1f27ab7c..66838a05552 100644 --- a/.github/workflows/docs_release.yml +++ b/.github/workflows/docs_release.yml @@ -20,7 +20,7 @@ on: # yamllint disable-line rule:truthy workflow_dispatch: jobs: DockerHubPushAarch64: - runs-on: [self-hosted, func-tester-aarch64] + runs-on: [self-hosted, style-checker-aarch64] steps: - name: Clear repository run: | diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 3970e64f959..91b9ea5bf3d 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -9,8 +9,20 @@ on: # yamllint disable-line rule:truthy branches: - 'master' jobs: + PythonUnitTests: + runs-on: [self-hosted, style-checker] + steps: + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Python unit tests + run: | + cd "$GITHUB_WORKSPACE/tests/ci" + python3 -m unittest discover -s . -p '*_test.py' DockerHubPushAarch64: - runs-on: [self-hosted, func-tester-aarch64] + runs-on: [self-hosted, style-checker-aarch64] steps: - name: Clear repository run: | @@ -44,7 +56,7 @@ jobs: name: changed_images_amd64 path: ${{ runner.temp }}/docker_images_check/changed_images_amd64.json DockerHubPush: - needs: [DockerHubPushAmd64, DockerHubPushAarch64] + needs: [DockerHubPushAmd64, DockerHubPushAarch64, PythonUnitTests] runs-on: [self-hosted, style-checker] steps: - name: Clear repository diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 0bd02de48d0..cd8517de8fe 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -31,9 +31,22 @@ jobs: run: | cd "$GITHUB_WORKSPACE/tests/ci" python3 run_check.py + PythonUnitTests: + needs: CheckLabels + runs-on: [self-hosted, style-checker] + steps: + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Python unit tests + run: | + cd "$GITHUB_WORKSPACE/tests/ci" + python3 -m unittest discover -s . 
-p '*_test.py' DockerHubPushAarch64: needs: CheckLabels - runs-on: [self-hosted, func-tester-aarch64] + runs-on: [self-hosted, style-checker-aarch64] steps: - name: Clear repository run: | @@ -68,7 +81,7 @@ jobs: name: changed_images_amd64 path: ${{ runner.temp }}/docker_images_check/changed_images_amd64.json DockerHubPush: - needs: [DockerHubPushAmd64, DockerHubPushAarch64] + needs: [DockerHubPushAmd64, DockerHubPushAarch64, PythonUnitTests] runs-on: [self-hosted, style-checker] steps: - name: Clear repository diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index 4ab2638069c..d916699acc2 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -13,7 +13,7 @@ on: # yamllint disable-line rule:truthy jobs: DockerHubPushAarch64: - runs-on: [self-hosted, func-tester-aarch64] + runs-on: [self-hosted, style-checker-aarch64] steps: - name: Clear repository run: | diff --git a/.gitmodules b/.gitmodules index 04d32f4af40..ed023ab348b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -217,6 +217,9 @@ [submodule "contrib/yaml-cpp"] path = contrib/yaml-cpp url = https://github.com/ClickHouse-Extras/yaml-cpp.git +[submodule "contrib/cld2"] + path = contrib/cld2 + url = https://github.com/ClickHouse-Extras/cld2.git [submodule "contrib/libstemmer_c"] path = contrib/libstemmer_c url = https://github.com/ClickHouse-Extras/libstemmer_c.git @@ -247,6 +250,9 @@ [submodule "contrib/sysroot"] path = contrib/sysroot url = https://github.com/ClickHouse-Extras/sysroot.git +[submodule "contrib/nlp-data"] + path = contrib/nlp-data + url = https://github.com/ClickHouse-Extras/nlp-data.git [submodule "contrib/hive-metastore"] path = contrib/hive-metastore url = https://github.com/ClickHouse-Extras/hive-metastore diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c98693510a..3c846cdd51e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -104,9 +104,8 @@ message (STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") string (TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC) option(USE_STATIC_LIBRARIES "Disable to use shared libraries" ON) -option(MAKE_STATIC_LIBRARIES "Disable to make shared libraries" ${USE_STATIC_LIBRARIES}) -if (NOT MAKE_STATIC_LIBRARIES) +if (NOT USE_STATIC_LIBRARIES) # DEVELOPER ONLY. # Faster linking if turned on. option(SPLIT_SHARED_LIBRARIES "Keep all internal libraries as separate .so files") @@ -115,11 +114,11 @@ if (NOT MAKE_STATIC_LIBRARIES) "Make several binaries (clickhouse-server, clickhouse-client etc.) instead of one bundled") endif () -if (MAKE_STATIC_LIBRARIES AND SPLIT_SHARED_LIBRARIES) - message(FATAL_ERROR "Defining SPLIT_SHARED_LIBRARIES=1 without MAKE_STATIC_LIBRARIES=0 has no effect.") +if (USE_STATIC_LIBRARIES AND SPLIT_SHARED_LIBRARIES) + message(FATAL_ERROR "Defining SPLIT_SHARED_LIBRARIES=1 without USE_STATIC_LIBRARIES=0 has no effect.") endif() -if (NOT MAKE_STATIC_LIBRARIES AND SPLIT_SHARED_LIBRARIES) +if (NOT USE_STATIC_LIBRARIES AND SPLIT_SHARED_LIBRARIES) set(BUILD_SHARED_LIBS 1 CACHE INTERNAL "") endif () @@ -201,21 +200,13 @@ endif () option(ENABLE_TESTS "Provide unit_test_dbms target with Google.Test unit tests" ON) option(ENABLE_EXAMPLES "Build all example programs in 'examples' subdirectories" OFF) -if (OS_LINUX AND (ARCH_AMD64 OR ARCH_AARCH64) AND MAKE_STATIC_LIBRARIES AND NOT SPLIT_SHARED_LIBRARIES AND NOT USE_MUSL) +if (OS_LINUX AND (ARCH_AMD64 OR ARCH_AARCH64) AND USE_STATIC_LIBRARIES AND NOT SPLIT_SHARED_LIBRARIES AND NOT USE_MUSL) # Only for Linux, x86_64 or aarch64. 
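Editor's note: the CMakeLists.txt hunk above folds the old MAKE_STATIC_LIBRARIES switch into a single USE_STATIC_LIBRARIES option and keeps the guard against the contradictory combination with SPLIT_SHARED_LIBRARIES. Below is a minimal standalone sketch of that option logic; the cmake_minimum_required/project scaffolding and the final message() are added here only for self-containment and are not part of the patch.

```cmake
cmake_minimum_required(VERSION 3.14)
project(static_libs_demo NONE)

option(USE_STATIC_LIBRARIES "Disable to use shared libraries" ON)

if (NOT USE_STATIC_LIBRARIES)
    # DEVELOPER ONLY: faster incremental linking when every internal library is its own .so.
    option(SPLIT_SHARED_LIBRARIES "Keep all internal libraries as separate .so files" OFF)
endif ()

if (USE_STATIC_LIBRARIES AND SPLIT_SHARED_LIBRARIES)
    message(FATAL_ERROR "Defining SPLIT_SHARED_LIBRARIES=1 without USE_STATIC_LIBRARIES=0 has no effect.")
endif ()

if (NOT USE_STATIC_LIBRARIES AND SPLIT_SHARED_LIBRARIES)
    # Make add_library() default to shared libraries for the whole tree.
    set(BUILD_SHARED_LIBS 1 CACHE INTERNAL "")
endif ()

message(STATUS "USE_STATIC_LIBRARIES=${USE_STATIC_LIBRARIES} SPLIT_SHARED=${SPLIT_SHARED_LIBRARIES}")
```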
option(GLIBC_COMPATIBILITY "Enable compatibility with older glibc libraries." ON) elseif(GLIBC_COMPATIBILITY) message (${RECONFIGURE_MESSAGE_LEVEL} "Glibc compatibility cannot be enabled in current configuration") endif () -if (GLIBC_COMPATIBILITY) - # NOTE: we may also want to check glibc version and add -include only for 2.32+ - # however this is extra complexity, especially for cross compiling. - # And anyway it should not break anything for <2.32. - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -include ${CMAKE_CURRENT_SOURCE_DIR}/base/glibc-compatibility/glibc-compat-2.32.h") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -include ${CMAKE_CURRENT_SOURCE_DIR}/base/glibc-compatibility/glibc-compat-2.32.h") -endif() - # Make sure the final executable has symbols exported set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -rdynamic") @@ -256,8 +247,6 @@ endif() if (CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG") set(USE_DEBUG_HELPERS ON) -else () - set(USE_DEBUG_HELPERS ON) endif() option(USE_DEBUG_HELPERS "Enable debug helpers" ${USE_DEBUG_HELPERS}) @@ -412,17 +401,6 @@ else () option(WERROR "Enable -Werror compiler option" ON) endif () -if (WERROR) - # Don't pollute CMAKE_CXX_FLAGS with -Werror as it will break some CMake checks. - # Instead, adopt modern cmake usage requirement. - target_compile_options(global-libs INTERFACE "-Werror") -endif () - -# Make this extra-checks for correct library dependencies. -if (OS_LINUX AND NOT SANITIZE) - target_link_options(global-libs INTERFACE "-Wl,--no-undefined") -endif () - # Increase stack size on Musl. We need big stack for our recursive-descend parser. if (USE_MUSL) set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-z,stack-size=2097152") @@ -430,6 +408,7 @@ endif () include(cmake/dbms_glob_sources.cmake) +add_library(global-group INTERFACE) if (OS_LINUX OR OS_ANDROID) include(cmake/linux/default_libs.cmake) elseif (OS_DARWIN) @@ -437,6 +416,18 @@ elseif (OS_DARWIN) elseif (OS_FREEBSD) include(cmake/freebsd/default_libs.cmake) endif () +link_libraries(global-group) + +if (WERROR) + # Don't pollute CMAKE_CXX_FLAGS with -Werror as it will break some CMake checks. + # Instead, adopt modern cmake usage requirement. + target_compile_options(global-group INTERFACE "-Werror") +endif () + +# Make this extra-checks for correct library dependencies. +if (OS_LINUX AND NOT SANITIZE) + target_link_options(global-group INTERFACE "-Wl,--no-undefined") +endif () ###################################### ### Add targets below this comment ### @@ -444,7 +435,7 @@ endif () set (CMAKE_POSTFIX_VARIABLE "CMAKE_${CMAKE_BUILD_TYPE_UC}_POSTFIX") -if (MAKE_STATIC_LIBRARIES) +if (USE_STATIC_LIBRARIES) set (CMAKE_POSITION_INDEPENDENT_CODE OFF) if (OS_LINUX AND NOT ARCH_ARM) # Slightly more efficient code can be generated @@ -480,7 +471,6 @@ endif () message (STATUS "Building for: ${CMAKE_SYSTEM} ${CMAKE_SYSTEM_PROCESSOR} ${CMAKE_LIBRARY_ARCHITECTURE} ; USE_STATIC_LIBRARIES=${USE_STATIC_LIBRARIES} - MAKE_STATIC_LIBRARIES=${MAKE_STATIC_LIBRARIES} SPLIT_SHARED=${SPLIT_SHARED_LIBRARIES} CCACHE=${CCACHE_FOUND} ${CCACHE_VERSION}") @@ -528,7 +518,7 @@ macro (add_executable target) # - _je_zone_register due to JEMALLOC_PRIVATE_NAMESPACE=je_ under OS X. 
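Editor's note: the root CMakeLists.txt now creates the `global-group` interface target before including the per-OS default_libs files, and attaches -Werror and -Wl,--no-undefined to that target instead of polluting CMAKE_CXX_FLAGS. The following is a reduced, self-contained sketch of the pattern under stated assumptions: the `demo_tool`/`main.cpp` target and the CMAKE_SYSTEM_NAME check are illustrative stand-ins for ClickHouse's real targets and its OS_LINUX/SANITIZE variables.

```cmake
cmake_minimum_required(VERSION 3.14)
project(global_group_demo CXX)

option(WERROR "Enable -Werror compiler option" ON)

# One interface target carries the flags every other target must inherit.
add_library(global-group INTERFACE)

# In ClickHouse the per-OS cmake/<os>/default_libs.cmake is included at this point
# and populates global-group via target_link_libraries(global-group INTERFACE ...).

# Every target defined below links the group implicitly.
link_libraries(global-group)

if (WERROR)
    # Keep -Werror out of CMAKE_CXX_FLAGS so configure-time compiler checks still pass;
    # attach it to the interface target instead.
    target_compile_options(global-group INTERFACE "-Werror")
endif ()

if (CMAKE_SYSTEM_NAME STREQUAL "Linux")   # stand-in for the OS_LINUX AND NOT SANITIZE check
    # Fail the link if any library has unresolved symbols.
    target_link_options(global-group INTERFACE "-Wl,--no-undefined")
endif ()

add_executable(demo_tool main.cpp)        # hypothetical target; it inherits the flags above
```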
# - but jemalloc-cmake does not run private_namespace.sh # so symbol name should be _zone_register - if (ENABLE_JEMALLOC AND MAKE_STATIC_LIBRARIES AND OS_DARWIN) + if (ENABLE_JEMALLOC AND USE_STATIC_LIBRARIES AND OS_DARWIN) set_property(TARGET ${target} APPEND PROPERTY LINK_OPTIONS -u_zone_register) endif() endif() @@ -547,6 +537,4 @@ add_subdirectory (programs) add_subdirectory (tests) add_subdirectory (utils) -include (cmake/print_include_directories.cmake) - include (cmake/sanitize_target_link_libraries.cmake) diff --git a/LICENSE b/LICENSE index 80dbd30140b..8b0ac080f01 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright 2016-2021 ClickHouse, Inc. +Copyright 2016-2022 ClickHouse, Inc. Apache License Version 2.0, January 2004 @@ -188,7 +188,7 @@ Copyright 2016-2021 ClickHouse, Inc. same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright 2016-2021 ClickHouse, Inc. + Copyright 2016-2022 ClickHouse, Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/base/base/CMakeLists.txt b/base/base/CMakeLists.txt index 9201a852373..3e6f174c6dc 100644 --- a/base/base/CMakeLists.txt +++ b/base/base/CMakeLists.txt @@ -42,7 +42,7 @@ endif () target_include_directories(common PUBLIC .. "${CMAKE_CURRENT_BINARY_DIR}/..") -if (OS_DARWIN AND NOT MAKE_STATIC_LIBRARIES) +if (OS_DARWIN AND NOT USE_STATIC_LIBRARIES) target_link_libraries(common PUBLIC -Wl,-U,_inside_main) endif() diff --git a/base/base/LineReader.cpp b/base/base/LineReader.cpp index 9491f957762..d028dace52a 100644 --- a/base/base/LineReader.cpp +++ b/base/base/LineReader.cpp @@ -2,7 +2,9 @@ #include #include +#include +#include #include #include #include @@ -34,13 +36,37 @@ bool hasInputData() return select(1, &fds, nullptr, nullptr, &timeout) == 1; } +struct NoCaseCompare +{ + bool operator()(const std::string & str1, const std::string & str2) + { + return std::lexicographical_compare(begin(str1), end(str1), begin(str2), end(str2), [](const char c1, const char c2) + { + return std::tolower(c1) < std::tolower(c2); + }); + } +}; + +using Words = std::vector; +template +void addNewWords(Words & to, const Words & from, Compare comp) +{ + size_t old_size = to.size(); + size_t new_size = old_size + from.size(); + + to.reserve(new_size); + to.insert(to.end(), from.begin(), from.end()); + auto middle = to.begin() + old_size; + std::inplace_merge(to.begin(), middle, to.end(), comp); + + auto last_unique = std::unique(to.begin(), to.end()); + to.erase(last_unique, to.end()); } -std::optional LineReader::Suggest::getCompletions(const String & prefix, size_t prefix_length) const -{ - if (!ready) - return std::nullopt; +} +replxx::Replxx::completions_t LineReader::Suggest::getCompletions(const String & prefix, size_t prefix_length) +{ std::string_view last_word; auto last_word_pos = prefix.find_last_of(word_break_characters); @@ -48,21 +74,45 @@ std::optional LineReader::Suggest::getCompletio last_word = prefix; else last_word = std::string_view(prefix).substr(last_word_pos + 1, std::string::npos); - /// last_word can be empty. 
+ std::pair range; + + std::lock_guard lock(mutex); + /// Only perform case sensitive completion when the prefix string contains any uppercase characters if (std::none_of(prefix.begin(), prefix.end(), [&](auto c) { return c >= 'A' && c <= 'Z'; })) - return std::equal_range( + range = std::equal_range( words_no_case.begin(), words_no_case.end(), last_word, [prefix_length](std::string_view s, std::string_view prefix_searched) { return strncasecmp(s.data(), prefix_searched.data(), prefix_length) < 0; }); else - return std::equal_range(words.begin(), words.end(), last_word, [prefix_length](std::string_view s, std::string_view prefix_searched) + range = std::equal_range(words.begin(), words.end(), last_word, [prefix_length](std::string_view s, std::string_view prefix_searched) { return strncmp(s.data(), prefix_searched.data(), prefix_length) < 0; }); + + return replxx::Replxx::completions_t(range.first, range.second); +} + +void LineReader::Suggest::addWords(Words && new_words) +{ + Words new_words_no_case = new_words; + if (!new_words.empty()) + { + std::sort(new_words.begin(), new_words.end()); + std::sort(new_words_no_case.begin(), new_words_no_case.end(), NoCaseCompare{}); + } + + { + std::lock_guard lock(mutex); + addNewWords(words, new_words, std::less{}); + addNewWords(words_no_case, new_words_no_case, NoCaseCompare{}); + } + + assert(std::is_sorted(words.begin(), words.end())); + assert(std::is_sorted(words_no_case.begin(), words_no_case.end(), NoCaseCompare{})); } LineReader::LineReader(const String & history_file_path_, bool multiline_, Patterns extenders_, Patterns delimiters_) diff --git a/base/base/LineReader.h b/base/base/LineReader.h index 12a856e2051..33daae49974 100644 --- a/base/base/LineReader.h +++ b/base/base/LineReader.h @@ -1,10 +1,12 @@ #pragma once -#include - +#include #include #include #include +#include + +#include class LineReader { @@ -12,14 +14,16 @@ public: struct Suggest { using Words = std::vector; - using WordsRange = std::pair; + /// Get vector for the matched range of words if any. + replxx::Replxx::completions_t getCompletions(const String & prefix, size_t prefix_length); + void addWords(Words && new_words); + + private: Words words; Words words_no_case; - std::atomic ready{false}; - /// Get iterators for the matched range of words if any. 
- std::optional getCompletions(const String & prefix, size_t prefix_length) const; + std::mutex mutex; }; using Patterns = std::vector; diff --git a/base/base/ReplxxLineReader.cpp b/base/base/ReplxxLineReader.cpp index 5d99da99c8c..6ba63a00d01 100644 --- a/base/base/ReplxxLineReader.cpp +++ b/base/base/ReplxxLineReader.cpp @@ -133,7 +133,7 @@ void convertHistoryFile(const std::string & path, replxx::Replxx & rx) } ReplxxLineReader::ReplxxLineReader( - const Suggest & suggest, + Suggest & suggest, const String & history_file_path_, bool multiline_, Patterns extenders_, @@ -179,9 +179,7 @@ ReplxxLineReader::ReplxxLineReader( auto callback = [&suggest] (const String & context, size_t context_size) { - if (auto range = suggest.getCompletions(context, context_size)) - return Replxx::completions_t(range->first, range->second); - return Replxx::completions_t(); + return suggest.getCompletions(context, context_size); }; rx.set_completion_callback(callback); diff --git a/base/base/ReplxxLineReader.h b/base/base/ReplxxLineReader.h index d4cc7de1e7a..4a975d2975d 100644 --- a/base/base/ReplxxLineReader.h +++ b/base/base/ReplxxLineReader.h @@ -9,7 +9,7 @@ class ReplxxLineReader : public LineReader { public: ReplxxLineReader( - const Suggest & suggest, + Suggest & suggest, const String & history_file_path, bool multiline, Patterns extenders_, diff --git a/base/daemon/CMakeLists.txt b/base/daemon/CMakeLists.txt index 2a4d3d33ff9..ae8f51cabd3 100644 --- a/base/daemon/CMakeLists.txt +++ b/base/daemon/CMakeLists.txt @@ -6,7 +6,7 @@ add_library (daemon target_include_directories (daemon PUBLIC ..) -if (OS_DARWIN AND NOT MAKE_STATIC_LIBRARIES) +if (OS_DARWIN AND NOT USE_STATIC_LIBRARIES) target_link_libraries (daemon PUBLIC -Wl,-undefined,dynamic_lookup) endif() diff --git a/base/glibc-compatibility/CMakeLists.txt b/base/glibc-compatibility/CMakeLists.txt index 4fc2a002cd8..ddec09121e1 100644 --- a/base/glibc-compatibility/CMakeLists.txt +++ b/base/glibc-compatibility/CMakeLists.txt @@ -37,7 +37,7 @@ if (GLIBC_COMPATIBILITY) target_include_directories(glibc-compatibility PRIVATE libcxxabi ${musl_arch_include_dir}) - if (NOT USE_STATIC_LIBRARIES AND NOT MAKE_STATIC_LIBRARIES) + if (NOT USE_STATIC_LIBRARIES AND NOT USE_STATIC_LIBRARIES) target_compile_options(glibc-compatibility PRIVATE -fPIC) endif () diff --git a/base/glibc-compatibility/glibc-compat-2.32.h b/base/glibc-compatibility/glibc-compat-2.32.h deleted file mode 100644 index 53ed34d60fa..00000000000 --- a/base/glibc-compatibility/glibc-compat-2.32.h +++ /dev/null @@ -1,50 +0,0 @@ -/// In glibc 2.32 new version of some symbols had been added [1]: -/// -/// $ nm -D clickhouse | fgrep -e @GLIBC_2.32 -/// U pthread_getattr_np@GLIBC_2.32 -/// U pthread_sigmask@GLIBC_2.32 -/// -/// [1]: https://www.spinics.net/lists/fedora-devel/msg273044.html -/// -/// Right now ubuntu 20.04 is used as official image for building -/// ClickHouse, however once it will be switched someone may not be happy -/// with that fact that he/she cannot use official binaries anymore because -/// they have glibc < 2.32. -/// -/// To avoid this dependency, let's force previous version of those -/// symbols from glibc. 
-/// -/// Also note, that the following approach had been tested: -/// a) -Wl,--wrap -- but it goes into endless recursion whey you try to do -/// something like this: -/// -/// int __pthread_getattr_np_compact(pthread_t thread, pthread_attr_t *attr); -/// GLIBC_COMPAT_SYMBOL(__pthread_getattr_np_compact, pthread_getattr_np) -/// int __pthread_getattr_np_compact(pthread_t thread, pthread_attr_t *attr); -/// int __wrap_pthread_getattr_np(pthread_t thread, pthread_attr_t *attr) -/// { -/// return __pthread_getattr_np_compact(thread, attr); -/// } -/// -/// int __pthread_sigmask_compact(int how, const sigset_t *set, sigset_t *oldset); -/// GLIBC_COMPAT_SYMBOL(__pthread_sigmask_compact, pthread_sigmask) -/// int __pthread_sigmask_compact(int how, const sigset_t *set, sigset_t *oldset); -/// int __wrap_pthread_sigmask(int how, const sigset_t *set, sigset_t *oldset) -/// { -/// return __pthread_sigmask_compact(how, set, oldset); -/// } -/// -/// b) -Wl,--defsym -- same problems (and you cannot use version of symbol with -/// version in the expression) -/// c) this approach -- simply add this file with -include directive. - -#if defined(__amd64__) -#define GLIBC_COMPAT_SYMBOL(func) __asm__(".symver " #func "," #func "@GLIBC_2.2.5"); -#elif defined(__aarch64__) -#define GLIBC_COMPAT_SYMBOL(func) __asm__(".symver " #func "," #func "@GLIBC_2.17"); -#else -#error Your platform is not supported. -#endif - -GLIBC_COMPAT_SYMBOL(pthread_sigmask) -GLIBC_COMPAT_SYMBOL(pthread_getattr_np) diff --git a/cmake/cpu_features.cmake b/cmake/cpu_features.cmake index 20c61ead3d2..535d1b3c93e 100644 --- a/cmake/cpu_features.cmake +++ b/cmake/cpu_features.cmake @@ -134,7 +134,7 @@ else () set (COMPILER_FLAGS "${COMPILER_FLAGS} ${TEST_FLAG}") endif () - set (TEST_FLAG "-mavx512f -mavx512bw") + set (TEST_FLAG "-mavx512f -mavx512bw -mavx512vl") set (CMAKE_REQUIRED_FLAGS "${TEST_FLAG} -O0") check_cxx_source_compiles(" #include @@ -143,6 +143,8 @@ else () (void)a; auto b = _mm512_add_epi16(__m512i(), __m512i()); (void)b; + auto c = _mm_cmp_epi8_mask(__m128i(), __m128i(), 0); + (void)c; return 0; } " HAVE_AVX512) @@ -181,7 +183,7 @@ else () set (X86_INTRINSICS_FLAGS "${X86_INTRINSICS_FLAGS} -mbmi") endif () if (HAVE_AVX512) - set (X86_INTRINSICS_FLAGS "${X86_INTRINSICS_FLAGS} -mavx512f -mavx512bw -mprefer-vector-width=256") + set (X86_INTRINSICS_FLAGS "${X86_INTRINSICS_FLAGS} -mavx512f -mavx512bw -mavx512vl -mprefer-vector-width=256") endif () endif () endif () diff --git a/cmake/darwin/default_libs.cmake b/cmake/darwin/default_libs.cmake index a6ee800d59b..ca4beaea8b6 100644 --- a/cmake/darwin/default_libs.cmake +++ b/cmake/darwin/default_libs.cmake @@ -24,14 +24,10 @@ find_package(Threads REQUIRED) include (cmake/find/cxx.cmake) -add_library(global-group INTERFACE) - target_link_libraries(global-group INTERFACE $ ) -link_libraries(global-group) - # FIXME: remove when all contribs will get custom cmake lists install( TARGETS global-group global-libs diff --git a/cmake/find/cxx.cmake b/cmake/find/cxx.cmake index 4a8620930e8..7d93bf05fc7 100644 --- a/cmake/find/cxx.cmake +++ b/cmake/find/cxx.cmake @@ -1,31 +1,8 @@ -option (USE_LIBCXX "Use libc++ and libc++abi instead of libstdc++" ON) - -if (NOT USE_LIBCXX) - target_link_libraries(global-libs INTERFACE -l:libstdc++.a -l:libstdc++fs.a) # Always link these libraries as static - target_link_libraries(global-libs INTERFACE ${EXCEPTION_HANDLING_LIBRARY}) - return() -endif() - set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_LIBCPP_DEBUG=0") # More checks in debug build. 
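Editor's note: the cpu_features.cmake hunk above extends the AVX-512 probe so that -mavx512vl is also required, using _mm_cmp_epi8_mask (a mask operation on 128-bit vectors that needs AVX-512VL) as the witness. A standalone version of the check is sketched below, assuming the stripped #include in the probe is <immintrin.h>; the project scaffolding is added for self-containment, and the real probe contains one more statement that falls between the two hunks above.

```cmake
cmake_minimum_required(VERSION 3.14)
project(avx512_probe CXX)

include(CheckCXXSourceCompiles)

set (TEST_FLAG "-mavx512f -mavx512bw -mavx512vl")
set (CMAKE_REQUIRED_FLAGS "${TEST_FLAG} -O0")
check_cxx_source_compiles("
    #include <immintrin.h>
    int main() {
        auto b = _mm512_add_epi16(__m512i(), __m512i()); (void)b;      // AVX-512BW
        auto c = _mm_cmp_epi8_mask(__m128i(), __m128i(), 0); (void)c;  // AVX-512VL
        return 0;
    }
" HAVE_AVX512)

if (HAVE_AVX512)
    set (X86_INTRINSICS_FLAGS "${X86_INTRINSICS_FLAGS} ${TEST_FLAG} -mprefer-vector-width=256")
endif ()
message(STATUS "HAVE_AVX512=${HAVE_AVX512}")
```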
-if (NOT HAVE_LIBCXX AND NOT MISSING_INTERNAL_LIBCXX_LIBRARY) - set (LIBCXX_LIBRARY cxx) - set (LIBCXXABI_LIBRARY cxxabi) - add_subdirectory(contrib/libcxxabi-cmake) - add_subdirectory(contrib/libcxx-cmake) +add_subdirectory(contrib/libcxxabi-cmake) +add_subdirectory(contrib/libcxx-cmake) - # Exception handling library is embedded into libcxxabi. +# Exception handling library is embedded into libcxxabi. - set (HAVE_LIBCXX 1) -endif () - -if (HAVE_LIBCXX) - target_link_libraries(global-libs INTERFACE ${LIBCXX_LIBRARY} ${LIBCXXABI_LIBRARY} ${LIBCXXFS_LIBRARY}) - - message (STATUS "Using libcxx: ${LIBCXX_LIBRARY}") - message (STATUS "Using libcxxfs: ${LIBCXXFS_LIBRARY}") - message (STATUS "Using libcxxabi: ${LIBCXXABI_LIBRARY}") -else() - target_link_libraries(global-libs INTERFACE -l:libstdc++.a -l:libstdc++fs.a) # Always link these libraries as static - target_link_libraries(global-libs INTERFACE ${EXCEPTION_HANDLING_LIBRARY}) -endif() +target_link_libraries(global-libs INTERFACE cxx cxxabi) diff --git a/cmake/freebsd/default_libs.cmake b/cmake/freebsd/default_libs.cmake index a5847c95387..f7a333df6e6 100644 --- a/cmake/freebsd/default_libs.cmake +++ b/cmake/freebsd/default_libs.cmake @@ -25,14 +25,10 @@ find_package(Threads REQUIRED) include (cmake/find/unwind.cmake) include (cmake/find/cxx.cmake) -add_library(global-group INTERFACE) - target_link_libraries(global-group INTERFACE $ ) -link_libraries(global-group) - # FIXME: remove when all contribs will get custom cmake lists install( TARGETS global-group global-libs diff --git a/cmake/linux/default_libs.cmake b/cmake/linux/default_libs.cmake index 426ae482ea3..98951822015 100644 --- a/cmake/linux/default_libs.cmake +++ b/cmake/linux/default_libs.cmake @@ -45,15 +45,12 @@ endif () include (cmake/find/unwind.cmake) include (cmake/find/cxx.cmake) -add_library(global-group INTERFACE) target_link_libraries(global-group INTERFACE -Wl,--start-group $ -Wl,--end-group ) -link_libraries(global-group) - # FIXME: remove when all contribs will get custom cmake lists install( TARGETS global-group global-libs diff --git a/cmake/print_include_directories.cmake b/cmake/print_include_directories.cmake deleted file mode 100644 index bea02b2b2cc..00000000000 --- a/cmake/print_include_directories.cmake +++ /dev/null @@ -1,29 +0,0 @@ - -# TODO? Maybe recursive collect on all deps - -get_property (dirs1 TARGET dbms PROPERTY INCLUDE_DIRECTORIES) -list(APPEND dirs ${dirs1}) - -get_property (dirs1 TARGET clickhouse_common_io PROPERTY INCLUDE_DIRECTORIES) -list(APPEND dirs ${dirs1}) - -get_property (dirs1 TARGET common PROPERTY INCLUDE_DIRECTORIES) -list(APPEND dirs ${dirs1}) - -get_property (dirs1 TARGET ch_contrib::cityhash PROPERTY INCLUDE_DIRECTORIES) -list(APPEND dirs ${dirs1}) - -get_property (dirs1 TARGET roaring PROPERTY INCLUDE_DIRECTORIES) -list(APPEND dirs ${dirs1}) - -if (TARGET ch_contrib::double_conversion) - get_property (dirs1 TARGET ch_contrib::double_conversion PROPERTY INCLUDE_DIRECTORIES) - list(APPEND dirs ${dirs1}) -endif () - -list(REMOVE_DUPLICATES dirs) -file (WRITE ${CMAKE_CURRENT_BINARY_DIR}/include_directories.txt "") -foreach (dir ${dirs}) - string (REPLACE "${ClickHouse_SOURCE_DIR}" "." 
dir "${dir}") - file (APPEND ${CMAKE_CURRENT_BINARY_DIR}/include_directories.txt "-I ${dir} ") -endforeach () diff --git a/cmake/sanitize.cmake b/cmake/sanitize.cmake index f052948e731..73610545009 100644 --- a/cmake/sanitize.cmake +++ b/cmake/sanitize.cmake @@ -23,7 +23,7 @@ if (SANITIZE) if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ASAN_FLAGS}") endif() - if (MAKE_STATIC_LIBRARIES AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if (USE_STATIC_LIBRARIES AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libasan") endif () if (COMPILER_GCC) @@ -48,7 +48,7 @@ if (SANITIZE) if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=memory") endif() - if (MAKE_STATIC_LIBRARIES AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if (USE_STATIC_LIBRARIES AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libmsan") endif () @@ -69,7 +69,7 @@ if (SANITIZE) if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=thread") endif() - if (MAKE_STATIC_LIBRARIES AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if (USE_STATIC_LIBRARIES AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libtsan") endif () if (COMPILER_GCC) @@ -101,7 +101,7 @@ if (SANITIZE) if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=undefined") endif() - if (MAKE_STATIC_LIBRARIES AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if (USE_STATIC_LIBRARIES AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libubsan") endif () if (COMPILER_GCC) diff --git a/cmake/warnings.cmake b/cmake/warnings.cmake index ecc31529dc4..2093d3dcc87 100644 --- a/cmake/warnings.cmake +++ b/cmake/warnings.cmake @@ -55,11 +55,6 @@ if (COMPILER_CLANG) no_warning(weak-template-vtables) no_warning(weak-vtables) - # XXX: libstdc++ has some of these for 3way compare - if (NOT USE_LIBCXX) - no_warning(zero-as-null-pointer-constant) - endif() - # TODO Enable conversion, sign-conversion, double-promotion warnings. else () add_warning(comma) @@ -98,10 +93,7 @@ if (COMPILER_CLANG) add_warning(tautological-bitwise-compare) # XXX: libstdc++ has some of these for 3way compare - if (USE_LIBCXX) - add_warning(zero-as-null-pointer-constant) - endif() - + add_warning(zero-as-null-pointer-constant) endif () elseif (COMPILER_GCC) # Add compiler options only to c++ compiler @@ -183,11 +175,8 @@ elseif (COMPILER_GCC) add_cxx_compile_options(-Wundef) # Warn if vector operation is not implemented via SIMD capabilities of the architecture add_cxx_compile_options(-Wvector-operation-performance) - # XXX: libstdc++ has some of these for 3way compare - if (USE_LIBCXX) - # Warn when a literal 0 is used as null pointer constant. - add_cxx_compile_options(-Wzero-as-null-pointer-constant) - endif() + # Warn when a literal 0 is used as null pointer constant. 
+ add_cxx_compile_options(-Wzero-as-null-pointer-constant) if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 10) # XXX: gcc10 stuck with this option while compiling GatherUtils code diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index b748818b66d..6172f231b6e 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -140,6 +140,8 @@ if (ENABLE_NLP) add_contrib (libstemmer-c-cmake libstemmer_c) add_contrib (wordnet-blast-cmake wordnet-blast) add_contrib (lemmagen-c-cmake lemmagen-c) + add_contrib (nlp-data-cmake nlp-data) + add_contrib (cld2-cmake cld2) endif() add_contrib (sqlite-cmake sqlite-amalgamation) diff --git a/contrib/NuRaft b/contrib/NuRaft index c2043aa250e..1707a7572aa 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit c2043aa250e53ad5cf75e596e319d587af4dcb3c +Subproject commit 1707a7572aa66ec5d0a2dbe2bf5effa3352e6b2d diff --git a/contrib/abseil-cpp-cmake/CMakeLists.txt b/contrib/abseil-cpp-cmake/CMakeLists.txt index 65e4c24ff5a..4fb02327d17 100644 --- a/contrib/abseil-cpp-cmake/CMakeLists.txt +++ b/contrib/abseil-cpp-cmake/CMakeLists.txt @@ -6,15 +6,17 @@ set(BUILD_TESTING OFF) set(ABSL_PROPAGATE_CXX_STD ON) add_subdirectory("${ABSL_ROOT_DIR}" "${ClickHouse_BINARY_DIR}/contrib/abseil-cpp") -add_library(abseil_swiss_tables INTERFACE) +add_library(_abseil_swiss_tables INTERFACE) -target_link_libraries(abseil_swiss_tables INTERFACE +target_link_libraries(_abseil_swiss_tables INTERFACE absl::flat_hash_map absl::flat_hash_set ) get_target_property(FLAT_HASH_MAP_INCLUDE_DIR absl::flat_hash_map INTERFACE_INCLUDE_DIRECTORIES) -target_include_directories (abseil_swiss_tables SYSTEM BEFORE INTERFACE ${FLAT_HASH_MAP_INCLUDE_DIR}) +target_include_directories (_abseil_swiss_tables SYSTEM BEFORE INTERFACE ${FLAT_HASH_MAP_INCLUDE_DIR}) get_target_property(FLAT_HASH_SET_INCLUDE_DIR absl::flat_hash_set INTERFACE_INCLUDE_DIRECTORIES) -target_include_directories (abseil_swiss_tables SYSTEM BEFORE INTERFACE ${FLAT_HASH_SET_INCLUDE_DIR}) +target_include_directories (_abseil_swiss_tables SYSTEM BEFORE INTERFACE ${FLAT_HASH_SET_INCLUDE_DIR}) + +add_library(ch_contrib::abseil_swiss_tables ALIAS _abseil_swiss_tables) diff --git a/contrib/arrow-cmake/CMakeLists.txt b/contrib/arrow-cmake/CMakeLists.txt index 5e500877f3c..54bfead6da7 100644 --- a/contrib/arrow-cmake/CMakeLists.txt +++ b/contrib/arrow-cmake/CMakeLists.txt @@ -29,7 +29,7 @@ if (OS_FREEBSD) message (FATAL_ERROR "Using internal parquet library on FreeBSD is not supported") endif() -if(MAKE_STATIC_LIBRARIES) +if(USE_STATIC_LIBRARIES) set(FLATBUFFERS_LIBRARY flatbuffers) else() set(FLATBUFFERS_LIBRARY flatbuffers_shared) @@ -84,7 +84,7 @@ set(FLATBUFFERS_BINARY_DIR "${ClickHouse_BINARY_DIR}/contrib/flatbuffers") set(FLATBUFFERS_INCLUDE_DIR "${FLATBUFFERS_SRC_DIR}/include") # set flatbuffers CMake options -if (MAKE_STATIC_LIBRARIES) +if (USE_STATIC_LIBRARIES) set(FLATBUFFERS_BUILD_FLATLIB ON CACHE BOOL "Enable the build of the flatbuffers library") set(FLATBUFFERS_BUILD_SHAREDLIB OFF CACHE BOOL "Disable the build of the flatbuffers shared library") else () diff --git a/contrib/aws-s3-cmake/CMakeLists.txt b/contrib/aws-s3-cmake/CMakeLists.txt index 9c0b3ce192c..de6486e58fd 100644 --- a/contrib/aws-s3-cmake/CMakeLists.txt +++ b/contrib/aws-s3-cmake/CMakeLists.txt @@ -91,30 +91,30 @@ set(S3_INCLUDES "${CMAKE_CURRENT_BINARY_DIR}/include/" ) -add_library(aws_s3_checksums ${AWS_CHECKSUMS_SOURCES}) -target_include_directories(aws_s3_checksums SYSTEM PUBLIC "${AWS_CHECKSUMS_LIBRARY_DIR}/include/") 
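Editor's note: most of the contrib changes that follow apply one convention: the buildable target gets a leading underscore, and a single namespaced alias is the only name the rest of the tree is supposed to link against. A generic sketch of that shape is shown below; `example`, its source paths, and the consumer target are placeholders, not names taken from this patch.

```cmake
set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/example")   # 'example' is a placeholder

add_library(_example "${LIBRARY_DIR}/src/example.c")           # internal, underscore-prefixed name
target_include_directories(_example SYSTEM BEFORE PUBLIC "${LIBRARY_DIR}/include")

add_library(ch_contrib::example ALIAS _example)                # the only public name

# Consumers never reference _example directly:
#   target_link_libraries(clickhouse_common_io PRIVATE ch_contrib::example)
```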
+add_library(_aws_s3_checksums ${AWS_CHECKSUMS_SOURCES}) +target_include_directories(_aws_s3_checksums SYSTEM PUBLIC "${AWS_CHECKSUMS_LIBRARY_DIR}/include/") if(CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG") - target_compile_definitions(aws_s3_checksums PRIVATE "-DDEBUG_BUILD") + target_compile_definitions(_aws_s3_checksums PRIVATE "-DDEBUG_BUILD") endif() -set_target_properties(aws_s3_checksums PROPERTIES LINKER_LANGUAGE C) -set_property(TARGET aws_s3_checksums PROPERTY C_STANDARD 99) +set_target_properties(_aws_s3_checksums PROPERTIES LINKER_LANGUAGE C) +set_property(TARGET _aws_s3_checksums PROPERTY C_STANDARD 99) -add_library(aws_s3 ${S3_UNIFIED_SRC}) +add_library(_aws_s3 ${S3_UNIFIED_SRC}) -target_compile_definitions(aws_s3 PUBLIC "AWS_SDK_VERSION_MAJOR=1") -target_compile_definitions(aws_s3 PUBLIC "AWS_SDK_VERSION_MINOR=7") -target_compile_definitions(aws_s3 PUBLIC "AWS_SDK_VERSION_PATCH=231") -target_include_directories(aws_s3 SYSTEM BEFORE PUBLIC ${S3_INCLUDES}) +target_compile_definitions(_aws_s3 PUBLIC "AWS_SDK_VERSION_MAJOR=1") +target_compile_definitions(_aws_s3 PUBLIC "AWS_SDK_VERSION_MINOR=7") +target_compile_definitions(_aws_s3 PUBLIC "AWS_SDK_VERSION_PATCH=231") +target_include_directories(_aws_s3 SYSTEM BEFORE PUBLIC ${S3_INCLUDES}) if (TARGET OpenSSL::SSL) - target_compile_definitions(aws_s3 PUBLIC -DENABLE_OPENSSL_ENCRYPTION) - target_link_libraries(aws_s3 PRIVATE OpenSSL::Crypto OpenSSL::SSL) + target_compile_definitions(_aws_s3 PUBLIC -DENABLE_OPENSSL_ENCRYPTION) + target_link_libraries(_aws_s3 PRIVATE OpenSSL::Crypto OpenSSL::SSL) endif() -target_link_libraries(aws_s3 PRIVATE aws_s3_checksums) +target_link_libraries(_aws_s3 PRIVATE _aws_s3_checksums) # The library is large - avoid bloat. -target_compile_options (aws_s3 PRIVATE -g0) -target_compile_options (aws_s3_checksums PRIVATE -g0) +target_compile_options (_aws_s3 PRIVATE -g0) +target_compile_options (_aws_s3_checksums PRIVATE -g0) -add_library(ch_contrib::aws_s3 ALIAS aws_s3) +add_library(ch_contrib::aws_s3 ALIAS _aws_s3) diff --git a/contrib/base64-cmake/CMakeLists.txt b/contrib/base64-cmake/CMakeLists.txt index 2443c899869..69040a9bedc 100644 --- a/contrib/base64-cmake/CMakeLists.txt +++ b/contrib/base64-cmake/CMakeLists.txt @@ -11,37 +11,37 @@ endif() SET(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/base64") -add_library(base64_scalar OBJECT "${LIBRARY_DIR}/turbob64c.c" "${LIBRARY_DIR}/turbob64d.c") -add_library(base64_ssse3 OBJECT "${LIBRARY_DIR}/turbob64sse.c") # This file also contains code for ARM NEON +add_library(_base64_scalar OBJECT "${LIBRARY_DIR}/turbob64c.c" "${LIBRARY_DIR}/turbob64d.c") +add_library(_base64_ssse3 OBJECT "${LIBRARY_DIR}/turbob64sse.c") # This file also contains code for ARM NEON if (ARCH_AMD64) - add_library(base64_avx OBJECT "${LIBRARY_DIR}/turbob64sse.c") # This is not a mistake. One file is compiled twice. - add_library(base64_avx2 OBJECT "${LIBRARY_DIR}/turbob64avx2.c") + add_library(_base64_avx OBJECT "${LIBRARY_DIR}/turbob64sse.c") # This is not a mistake. One file is compiled twice. 
+ add_library(_base64_avx2 OBJECT "${LIBRARY_DIR}/turbob64avx2.c") endif () -target_compile_options(base64_scalar PRIVATE -falign-loops) +target_compile_options(_base64_scalar PRIVATE -falign-loops) if (ARCH_AMD64) - target_compile_options(base64_ssse3 PRIVATE -mno-avx -mno-avx2 -mssse3 -falign-loops) - target_compile_options(base64_avx PRIVATE -falign-loops -mavx) - target_compile_options(base64_avx2 PRIVATE -falign-loops -mavx2) + target_compile_options(_base64_ssse3 PRIVATE -mno-avx -mno-avx2 -mssse3 -falign-loops) + target_compile_options(_base64_avx PRIVATE -falign-loops -mavx) + target_compile_options(_base64_avx2 PRIVATE -falign-loops -mavx2) else () - target_compile_options(base64_ssse3 PRIVATE -falign-loops) + target_compile_options(_base64_ssse3 PRIVATE -falign-loops) endif () if (ARCH_AMD64) - add_library(base64 - $ - $ - $ - $) + add_library(_base64 + $ + $ + $ + $) else () - add_library(base64 - $ - $) + add_library(_base64 + $ + $) endif () -target_include_directories(base64 SYSTEM PUBLIC ${LIBRARY_DIR}) +target_include_directories(_base64 SYSTEM PUBLIC ${LIBRARY_DIR}) if (XCODE OR XCODE_VERSION) # https://gitlab.kitware.com/cmake/cmake/issues/17457 @@ -50,7 +50,7 @@ if (XCODE OR XCODE_VERSION) if (NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/dummy.c") file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/dummy.c" "") endif () - target_sources(base64 PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/dummy.c") + target_sources(_base64 PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/dummy.c") endif () -add_library(ch_contrib::base64 ALIAS base64) +add_library(ch_contrib::base64 ALIAS _base64) diff --git a/contrib/boringssl-cmake/CMakeLists.txt b/contrib/boringssl-cmake/CMakeLists.txt index 3ce78493576..dd3332d70be 100644 --- a/contrib/boringssl-cmake/CMakeLists.txt +++ b/contrib/boringssl-cmake/CMakeLists.txt @@ -368,7 +368,7 @@ elseif(WIN32) endif() add_library( - crypto + _crypto ${CRYPTO_ARCH_SOURCES} err_data.c @@ -605,7 +605,7 @@ add_library( ) add_library( - ssl + _ssl "${BORINGSSL_SOURCE_DIR}/ssl/bio_ssl.cc" "${BORINGSSL_SOURCE_DIR}/ssl/d1_both.cc" @@ -672,21 +672,22 @@ add_executable( "${BORINGSSL_SOURCE_DIR}/tool/transport_common.cc" ) -target_link_libraries(ssl crypto) -target_link_libraries(bssl ssl) +target_link_libraries(_ssl _crypto) +target_link_libraries(bssl _ssl) if(NOT WIN32 AND NOT ANDROID) - target_link_libraries(crypto pthread) + target_link_libraries(_crypto pthread) endif() +# NOTE: that ClickHouse does not support WIN32 anyway. 
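Editor's note: the base64 hunk builds the scalar and SIMD variants as OBJECT libraries with different compiler flags and then combines their object files into one library. The generator expressions did not survive this rendering (the bare `$` placeholders above); the sketch below shows the intended `$<TARGET_OBJECTS:...>` shape for the non-AMD64 branch, reconstructed from the surrounding hunk rather than copied verbatim.

```cmake
set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/base64")

add_library(_base64_scalar OBJECT "${LIBRARY_DIR}/turbob64c.c" "${LIBRARY_DIR}/turbob64d.c")
add_library(_base64_ssse3  OBJECT "${LIBRARY_DIR}/turbob64sse.c")   # also holds the ARM NEON code

target_compile_options(_base64_scalar PRIVATE -falign-loops)
target_compile_options(_base64_ssse3  PRIVATE -falign-loops)

# Collect the compiled objects of both variants into a single ordinary library.
add_library(_base64
    $<TARGET_OBJECTS:_base64_scalar>
    $<TARGET_OBJECTS:_base64_ssse3>)

target_include_directories(_base64 SYSTEM PUBLIC ${LIBRARY_DIR})
add_library(ch_contrib::base64 ALIAS _base64)
```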
if(WIN32) target_link_libraries(bssl ws2_32) endif() -target_include_directories(crypto SYSTEM PUBLIC "${BORINGSSL_SOURCE_DIR}/include") -target_include_directories(ssl SYSTEM PUBLIC "${BORINGSSL_SOURCE_DIR}/include") +target_include_directories(_crypto SYSTEM PUBLIC "${BORINGSSL_SOURCE_DIR}/include") +target_include_directories(_ssl SYSTEM PUBLIC "${BORINGSSL_SOURCE_DIR}/include") -target_compile_options(crypto PRIVATE -Wno-gnu-anonymous-struct) +target_compile_options(_crypto PRIVATE -Wno-gnu-anonymous-struct) -add_library(OpenSSL::Crypto ALIAS crypto) -add_library(OpenSSL::SSL ALIAS ssl) +add_library(OpenSSL::Crypto ALIAS _crypto) +add_library(OpenSSL::SSL ALIAS _ssl) diff --git a/contrib/capnproto-cmake/CMakeLists.txt b/contrib/capnproto-cmake/CMakeLists.txt index 7d34a4002af..297b847cd58 100644 --- a/contrib/capnproto-cmake/CMakeLists.txt +++ b/contrib/capnproto-cmake/CMakeLists.txt @@ -36,8 +36,8 @@ set (KJ_SRCS "${CAPNPROTO_SOURCE_DIR}/kj/parse/char.c++" ) -add_library(kj ${KJ_SRCS}) -target_include_directories(kj SYSTEM PUBLIC ${CAPNPROTO_SOURCE_DIR}) +add_library(_kj ${KJ_SRCS}) +target_include_directories(_kj SYSTEM PUBLIC ${CAPNPROTO_SOURCE_DIR}) set (CAPNP_SRCS "${CAPNPROTO_SOURCE_DIR}/capnp/c++.capnp.c++" @@ -58,11 +58,11 @@ set (CAPNP_SRCS "${CAPNPROTO_SOURCE_DIR}/capnp/stringify.c++" ) -add_library(capnp ${CAPNP_SRCS}) -set_target_properties(capnp +add_library(_capnp ${CAPNP_SRCS}) +set_target_properties(_capnp PROPERTIES LINKER_LANGUAGE CXX ) -target_link_libraries(capnp PUBLIC kj) +target_link_libraries(_capnp PUBLIC _kj) set (CAPNPC_SRCS "${CAPNPROTO_SOURCE_DIR}/capnp/compiler/type-id.c++" @@ -78,8 +78,8 @@ set (CAPNPC_SRCS "${CAPNPROTO_SOURCE_DIR}/capnp/serialize-text.c++" ) -add_library(capnpc ${CAPNPC_SRCS}) -target_link_libraries(capnpc PUBLIC capnp) +add_library(_capnpc ${CAPNPC_SRCS}) +target_link_libraries(_capnpc PUBLIC _capnp) # The library has substandard code if (COMPILER_GCC) @@ -89,8 +89,8 @@ elseif (COMPILER_CLANG) set (CAPNP_PRIVATE_CXX_FLAGS -fno-char8_t) endif () -target_compile_options(kj PRIVATE ${SUPPRESS_WARNINGS} ${CAPNP_PRIVATE_CXX_FLAGS}) -target_compile_options(capnp PRIVATE ${SUPPRESS_WARNINGS} ${CAPNP_PRIVATE_CXX_FLAGS}) -target_compile_options(capnpc PRIVATE ${SUPPRESS_WARNINGS} ${CAPNP_PRIVATE_CXX_FLAGS}) +target_compile_options(_kj PRIVATE ${SUPPRESS_WARNINGS} ${CAPNP_PRIVATE_CXX_FLAGS}) +target_compile_options(_capnp PRIVATE ${SUPPRESS_WARNINGS} ${CAPNP_PRIVATE_CXX_FLAGS}) +target_compile_options(_capnpc PRIVATE ${SUPPRESS_WARNINGS} ${CAPNP_PRIVATE_CXX_FLAGS}) -add_library(ch_contrib::capnp ALIAS capnpc) +add_library(ch_contrib::capnp ALIAS _capnpc) diff --git a/contrib/cassandra-cmake/CMakeLists.txt b/contrib/cassandra-cmake/CMakeLists.txt index cc4f12f2e83..416dca6f2bc 100644 --- a/contrib/cassandra-cmake/CMakeLists.txt +++ b/contrib/cassandra-cmake/CMakeLists.txt @@ -53,16 +53,16 @@ endif() list(APPEND SOURCES ${CASS_SRC_DIR}/atomic/atomic_std.hpp) -add_library(curl_hostcheck OBJECT ${CASS_SRC_DIR}/third_party/curl/hostcheck.cpp) -add_library(hdr_histogram OBJECT ${CASS_SRC_DIR}/third_party/hdr_histogram/hdr_histogram.cpp) -add_library(http-parser OBJECT ${CASS_SRC_DIR}/third_party/http-parser/http_parser.c) -add_library(minizip OBJECT +add_library(_curl_hostcheck OBJECT ${CASS_SRC_DIR}/third_party/curl/hostcheck.cpp) +add_library(_hdr_histogram OBJECT ${CASS_SRC_DIR}/third_party/hdr_histogram/hdr_histogram.cpp) +add_library(_http-parser OBJECT ${CASS_SRC_DIR}/third_party/http-parser/http_parser.c) +add_library(_minizip OBJECT 
${CASS_SRC_DIR}/third_party/minizip/ioapi.c ${CASS_SRC_DIR}/third_party/minizip/zip.c ${CASS_SRC_DIR}/third_party/minizip/unzip.c) -target_link_libraries(minizip ch_contrib::zlib) -target_compile_definitions(minizip PRIVATE "-Dz_crc_t=unsigned long") +target_link_libraries(_minizip ch_contrib::zlib) +target_compile_definitions(_minizip PRIVATE "-Dz_crc_t=unsigned long") list(APPEND INCLUDE_DIRS ${CASS_SRC_DIR}/third_party/curl @@ -121,10 +121,10 @@ configure_file( add_library(_cassandra ${SOURCES} - $ - $ - $ - $) + $ + $ + $ + $) target_link_libraries(_cassandra ch_contrib::zlib) target_include_directories(_cassandra PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${INCLUDE_DIRS}) diff --git a/contrib/cctz-cmake/CMakeLists.txt b/contrib/cctz-cmake/CMakeLists.txt index d252ddc864f..f1ef9b53f7d 100644 --- a/contrib/cctz-cmake/CMakeLists.txt +++ b/contrib/cctz-cmake/CMakeLists.txt @@ -14,12 +14,12 @@ set (SRCS "${LIBRARY_DIR}/src/zone_info_source.cc" ) -add_library (cctz ${SRCS}) -target_include_directories (cctz PUBLIC "${LIBRARY_DIR}/include") +add_library (_cctz ${SRCS}) +target_include_directories (_cctz PUBLIC "${LIBRARY_DIR}/include") if (OS_FREEBSD) # yes, need linux, because bsd check inside linux in time_zone_libc.cc:24 - target_compile_definitions (cctz PRIVATE __USE_BSD linux _XOPEN_SOURCE=600) + target_compile_definitions (_cctz PRIVATE __USE_BSD linux _XOPEN_SOURCE=600) endif () # Related to time_zones table: @@ -57,7 +57,7 @@ clickhouse_embed_binaries( RESOURCE_DIR "${TZDIR}" RESOURCES ${TIMEZONE_RESOURCE_FILES} ) -add_dependencies(cctz tzdata) -target_link_libraries(cctz INTERFACE "-Wl,${WHOLE_ARCHIVE} $ -Wl,${NO_WHOLE_ARCHIVE}") +add_dependencies(_cctz tzdata) +target_link_libraries(_cctz INTERFACE "-Wl,${WHOLE_ARCHIVE} $ -Wl,${NO_WHOLE_ARCHIVE}") -add_library(ch_contrib::cctz ALIAS cctz) +add_library(ch_contrib::cctz ALIAS _cctz) diff --git a/contrib/cld2 b/contrib/cld2 new file mode 160000 index 00000000000..bc6d493a2f6 --- /dev/null +++ b/contrib/cld2 @@ -0,0 +1 @@ +Subproject commit bc6d493a2f64ed1fc1c4c4b4294a542a04e04217 diff --git a/contrib/cld2-cmake/CMakeLists.txt b/contrib/cld2-cmake/CMakeLists.txt new file mode 100644 index 00000000000..8600856ea36 --- /dev/null +++ b/contrib/cld2-cmake/CMakeLists.txt @@ -0,0 +1,33 @@ +set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/cld2") + +set (SRCS + "${LIBRARY_DIR}/internal/cldutil.cc" + "${LIBRARY_DIR}/internal/compact_lang_det.cc" + "${LIBRARY_DIR}/internal/cldutil_shared.cc" + "${LIBRARY_DIR}/internal/compact_lang_det_hint_code.cc" + "${LIBRARY_DIR}/internal/compact_lang_det_impl.cc" + "${LIBRARY_DIR}/internal/debug.cc" + "${LIBRARY_DIR}/internal/fixunicodevalue.cc" + "${LIBRARY_DIR}/internal/generated_entities.cc" + "${LIBRARY_DIR}/internal/generated_language.cc" + "${LIBRARY_DIR}/internal/generated_ulscript.cc" + "${LIBRARY_DIR}/internal/getonescriptspan.cc" + "${LIBRARY_DIR}/internal/lang_script.cc" + "${LIBRARY_DIR}/internal/offsetmap.cc" + "${LIBRARY_DIR}/internal/scoreonescriptspan.cc" + "${LIBRARY_DIR}/internal/tote.cc" + "${LIBRARY_DIR}/internal/utf8statetable.cc" + "${LIBRARY_DIR}/internal/cld_generated_cjk_uni_prop_80.cc" + "${LIBRARY_DIR}/internal/cld2_generated_cjk_compatible.cc" + "${LIBRARY_DIR}/internal/cld_generated_cjk_delta_bi_4.cc" + "${LIBRARY_DIR}/internal/generated_distinct_bi_0.cc" + "${LIBRARY_DIR}/internal/cld2_generated_quadchrome_2.cc" + "${LIBRARY_DIR}/internal/cld2_generated_deltaoctachrome.cc" + "${LIBRARY_DIR}/internal/cld2_generated_distinctoctachrome.cc" + 
"${LIBRARY_DIR}/internal/cld_generated_score_quad_octa_2.cc" +) +add_library(_cld2 ${SRCS}) +set_property(TARGET _cld2 PROPERTY POSITION_INDEPENDENT_CODE ON) +target_compile_options (_cld2 PRIVATE -Wno-reserved-id-macro -Wno-c++11-narrowing) +target_include_directories(_cld2 SYSTEM BEFORE PUBLIC "${LIBRARY_DIR}/public") +add_library(ch_contrib::cld2 ALIAS _cld2) diff --git a/contrib/consistent-hashing/CMakeLists.txt b/contrib/consistent-hashing/CMakeLists.txt index 7543022df46..5d979824434 100644 --- a/contrib/consistent-hashing/CMakeLists.txt +++ b/contrib/consistent-hashing/CMakeLists.txt @@ -1,2 +1,3 @@ -add_library(consistent-hashing consistent_hashing.cpp popcount.cpp) -target_include_directories(consistent-hashing SYSTEM PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +add_library(_consistent_hashing consistent_hashing.cpp popcount.cpp) +target_include_directories(_consistent_hashing SYSTEM PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +add_library(ch_contrib::consistent_hashing ALIAS _consistent_hashing) diff --git a/contrib/croaring-cmake/CMakeLists.txt b/contrib/croaring-cmake/CMakeLists.txt index 3d327d068c1..0bb7d0bd221 100644 --- a/contrib/croaring-cmake/CMakeLists.txt +++ b/contrib/croaring-cmake/CMakeLists.txt @@ -19,15 +19,15 @@ set(SRCS "${LIBRARY_DIR}/src/roaring_priority_queue.c" "${LIBRARY_DIR}/src/roaring_array.c") -add_library(roaring ${SRCS}) +add_library(_roaring ${SRCS}) -target_include_directories(roaring PRIVATE "${LIBRARY_DIR}/include/roaring") -target_include_directories(roaring SYSTEM BEFORE PUBLIC "${LIBRARY_DIR}/include") -target_include_directories(roaring SYSTEM BEFORE PUBLIC "${LIBRARY_DIR}/cpp") +target_include_directories(_roaring PRIVATE "${LIBRARY_DIR}/include/roaring") +target_include_directories(_roaring SYSTEM BEFORE PUBLIC "${LIBRARY_DIR}/include") +target_include_directories(_roaring SYSTEM BEFORE PUBLIC "${LIBRARY_DIR}/cpp") # We redirect malloc/free family of functions to different functions that will track memory in ClickHouse. # Also note that we exploit implicit function declarations. -target_compile_definitions(roaring PRIVATE +target_compile_definitions(_roaring PRIVATE -Dmalloc=clickhouse_malloc -Dcalloc=clickhouse_calloc -Drealloc=clickhouse_realloc @@ -35,4 +35,6 @@ target_compile_definitions(roaring PRIVATE -Dfree=clickhouse_free -Dposix_memalign=clickhouse_posix_memalign) -target_link_libraries(roaring PUBLIC clickhouse_common_io) +target_link_libraries(_roaring PUBLIC clickhouse_common_io) + +add_library(ch_contrib::roaring ALIAS _roaring) diff --git a/contrib/curl-cmake/CMakeLists.txt b/contrib/curl-cmake/CMakeLists.txt index 0fe5282b98e..589f40384e3 100644 --- a/contrib/curl-cmake/CMakeLists.txt +++ b/contrib/curl-cmake/CMakeLists.txt @@ -147,35 +147,24 @@ set (SRCS "${LIBRARY_DIR}/lib/vssh/libssh.c" ) -add_library (curl ${SRCS}) +add_library (_curl ${SRCS}) -target_compile_definitions (curl PRIVATE +target_compile_definitions (_curl PRIVATE HAVE_CONFIG_H BUILDING_LIBCURL CURL_HIDDEN_SYMBOLS libcurl_EXPORTS OS="${CMAKE_SYSTEM_NAME}" ) -target_include_directories (curl SYSTEM PUBLIC +target_include_directories (_curl SYSTEM PUBLIC "${LIBRARY_DIR}/include" "${LIBRARY_DIR}/lib" . # curl_config.h ) -target_link_libraries (curl PRIVATE ssl) +target_link_libraries (_curl PRIVATE OpenSSL::SSL) # The library is large - avoid bloat (XXX: is it?) 
-target_compile_options (curl PRIVATE -g0) +target_compile_options (_curl PRIVATE -g0) -# find_package(CURL) compatibility for the following packages that uses -# find_package(CURL)/include(FindCURL): -# - sentry-native -set (CURL_FOUND ON CACHE BOOL "") -set (CURL_ROOT_DIR ${LIBRARY_DIR} CACHE PATH "") -set (CURL_INCLUDE_DIR "${LIBRARY_DIR}/include" CACHE PATH "") -set (CURL_INCLUDE_DIRS "${LIBRARY_DIR}/include" CACHE PATH "") -set (CURL_LIBRARY curl CACHE STRING "") -set (CURL_LIBRARIES ${CURL_LIBRARY} CACHE STRING "") -set (CURL_VERSION_STRING 7.67.0 CACHE STRING "") -# add_library (CURL::libcurl ALIAS ${CURL_LIBRARY}) -add_library (ch_contrib::curl ALIAS ${CURL_LIBRARY}) +add_library (ch_contrib::curl ALIAS _curl) diff --git a/contrib/cyrus-sasl-cmake/CMakeLists.txt b/contrib/cyrus-sasl-cmake/CMakeLists.txt index 377af599d43..41deaae19a7 100644 --- a/contrib/cyrus-sasl-cmake/CMakeLists.txt +++ b/contrib/cyrus-sasl-cmake/CMakeLists.txt @@ -12,9 +12,9 @@ endif() set(CYRUS_SASL_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/cyrus-sasl") -add_library(sasl2) +add_library(_sasl2) -target_sources(sasl2 PRIVATE +target_sources(_sasl2 PRIVATE "${CYRUS_SASL_SOURCE_DIR}/plugins/gssapi.c" # "${CYRUS_SASL_SOURCE_DIR}/plugins/gssapiv2_init.c" "${CYRUS_SASL_SOURCE_DIR}/common/plugin_common.c" @@ -32,11 +32,11 @@ target_sources(sasl2 PRIVATE "${CYRUS_SASL_SOURCE_DIR}/lib/checkpw.c" ) -target_include_directories(sasl2 PUBLIC +target_include_directories(_sasl2 PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ) -target_include_directories(sasl2 PRIVATE +target_include_directories(_sasl2 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} # for config.h "${CYRUS_SASL_SOURCE_DIR}/plugins" ${CYRUS_SASL_SOURCE_DIR} @@ -50,7 +50,7 @@ target_include_directories(sasl2 PRIVATE "${CYRUS_SASL_SOURCE_DIR}/tests" ) -target_compile_definitions(sasl2 PUBLIC +target_compile_definitions(_sasl2 PUBLIC HAVE_CONFIG_H # PLUGINDIR="/usr/local/lib/sasl2" PLUGINDIR="" @@ -76,6 +76,6 @@ file(COPY DESTINATION ${CMAKE_CURRENT_BINARY_DIR} ) -target_link_libraries(sasl2 PUBLIC ch_contrib::krb5) +target_link_libraries(_sasl2 PUBLIC ch_contrib::krb5) -add_library(ch_contrib::sasl2 ALIAS sasl2) +add_library(ch_contrib::sasl2 ALIAS _sasl2) diff --git a/contrib/dragonbox-cmake/CMakeLists.txt b/contrib/dragonbox-cmake/CMakeLists.txt index 604394c6dce..6644ac3c313 100644 --- a/contrib/dragonbox-cmake/CMakeLists.txt +++ b/contrib/dragonbox-cmake/CMakeLists.txt @@ -1,5 +1,5 @@ set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/dragonbox") -add_library(dragonbox_to_chars "${LIBRARY_DIR}/source/dragonbox_to_chars.cpp") - -target_include_directories(dragonbox_to_chars SYSTEM BEFORE PUBLIC "${LIBRARY_DIR}/include/") +add_library(_dragonbox_to_chars "${LIBRARY_DIR}/source/dragonbox_to_chars.cpp") +target_include_directories(_dragonbox_to_chars SYSTEM BEFORE PUBLIC "${LIBRARY_DIR}/include/") +add_library(ch_contrib::dragonbox_to_chars ALIAS _dragonbox_to_chars) diff --git a/contrib/fast_float-cmake/CMakeLists.txt b/contrib/fast_float-cmake/CMakeLists.txt index 77fad935c9e..4ddd11c6d37 100644 --- a/contrib/fast_float-cmake/CMakeLists.txt +++ b/contrib/fast_float-cmake/CMakeLists.txt @@ -1,3 +1,3 @@ -add_library(fast_float INTERFACE) -target_include_directories(fast_float SYSTEM BEFORE INTERFACE "${ClickHouse_SOURCE_DIR}/contrib/fast_float/include/") -add_library(ch_contrib::fast_float ALIAS fast_float) +add_library(_fast_float INTERFACE) +target_include_directories(_fast_float SYSTEM BEFORE INTERFACE "${ClickHouse_SOURCE_DIR}/contrib/fast_float/include/") 
+add_library(ch_contrib::fast_float ALIAS _fast_float) diff --git a/contrib/fastops-cmake/CMakeLists.txt b/contrib/fastops-cmake/CMakeLists.txt index f0e5030a3aa..17d6a7f5fcb 100644 --- a/contrib/fastops-cmake/CMakeLists.txt +++ b/contrib/fastops-cmake/CMakeLists.txt @@ -25,8 +25,8 @@ endif() set (SRCS ${SRCS} "${LIBRARY_DIR}/fastops/plain/ops_plain.cpp" "${LIBRARY_DIR}/fastops/core/avx_id.cpp" "${LIBRARY_DIR}/fastops/fastops.cpp") -add_library(fastops ${SRCS}) +add_library(_fastops ${SRCS}) -target_include_directories(fastops SYSTEM PUBLIC "${LIBRARY_DIR}") +target_include_directories(_fastops SYSTEM PUBLIC "${LIBRARY_DIR}") -add_library(ch_contrib::fastops ALIAS fastops) +add_library(ch_contrib::fastops ALIAS _fastops) diff --git a/contrib/fmtlib-cmake/CMakeLists.txt b/contrib/fmtlib-cmake/CMakeLists.txt index d279360692d..d8cb721b9ba 100644 --- a/contrib/fmtlib-cmake/CMakeLists.txt +++ b/contrib/fmtlib-cmake/CMakeLists.txt @@ -16,6 +16,6 @@ set (SRCS ../fmtlib/include/fmt/ranges.h ) -add_library(fmt ${SRCS}) -target_include_directories(fmt SYSTEM PUBLIC ../fmtlib/include) -add_library(ch_contrib::fmt ALIAS fmt) +add_library(_fmt ${SRCS}) +target_include_directories(_fmt SYSTEM PUBLIC ../fmtlib/include) +add_library(ch_contrib::fmt ALIAS _fmt) diff --git a/contrib/grpc-cmake/CMakeLists.txt b/contrib/grpc-cmake/CMakeLists.txt index 15b7550e810..520e04d198e 100644 --- a/contrib/grpc-cmake/CMakeLists.txt +++ b/contrib/grpc-cmake/CMakeLists.txt @@ -46,7 +46,7 @@ set(_gRPC_SSL_LIBRARIES OpenSSL::Crypto OpenSSL::SSL) set(gRPC_ABSL_PROVIDER "clickhouse" CACHE STRING "" FORCE) # Choose to build static or shared library for c-ares. -if (MAKE_STATIC_LIBRARIES) +if (USE_STATIC_LIBRARIES) set(CARES_STATIC ON CACHE BOOL "" FORCE) set(CARES_SHARED OFF CACHE BOOL "" FORCE) else () diff --git a/contrib/h3-cmake/CMakeLists.txt b/contrib/h3-cmake/CMakeLists.txt index ea9432bed8f..984d1b1ae7c 100644 --- a/contrib/h3-cmake/CMakeLists.txt +++ b/contrib/h3-cmake/CMakeLists.txt @@ -30,12 +30,12 @@ set(SRCS configure_file("${H3_SOURCE_DIR}/include/h3api.h.in" "${H3_BINARY_DIR}/include/h3api.h") -add_library(h3 ${SRCS}) -target_include_directories(h3 SYSTEM PUBLIC "${H3_SOURCE_DIR}/include") -target_include_directories(h3 SYSTEM PUBLIC "${H3_BINARY_DIR}/include") -target_compile_definitions(h3 PRIVATE H3_HAVE_VLA) +add_library(_h3 ${SRCS}) +target_include_directories(_h3 SYSTEM PUBLIC "${H3_SOURCE_DIR}/include") +target_include_directories(_h3 SYSTEM PUBLIC "${H3_BINARY_DIR}/include") +target_compile_definitions(_h3 PRIVATE H3_HAVE_VLA) if(M_LIBRARY) - target_link_libraries(h3 PRIVATE ${M_LIBRARY}) + target_link_libraries(_h3 PRIVATE ${M_LIBRARY}) endif() -add_library(ch_contrib::h3 ALIAS h3) +add_library(ch_contrib::h3 ALIAS _h3) diff --git a/contrib/hyperscan-cmake/CMakeLists.txt b/contrib/hyperscan-cmake/CMakeLists.txt index 19978687750..02c823a3a42 100644 --- a/contrib/hyperscan-cmake/CMakeLists.txt +++ b/contrib/hyperscan-cmake/CMakeLists.txt @@ -217,23 +217,23 @@ set (SRCS "${LIBRARY_DIR}/src/util/ue2string.cpp" ) -add_library (hyperscan ${SRCS}) +add_library (_hyperscan ${SRCS}) -target_compile_options (hyperscan +target_compile_options (_hyperscan PRIVATE -g0 # Library has too much debug information -mno-avx -mno-avx2 # The library is using dynamic dispatch and is confused if AVX is enabled globally -march=corei7 -O2 -fno-strict-aliasing -fno-omit-frame-pointer -fvisibility=hidden # The options from original build system -fno-sanitize=undefined # Assume the library takes care of itself ) 
-target_include_directories (hyperscan +target_include_directories (_hyperscan PRIVATE common "${LIBRARY_DIR}/include" ) -target_include_directories (hyperscan SYSTEM PUBLIC "${LIBRARY_DIR}/src") +target_include_directories (_hyperscan SYSTEM PUBLIC "${LIBRARY_DIR}/src") if (ARCH_AMD64) - target_include_directories (hyperscan PRIVATE x86_64) + target_include_directories (_hyperscan PRIVATE x86_64) endif () -target_link_libraries (hyperscan PRIVATE boost::headers_only) +target_link_libraries (_hyperscan PRIVATE boost::headers_only) -add_library (ch_contrib::hyperscan ALIAS hyperscan) +add_library (ch_contrib::hyperscan ALIAS _hyperscan) diff --git a/contrib/jemalloc-cmake/CMakeLists.txt b/contrib/jemalloc-cmake/CMakeLists.txt index afb6ec06b55..b3845c7d56b 100644 --- a/contrib/jemalloc-cmake/CMakeLists.txt +++ b/contrib/jemalloc-cmake/CMakeLists.txt @@ -87,9 +87,9 @@ if (OS_DARWIN) list(APPEND SRCS "${LIBRARY_DIR}/src/zone.c") endif () -add_library(jemalloc ${SRCS}) -target_include_directories(jemalloc PRIVATE "${LIBRARY_DIR}/include") -target_include_directories(jemalloc SYSTEM PUBLIC include) +add_library(_jemalloc ${SRCS}) +target_include_directories(_jemalloc PRIVATE "${LIBRARY_DIR}/include") +target_include_directories(_jemalloc SYSTEM PUBLIC include) set (JEMALLOC_INCLUDE_PREFIX) # OS_ @@ -117,24 +117,24 @@ endif () configure_file(${JEMALLOC_INCLUDE_PREFIX}/jemalloc/internal/jemalloc_internal_defs.h.in ${JEMALLOC_INCLUDE_PREFIX}/jemalloc/internal/jemalloc_internal_defs.h) -target_include_directories(jemalloc SYSTEM PRIVATE +target_include_directories(_jemalloc SYSTEM PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/${JEMALLOC_INCLUDE_PREFIX}/jemalloc/internal") -target_compile_definitions(jemalloc PRIVATE -DJEMALLOC_NO_PRIVATE_NAMESPACE) +target_compile_definitions(_jemalloc PRIVATE -DJEMALLOC_NO_PRIVATE_NAMESPACE) if (CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG") - target_compile_definitions(jemalloc PRIVATE -DJEMALLOC_DEBUG=1) + target_compile_definitions(_jemalloc PRIVATE -DJEMALLOC_DEBUG=1) endif () -target_compile_definitions(jemalloc PRIVATE -DJEMALLOC_PROF=1) +target_compile_definitions(_jemalloc PRIVATE -DJEMALLOC_PROF=1) if (USE_UNWIND) - target_compile_definitions (jemalloc PRIVATE -DJEMALLOC_PROF_LIBUNWIND=1) - target_link_libraries (jemalloc PRIVATE unwind) + target_compile_definitions (_jemalloc PRIVATE -DJEMALLOC_PROF_LIBUNWIND=1) + target_link_libraries (_jemalloc PRIVATE unwind) endif () -target_compile_options(jemalloc PRIVATE -Wno-redundant-decls) +target_compile_options(_jemalloc PRIVATE -Wno-redundant-decls) # for RTLD_NEXT -target_compile_options(jemalloc PRIVATE -D_GNU_SOURCE) +target_compile_options(_jemalloc PRIVATE -D_GNU_SOURCE) -add_library(ch_contrib::jemalloc ALIAS jemalloc) +add_library(ch_contrib::jemalloc ALIAS _jemalloc) diff --git a/contrib/krb5-cmake/CMakeLists.txt b/contrib/krb5-cmake/CMakeLists.txt index 32ceecb889c..685e8737ef0 100644 --- a/contrib/krb5-cmake/CMakeLists.txt +++ b/contrib/krb5-cmake/CMakeLists.txt @@ -558,10 +558,10 @@ add_custom_target( VERBATIM ) -add_library(krb5) +add_library(_krb5) add_dependencies( - krb5 + _krb5 ERRMAP_H ERROR_MAP_H KRB_5_H @@ -579,7 +579,7 @@ if(CMAKE_SYSTEM_NAME MATCHES "Darwin") list(APPEND ALL_SRCS "${CMAKE_CURRENT_BINARY_DIR}/include_private/kcmrpc.c") endif() -target_sources(krb5 PRIVATE +target_sources(_krb5 PRIVATE ${ALL_SRCS} ) @@ -651,12 +651,12 @@ add_custom_command( -target_include_directories(krb5 SYSTEM BEFORE PUBLIC +target_include_directories(_krb5 SYSTEM BEFORE PUBLIC "${KRB5_SOURCE_DIR}/include" 
"${CMAKE_CURRENT_BINARY_DIR}/include" ) -target_include_directories(krb5 PRIVATE +target_include_directories(_krb5 PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/include_private" # For autoconf.h and other generated headers. ${KRB5_SOURCE_DIR} "${KRB5_SOURCE_DIR}/include" @@ -678,7 +678,7 @@ target_include_directories(krb5 PRIVATE "${KRB5_SOURCE_DIR}/lib/krb5/os" ) -target_compile_definitions(krb5 PRIVATE +target_compile_definitions(_krb5 PRIVATE KRB5_PRIVATE _GSS_STATIC_LINK=1 KRB5_DEPRECATED=1 @@ -688,6 +688,6 @@ target_compile_definitions(krb5 PRIVATE LIBDIR="/usr/local/lib" ) -target_link_libraries(krb5 PRIVATE OpenSSL::Crypto OpenSSL::SSL) +target_link_libraries(_krb5 PRIVATE OpenSSL::Crypto OpenSSL::SSL) -add_library(ch_contrib::krb5 ALIAS krb5) +add_library(ch_contrib::krb5 ALIAS _krb5) diff --git a/contrib/lemmagen-c-cmake/CMakeLists.txt b/contrib/lemmagen-c-cmake/CMakeLists.txt index 71abf07acc0..67e1e5791ca 100644 --- a/contrib/lemmagen-c-cmake/CMakeLists.txt +++ b/contrib/lemmagen-c-cmake/CMakeLists.txt @@ -5,6 +5,6 @@ set(SRCS "${LIBRARY_DIR}/src/RdrLemmatizer.cpp" ) -add_library(lemmagen STATIC ${SRCS}) -target_include_directories(lemmagen SYSTEM PUBLIC "${LEMMAGEN_INCLUDE_DIR}") -add_library(ch_contrib::lemmagen ALIAS lemmagen) +add_library(_lemmagen STATIC ${SRCS}) +target_include_directories(_lemmagen SYSTEM PUBLIC "${LEMMAGEN_INCLUDE_DIR}") +add_library(ch_contrib::lemmagen ALIAS _lemmagen) diff --git a/contrib/libcpuid-cmake/CMakeLists.txt b/contrib/libcpuid-cmake/CMakeLists.txt index 2b890579fbb..1940b39b6aa 100644 --- a/contrib/libcpuid-cmake/CMakeLists.txt +++ b/contrib/libcpuid-cmake/CMakeLists.txt @@ -23,12 +23,12 @@ set (SRCS "${LIBRARY_DIR}/libcpuid/recog_intel.c" ) -add_library (cpuid ${SRCS}) +add_library (_cpuid ${SRCS}) -target_include_directories (cpuid SYSTEM PUBLIC "${LIBRARY_DIR}") -target_compile_definitions (cpuid PRIVATE VERSION="v0.4.1") +target_include_directories (_cpuid SYSTEM PUBLIC "${LIBRARY_DIR}") +target_compile_definitions (_cpuid PRIVATE VERSION="v0.4.1") if (COMPILER_CLANG) - target_compile_options (cpuid PRIVATE -Wno-reserved-id-macro) + target_compile_options (_cpuid PRIVATE -Wno-reserved-id-macro) endif () -add_library(ch_contrib::cpuid ALIAS cpuid) +add_library(ch_contrib::cpuid ALIAS _cpuid) diff --git a/contrib/libdivide/CMakeLists.txt b/contrib/libdivide/CMakeLists.txt index 57e9f254db5..45cbc0a584b 100644 --- a/contrib/libdivide/CMakeLists.txt +++ b/contrib/libdivide/CMakeLists.txt @@ -1,2 +1,3 @@ -add_library (libdivide INTERFACE) -target_include_directories (libdivide SYSTEM BEFORE INTERFACE .) +add_library (_libdivide INTERFACE) +target_include_directories (_libdivide SYSTEM BEFORE INTERFACE .) 
+add_library (ch_contrib::libdivide ALIAS _libdivide) diff --git a/contrib/libgsasl-cmake/CMakeLists.txt b/contrib/libgsasl-cmake/CMakeLists.txt index 2b5baeeff0b..4bb4ca9dc33 100644 --- a/contrib/libgsasl-cmake/CMakeLists.txt +++ b/contrib/libgsasl-cmake/CMakeLists.txt @@ -98,19 +98,19 @@ if (TARGET ch_contrib::krb5) ${SRC_DIR}/gssapi/server.c) endif() -add_library(gsasl ${SRCS}) +add_library(_gsasl ${SRCS}) -target_include_directories(gsasl PUBLIC ${SRC_DIR}) -target_include_directories(gsasl PUBLIC ${SRC_DIR}/gl) -target_include_directories(gsasl PUBLIC ${SRC_DIR}/src) -target_include_directories(gsasl PUBLIC ${SRC_DIR}/digest-md5) -target_include_directories(gsasl PUBLIC "${ClickHouse_SOURCE_DIR}/contrib/libgsasl-cmake/linux_x86_64/include") +target_include_directories(_gsasl PUBLIC ${SRC_DIR}) +target_include_directories(_gsasl PUBLIC ${SRC_DIR}/gl) +target_include_directories(_gsasl PUBLIC ${SRC_DIR}/src) +target_include_directories(_gsasl PUBLIC ${SRC_DIR}/digest-md5) +target_include_directories(_gsasl PUBLIC "${ClickHouse_SOURCE_DIR}/contrib/libgsasl-cmake/linux_x86_64/include") -target_compile_definitions (gsasl PRIVATE HAVE_CONFIG_H=1) +target_compile_definitions(_gsasl PRIVATE HAVE_CONFIG_H=1) if (TARGET ch_contrib::krb5) - target_link_libraries(gsasl PUBLIC ch_contrib::krb5) - target_compile_definitions (gsasl PRIVATE HAVE_GSSAPI_H=1 USE_GSSAPI=1) + target_link_libraries(_gsasl PUBLIC ch_contrib::krb5) + target_compile_definitions(_gsasl PRIVATE HAVE_GSSAPI_H=1 USE_GSSAPI=1) endif() -add_library(ch_contrib::gsasl ALIAS gsasl) +add_library(ch_contrib::gsasl ALIAS _gsasl) diff --git a/contrib/libmetrohash/CMakeLists.txt b/contrib/libmetrohash/CMakeLists.txt index 4ec5a58717d..9f7984acf8b 100644 --- a/contrib/libmetrohash/CMakeLists.txt +++ b/contrib/libmetrohash/CMakeLists.txt @@ -2,5 +2,6 @@ set (SRCS src/metrohash64.cpp src/metrohash128.cpp ) -add_library(metrohash ${SRCS}) -target_include_directories(metrohash PUBLIC src) +add_library(_metrohash ${SRCS}) +target_include_directories(_metrohash PUBLIC src) +add_library(ch_contrib::metrohash ALIAS _metrohash) diff --git a/contrib/libpq-cmake/CMakeLists.txt b/contrib/libpq-cmake/CMakeLists.txt index a1ffa632231..280c0381393 100644 --- a/contrib/libpq-cmake/CMakeLists.txt +++ b/contrib/libpq-cmake/CMakeLists.txt @@ -57,12 +57,12 @@ set(SRCS "${LIBPQ_SOURCE_DIR}/port/explicit_bzero.c" ) -add_library(libpq ${SRCS}) +add_library(_libpq ${SRCS}) -target_include_directories (libpq SYSTEM PUBLIC ${LIBPQ_SOURCE_DIR}) -target_include_directories (libpq SYSTEM PUBLIC "${LIBPQ_SOURCE_DIR}/include") -target_include_directories (libpq SYSTEM PRIVATE "${LIBPQ_SOURCE_DIR}/configs") +target_include_directories (_libpq SYSTEM PUBLIC ${LIBPQ_SOURCE_DIR}) +target_include_directories (_libpq SYSTEM PUBLIC "${LIBPQ_SOURCE_DIR}/include") +target_include_directories (_libpq SYSTEM PRIVATE "${LIBPQ_SOURCE_DIR}/configs") -target_link_libraries (libpq PRIVATE ssl) +target_link_libraries (_libpq PRIVATE OpenSSL::SSL) -add_library(ch_contrib::libpq ALIAS libpq) +add_library(ch_contrib::libpq ALIAS _libpq) diff --git a/contrib/libpqxx-cmake/CMakeLists.txt b/contrib/libpqxx-cmake/CMakeLists.txt index 5462c3cdccf..a3317404f95 100644 --- a/contrib/libpqxx-cmake/CMakeLists.txt +++ b/contrib/libpqxx-cmake/CMakeLists.txt @@ -70,9 +70,9 @@ set (HDRS "${LIBRARY_DIR}/include/pqxx/zview.hxx" ) -add_library(libpqxx ${SRCS} ${HDRS}) +add_library(_libpqxx ${SRCS} ${HDRS}) -target_link_libraries(libpqxx PUBLIC ch_contrib::libpq) -target_include_directories (libpqxx SYSTEM 
BEFORE PUBLIC "${LIBRARY_DIR}/include") +target_link_libraries(_libpqxx PUBLIC ch_contrib::libpq) +target_include_directories (_libpqxx SYSTEM BEFORE PUBLIC "${LIBRARY_DIR}/include") -add_library(ch_contrib::libpqxx ALIAS libpqxx) +add_library(ch_contrib::libpqxx ALIAS _libpqxx) diff --git a/contrib/libstemmer-c-cmake/CMakeLists.txt b/contrib/libstemmer-c-cmake/CMakeLists.txt index f47ff91045e..6dc65c56c6c 100644 --- a/contrib/libstemmer-c-cmake/CMakeLists.txt +++ b/contrib/libstemmer-c-cmake/CMakeLists.txt @@ -27,6 +27,6 @@ FOREACH ( LINE ${_CONTENT} ) endforeach () # all the sources parsed. Now just add the lib -add_library ( stemmer STATIC ${_SOURCES} ${_HEADERS} ) -target_include_directories (stemmer SYSTEM PUBLIC "${STEMMER_INCLUDE_DIR}") -add_library(ch_contrib::stemmer ALIAS stemmer) +add_library(_stemmer STATIC ${_SOURCES} ${_HEADERS} ) +target_include_directories(_stemmer SYSTEM PUBLIC "${STEMMER_INCLUDE_DIR}") +add_library(ch_contrib::stemmer ALIAS _stemmer) diff --git a/contrib/libuv-cmake/CMakeLists.txt b/contrib/libuv-cmake/CMakeLists.txt index 3c06bdcf6d0..45f6d8e2083 100644 --- a/contrib/libuv-cmake/CMakeLists.txt +++ b/contrib/libuv-cmake/CMakeLists.txt @@ -133,7 +133,7 @@ add_library(ch_contrib::uv ALIAS _uv) target_compile_definitions(_uv PRIVATE ${uv_defines}) target_include_directories(_uv SYSTEM PUBLIC ${SOURCE_DIR}/include PRIVATE ${SOURCE_DIR}/src) target_link_libraries(_uv ${uv_libraries}) -if (NOT MAKE_STATIC_LIBRARIES) +if (NOT USE_STATIC_LIBRARIES) target_compile_definitions(_uv INTERFACE USING_UV_SHARED=1 PRIVATE BUILDING_UV_SHARED=1) diff --git a/contrib/libxml2-cmake/CMakeLists.txt b/contrib/libxml2-cmake/CMakeLists.txt index 2db16cf53e7..e9c4641c161 100644 --- a/contrib/libxml2-cmake/CMakeLists.txt +++ b/contrib/libxml2-cmake/CMakeLists.txt @@ -50,14 +50,14 @@ set(SRCS "${LIBXML2_SOURCE_DIR}/schematron.c" "${LIBXML2_SOURCE_DIR}/xzlib.c" ) -add_library(libxml2 ${SRCS}) +add_library(_libxml2 ${SRCS}) -target_link_libraries(libxml2 PRIVATE ch_contrib::zlib) +target_link_libraries(_libxml2 PRIVATE ch_contrib::zlib) if(M_LIBRARY) - target_link_libraries(libxml2 PRIVATE ${M_LIBRARY}) + target_link_libraries(_libxml2 PRIVATE ${M_LIBRARY}) endif() -target_include_directories(libxml2 BEFORE PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/linux_x86_64/include") -target_include_directories(libxml2 BEFORE PUBLIC "${LIBXML2_SOURCE_DIR}/include") +target_include_directories(_libxml2 BEFORE PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/linux_x86_64/include") +target_include_directories(_libxml2 BEFORE PUBLIC "${LIBXML2_SOURCE_DIR}/include") -add_library(ch_contrib::libxml2 ALIAS libxml2) +add_library(ch_contrib::libxml2 ALIAS _libxml2) diff --git a/contrib/lz4 b/contrib/lz4 index f39b79fb029..4c9431e9af5 160000 --- a/contrib/lz4 +++ b/contrib/lz4 @@ -1 +1 @@ -Subproject commit f39b79fb02962a1cd880bbdecb6dffba4f754a11 +Subproject commit 4c9431e9af596af0556e5da0ae99305bafb2b10b diff --git a/contrib/magic-enum-cmake/CMakeLists.txt b/contrib/magic-enum-cmake/CMakeLists.txt index 86f92d6c2b4..f1face02de1 100644 --- a/contrib/magic-enum-cmake/CMakeLists.txt +++ b/contrib/magic-enum-cmake/CMakeLists.txt @@ -1,4 +1,4 @@ -set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/magic_enum") -add_library (magic_enum INTERFACE) -target_include_directories(magic_enum SYSTEM INTERFACE ${LIBRARY_DIR}/include) -add_library(ch_contrib::magic_enum ALIAS magic_enum) +set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/magic_enum") +add_library(_magic_enum INTERFACE) +target_include_directories(_magic_enum SYSTEM 
INTERFACE ${LIBRARY_DIR}/include) +add_library(ch_contrib::magic_enum ALIAS _magic_enum) diff --git a/contrib/mariadb-connector-c-cmake/CMakeLists.txt b/contrib/mariadb-connector-c-cmake/CMakeLists.txt index 5e337bc4621..1d9f750acee 100644 --- a/contrib/mariadb-connector-c-cmake/CMakeLists.txt +++ b/contrib/mariadb-connector-c-cmake/CMakeLists.txt @@ -239,12 +239,12 @@ endif() set(LIBMARIADB_SOURCES ${LIBMARIADB_SOURCES} ${CC_SOURCE_DIR}/libmariadb/mariadb_async.c ${CC_SOURCE_DIR}/libmariadb/ma_context.c) -add_library(mariadbclient STATIC ${LIBMARIADB_SOURCES}) -target_link_libraries(mariadbclient ${SYSTEM_LIBS}) +add_library(_mariadbclient STATIC ${LIBMARIADB_SOURCES}) +target_link_libraries(_mariadbclient ${SYSTEM_LIBS}) -target_include_directories(mariadbclient PRIVATE ${CC_BINARY_DIR}/include-private) -target_include_directories(mariadbclient SYSTEM PUBLIC ${CC_BINARY_DIR}/include-public ${CC_SOURCE_DIR}/include ${CC_SOURCE_DIR}/libmariadb) +target_include_directories(_mariadbclient PRIVATE ${CC_BINARY_DIR}/include-private) +target_include_directories(_mariadbclient SYSTEM PUBLIC ${CC_BINARY_DIR}/include-public ${CC_SOURCE_DIR}/include ${CC_SOURCE_DIR}/libmariadb) -set_target_properties(mariadbclient PROPERTIES IMPORTED_INTERFACE_LINK_LIBRARIES "${SYSTEM_LIBS}") +set_target_properties(_mariadbclient PROPERTIES IMPORTED_INTERFACE_LINK_LIBRARIES "${SYSTEM_LIBS}") -add_library(ch_contrib::mariadbclient ALIAS mariadbclient) +add_library(ch_contrib::mariadbclient ALIAS _mariadbclient) diff --git a/contrib/murmurhash/CMakeLists.txt b/contrib/murmurhash/CMakeLists.txt index 2d9cb3e6382..5b04974d3c5 100644 --- a/contrib/murmurhash/CMakeLists.txt +++ b/contrib/murmurhash/CMakeLists.txt @@ -1,7 +1,8 @@ -add_library(murmurhash +add_library(_murmurhash src/MurmurHash2.cpp src/MurmurHash3.cpp include/MurmurHash2.h include/MurmurHash3.h) -target_include_directories (murmurhash PUBLIC include) +target_include_directories(_murmurhash PUBLIC include) +add_library(ch_contrib::murmurhash ALIAS _murmurhash) diff --git a/contrib/nanodbc-cmake/CMakeLists.txt b/contrib/nanodbc-cmake/CMakeLists.txt index 8b5a70e65df..9ed6c9525b6 100644 --- a/contrib/nanodbc-cmake/CMakeLists.txt +++ b/contrib/nanodbc-cmake/CMakeLists.txt @@ -12,8 +12,7 @@ set (SRCS "${LIBRARY_DIR}/nanodbc/nanodbc.cpp" ) -add_library(nanodbc ${SRCS}) - -target_link_libraries (nanodbc PUBLIC ch_contrib::unixodbc) -target_include_directories (nanodbc SYSTEM PUBLIC "${LIBRARY_DIR}/") -add_library(ch_contrib::nanodbc ALIAS nanodbc) +add_library(_nanodbc ${SRCS}) +target_link_libraries(_nanodbc PUBLIC ch_contrib::unixodbc) +target_include_directories(_nanodbc SYSTEM PUBLIC "${LIBRARY_DIR}/") +add_library(ch_contrib::nanodbc ALIAS _nanodbc) diff --git a/contrib/nlp-data b/contrib/nlp-data new file mode 160000 index 00000000000..5591f91f5e7 --- /dev/null +++ b/contrib/nlp-data @@ -0,0 +1 @@ +Subproject commit 5591f91f5e748cba8fb9ef81564176feae774853 diff --git a/contrib/nlp-data-cmake/CMakeLists.txt b/contrib/nlp-data-cmake/CMakeLists.txt new file mode 100644 index 00000000000..5380269c479 --- /dev/null +++ b/contrib/nlp-data-cmake/CMakeLists.txt @@ -0,0 +1,15 @@ +include(${ClickHouse_SOURCE_DIR}/cmake/embed_binary.cmake) + +set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/nlp-data") + +add_library (_nlp_data INTERFACE) + +clickhouse_embed_binaries( + TARGET nlp_dictionaries + RESOURCE_DIR "${LIBRARY_DIR}" + RESOURCES charset.zst tonality_ru.zst programming.zst +) + +add_dependencies(_nlp_data nlp_dictionaries) +target_link_libraries(_nlp_data 
INTERFACE "-Wl,${WHOLE_ARCHIVE} $<TARGET_FILE:nlp_dictionaries> -Wl,${NO_WHOLE_ARCHIVE}") +add_library(ch_contrib::nlp_data ALIAS _nlp_data) diff --git a/contrib/orc b/contrib/orc index 0a936f6bbdb..f9a393ed243 160000 --- a/contrib/orc +++ b/contrib/orc @@ -1 +1 @@ -Subproject commit 0a936f6bbdb9303308973073f8623b5a8d82eae1 +Subproject commit f9a393ed2433a60034795284f82d093b348f2102 diff --git a/contrib/poco-cmake/Crypto/CMakeLists.txt b/contrib/poco-cmake/Crypto/CMakeLists.txt index 3c410095d6e..9886a05b21b 100644 --- a/contrib/poco-cmake/Crypto/CMakeLists.txt +++ b/contrib/poco-cmake/Crypto/CMakeLists.txt @@ -29,7 +29,7 @@ if (ENABLE_SSL) target_compile_options (_poco_crypto PRIVATE -Wno-newline-eof) target_include_directories (_poco_crypto SYSTEM PUBLIC "${LIBRARY_DIR}/Crypto/include") - target_link_libraries (_poco_crypto PUBLIC Poco::Foundation ssl crypto) + target_link_libraries (_poco_crypto PUBLIC Poco::Foundation OpenSSL::SSL OpenSSL::Crypto) message (STATUS "Using Poco::Crypto") else () diff --git a/contrib/protobuf-cmake/CMakeLists.txt b/contrib/protobuf-cmake/CMakeLists.txt index b4f026ab074..5e22136fc1f 100644 --- a/contrib/protobuf-cmake/CMakeLists.txt +++ b/contrib/protobuf-cmake/CMakeLists.txt @@ -63,13 +63,13 @@ set(libprotobuf_lite_files ${protobuf_source_dir}/src/google/protobuf/wire_format_lite.cc ) -add_library(libprotobuf-lite ${libprotobuf_lite_files}) -target_link_libraries(libprotobuf-lite pthread) +add_library(_libprotobuf-lite ${libprotobuf_lite_files}) +target_link_libraries(_libprotobuf-lite pthread) if(${CMAKE_SYSTEM_NAME} STREQUAL "Android") - target_link_libraries(libprotobuf-lite log) + target_link_libraries(_libprotobuf-lite log) endif() -target_include_directories(libprotobuf-lite SYSTEM PUBLIC ${protobuf_source_dir}/src) -add_library(protobuf::libprotobuf-lite ALIAS libprotobuf-lite) +target_include_directories(_libprotobuf-lite SYSTEM PUBLIC ${protobuf_source_dir}/src) +add_library(protobuf::libprotobuf-lite ALIAS _libprotobuf-lite) set(libprotobuf_files @@ -127,17 +127,17 @@ set(libprotobuf_files ${protobuf_source_dir}/src/google/protobuf/wrappers.pb.cc ) -add_library(libprotobuf ${libprotobuf_lite_files} ${libprotobuf_files}) +add_library(_libprotobuf ${libprotobuf_lite_files} ${libprotobuf_files}) if (ENABLE_FUZZING) - target_compile_options(libprotobuf PRIVATE "-fsanitize-recover=all") + target_compile_options(_libprotobuf PRIVATE "-fsanitize-recover=all") endif() -target_link_libraries(libprotobuf pthread) -target_link_libraries(libprotobuf ch_contrib::zlib) +target_link_libraries(_libprotobuf pthread) +target_link_libraries(_libprotobuf ch_contrib::zlib) if(${CMAKE_SYSTEM_NAME} STREQUAL "Android") - target_link_libraries(libprotobuf log) + target_link_libraries(_libprotobuf log) endif() -target_include_directories(libprotobuf SYSTEM PUBLIC ${protobuf_source_dir}/src) -add_library(protobuf::libprotobuf ALIAS libprotobuf) +target_include_directories(_libprotobuf SYSTEM PUBLIC ${protobuf_source_dir}/src) +add_library(protobuf::libprotobuf ALIAS _libprotobuf) set(libprotoc_files @@ -226,9 +226,9 @@ set(libprotoc_files ${protobuf_source_dir}/src/google/protobuf/compiler/zip_writer.cc ) -add_library(libprotoc ${libprotoc_files}) -target_link_libraries(libprotoc libprotobuf) -add_library(protobuf::libprotoc ALIAS libprotoc) +add_library(_libprotoc ${libprotoc_files}) +target_link_libraries(_libprotoc _libprotobuf) +add_library(protobuf::libprotoc ALIAS _libprotoc) set(protoc_files ${protobuf_source_dir}/src/google/protobuf/compiler/main.cc) @@ -236,7 +236,7 @@ if
(CMAKE_HOST_SYSTEM_NAME STREQUAL CMAKE_SYSTEM_NAME AND CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL CMAKE_SYSTEM_PROCESSOR) add_executable(protoc ${protoc_files}) - target_link_libraries(protoc libprotoc libprotobuf pthread) + target_link_libraries(protoc _libprotoc _libprotobuf pthread) add_executable(protobuf::protoc ALIAS protoc) if (ENABLE_FUZZING) @@ -319,11 +319,11 @@ endif () include("${ClickHouse_SOURCE_DIR}/contrib/protobuf-cmake/protobuf_generate.cmake") add_library(_protobuf INTERFACE) -target_link_libraries(_protobuf INTERFACE libprotobuf) +target_link_libraries(_protobuf INTERFACE _libprotobuf) target_include_directories(_protobuf INTERFACE "${Protobuf_INCLUDE_DIR}") add_library(ch_contrib::protobuf ALIAS _protobuf) add_library(_protoc INTERFACE) -target_link_libraries(_protoc INTERFACE libprotoc libprotobuf) +target_link_libraries(_protoc INTERFACE _libprotoc _libprotobuf) target_include_directories(_protoc INTERFACE "${Protobuf_INCLUDE_DIR}") add_library(ch_contrib::protoc ALIAS _protoc) diff --git a/contrib/re2-cmake/CMakeLists.txt b/contrib/re2-cmake/CMakeLists.txt index f348f412f62..e74f488643d 100644 --- a/contrib/re2-cmake/CMakeLists.txt +++ b/contrib/re2-cmake/CMakeLists.txt @@ -71,5 +71,7 @@ foreach (FILENAME mutex.h) add_dependencies (re2_st transform_${FILENAME}) endforeach () +# NOTE: you should not change name of library here, since it is used for PVS +# (see docker/test/pvs/Dockerfile), to generate required header (see above) add_library(ch_contrib::re2 ALIAS re2) add_library(ch_contrib::re2_st ALIAS re2_st) diff --git a/contrib/replxx-cmake/CMakeLists.txt b/contrib/replxx-cmake/CMakeLists.txt index 575d044b1d7..8487ad520bc 100644 --- a/contrib/replxx-cmake/CMakeLists.txt +++ b/contrib/replxx-cmake/CMakeLists.txt @@ -20,11 +20,11 @@ set(SRCS "${LIBRARY_DIR}/src/wcwidth.cpp" ) -add_library (replxx ${SRCS}) -target_include_directories(replxx SYSTEM PUBLIC "${LIBRARY_DIR}/include") +add_library (_replxx ${SRCS}) +target_include_directories(_replxx SYSTEM PUBLIC "${LIBRARY_DIR}/include") if (COMPILER_CLANG) - target_compile_options(replxx PRIVATE -Wno-documentation) + target_compile_options(_replxx PRIVATE -Wno-documentation) endif () -add_library(ch_contrib::replxx ALIAS replxx) +add_library(ch_contrib::replxx ALIAS _replxx) diff --git a/contrib/sentry-native-cmake/CMakeLists.txt b/contrib/sentry-native-cmake/CMakeLists.txt index 18cbc9a2df8..520fa176b91 100644 --- a/contrib/sentry-native-cmake/CMakeLists.txt +++ b/contrib/sentry-native-cmake/CMakeLists.txt @@ -49,17 +49,16 @@ else() list(APPEND SRCS ${SRC_DIR}/src/modulefinder/sentry_modulefinder_linux.c) endif() -add_library(sentry ${SRCS}) -add_library(sentry::sentry ALIAS sentry) +add_library(_sentry ${SRCS}) if(BUILD_SHARED_LIBS) - target_compile_definitions(sentry PRIVATE SENTRY_BUILD_SHARED) + target_compile_definitions(_sentry PRIVATE SENTRY_BUILD_SHARED) else() - target_compile_definitions(sentry PUBLIC SENTRY_BUILD_STATIC) + target_compile_definitions(_sentry PUBLIC SENTRY_BUILD_STATIC) endif() -target_link_libraries(sentry PRIVATE ch_contrib::curl pthread) -target_include_directories(sentry PUBLIC "${SRC_DIR}/include" PRIVATE "${SRC_DIR}/src") -target_compile_definitions(sentry PRIVATE SENTRY_WITH_INPROC_BACKEND SIZEOF_LONG=8) +target_link_libraries(_sentry PRIVATE ch_contrib::curl pthread) +target_include_directories(_sentry PUBLIC "${SRC_DIR}/include" PRIVATE "${SRC_DIR}/src") +target_compile_definitions(_sentry PRIVATE SENTRY_WITH_INPROC_BACKEND SIZEOF_LONG=8) -add_library(ch_contrib::sentry ALIAS sentry) 
+add_library(ch_contrib::sentry ALIAS _sentry) diff --git a/contrib/sqlite-cmake/CMakeLists.txt b/contrib/sqlite-cmake/CMakeLists.txt index ea4c3b8e497..7559dd4c184 100644 --- a/contrib/sqlite-cmake/CMakeLists.txt +++ b/contrib/sqlite-cmake/CMakeLists.txt @@ -9,6 +9,6 @@ set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/sqlite-amalgamation") set(SRCS ${LIBRARY_DIR}/sqlite3.c) -add_library(sqlite ${SRCS}) -target_include_directories(sqlite SYSTEM PUBLIC "${LIBRARY_DIR}") -add_library(ch_contrib::sqlite ALIAS sqlite) +add_library(_sqlite ${SRCS}) +target_include_directories(_sqlite SYSTEM PUBLIC "${LIBRARY_DIR}") +add_library(ch_contrib::sqlite ALIAS _sqlite) diff --git a/contrib/stats-cmake/CMakeLists.txt b/contrib/stats-cmake/CMakeLists.txt deleted file mode 100644 index 8279e49c3f0..00000000000 --- a/contrib/stats-cmake/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -# The stats is a header-only library of probability density functions, -# cumulative distribution functions, quantile functions, and random sampling methods. -set(STATS_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/stats/include") -set(GCEM_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/gcem/include") - -add_library(stats INTERFACE) - -target_include_directories(stats SYSTEM INTERFACE ${STATS_INCLUDE_DIR}) -target_include_directories(stats SYSTEM INTERFACE ${GCEM_INCLUDE_DIR}) diff --git a/contrib/unixodbc-cmake/CMakeLists.txt b/contrib/unixodbc-cmake/CMakeLists.txt index 9adf9ce6d79..b594ead3ba0 100644 --- a/contrib/unixodbc-cmake/CMakeLists.txt +++ b/contrib/unixodbc-cmake/CMakeLists.txt @@ -29,9 +29,9 @@ set (SRCS_LTDL "${LIBRARY_DIR}/libltdl/loaders/preopen.c" ) -add_library (ltdl ${SRCS_LTDL}) +add_library (_ltdl ${SRCS_LTDL}) -target_include_directories(ltdl +target_include_directories(_ltdl SYSTEM PRIVATE linux_x86_64/libltdl @@ -39,8 +39,8 @@ target_include_directories(ltdl "${LIBRARY_DIR}/libltdl" "${LIBRARY_DIR}/libltdl/libltdl" ) -target_compile_definitions(ltdl PRIVATE -DHAVE_CONFIG_H -DLTDL -DLTDLOPEN=libltdlc) -target_compile_options(ltdl PRIVATE -Wno-constant-logical-operand -Wno-unknown-warning-option -O2) +target_compile_definitions(_ltdl PRIVATE -DHAVE_CONFIG_H -DLTDL -DLTDLOPEN=libltdlc) +target_compile_options(_ltdl PRIVATE -Wno-constant-logical-operand -Wno-unknown-warning-option -O2) # odbc @@ -279,13 +279,13 @@ set (SRCS "${LIBRARY_DIR}/odbcinst/SQLWritePrivateProfileString.c" ) -add_library (unixodbc ${SRCS}) +add_library (_unixodbc ${SRCS}) -target_link_libraries (unixodbc PRIVATE ltdl) +target_link_libraries (_unixodbc PRIVATE _ltdl) # SYSTEM_FILE_PATH was changed to /etc -target_include_directories (unixodbc +target_include_directories (_unixodbc SYSTEM PRIVATE linux_x86_64/private @@ -293,8 +293,8 @@ target_include_directories (unixodbc linux_x86_64 "${LIBRARY_DIR}/include" ) -target_compile_definitions (unixodbc PRIVATE -DHAVE_CONFIG_H) -target_compile_options (unixodbc +target_compile_definitions (_unixodbc PRIVATE -DHAVE_CONFIG_H) +target_compile_options (_unixodbc PRIVATE -Wno-dangling-else -Wno-parentheses @@ -304,4 +304,4 @@ target_compile_options (unixodbc -O2 ) -add_library (ch_contrib::unixodbc ALIAS unixodbc) +add_library (ch_contrib::unixodbc ALIAS _unixodbc) diff --git a/contrib/wordnet-blast-cmake/CMakeLists.txt b/contrib/wordnet-blast-cmake/CMakeLists.txt index ec4bdee48e5..40712ecd2c5 100644 --- a/contrib/wordnet-blast-cmake/CMakeLists.txt +++ b/contrib/wordnet-blast-cmake/CMakeLists.txt @@ -6,7 +6,7 @@ set(SRCS "${LIBRARY_DIR}/wnb/core/wordnet.cc" ) -add_library(wnb ${SRCS}) 
-target_link_libraries(wnb PRIVATE boost::headers_only boost::graph) -target_include_directories(wnb SYSTEM PUBLIC "${LIBRARY_DIR}") -add_library(ch_contrib::wnb ALIAS wnb) +add_library(_wnb ${SRCS}) +target_link_libraries(_wnb PRIVATE boost::headers_only boost::graph) +target_include_directories(_wnb SYSTEM PUBLIC "${LIBRARY_DIR}") +add_library(ch_contrib::wnb ALIAS _wnb) diff --git a/contrib/yaml-cpp-cmake/CMakeLists.txt b/contrib/yaml-cpp-cmake/CMakeLists.txt index 1681bfe4015..00e85f90932 100644 --- a/contrib/yaml-cpp-cmake/CMakeLists.txt +++ b/contrib/yaml-cpp-cmake/CMakeLists.txt @@ -39,9 +39,9 @@ set (SRCS ${LIBRARY_DIR}/src/scantag.cpp ) -add_library (yaml-cpp ${SRCS}) +add_library (_yaml_cpp ${SRCS}) -target_include_directories(yaml-cpp PRIVATE ${LIBRARY_DIR}/include/yaml-cpp) -target_include_directories(yaml-cpp SYSTEM BEFORE PUBLIC ${LIBRARY_DIR}/include) +target_include_directories(_yaml_cpp PRIVATE ${LIBRARY_DIR}/include/yaml-cpp) +target_include_directories(_yaml_cpp SYSTEM BEFORE PUBLIC ${LIBRARY_DIR}/include) -add_library (ch_contrib::yaml_cpp ALIAS yaml-cpp) +add_library (ch_contrib::yaml_cpp ALIAS _yaml_cpp) diff --git a/contrib/zlib-ng-cmake/CMakeLists.txt b/contrib/zlib-ng-cmake/CMakeLists.txt index 0b7564c6420..371a07dd31a 100644 --- a/contrib/zlib-ng-cmake/CMakeLists.txt +++ b/contrib/zlib-ng-cmake/CMakeLists.txt @@ -131,7 +131,6 @@ set(ZLIB_SRCS set(ZLIB_ALL_SRCS ${ZLIB_SRCS} ${ZLIB_ARCH_SRCS}) add_library(_zlib ${ZLIB_ALL_SRCS}) -add_library(zlibstatic ALIAS _zlib) add_library(ch_contrib::zlib ALIAS _zlib) # https://github.com/zlib-ng/zlib-ng/pull/733 diff --git a/docker/server/entrypoint.sh b/docker/server/entrypoint.sh index 103f2fdb47f..20ad0e03bfe 100755 --- a/docker/server/entrypoint.sh +++ b/docker/server/entrypoint.sh @@ -65,7 +65,12 @@ do # check if variable not empty [ -z "$dir" ] && continue # ensure directories exist - if ! mkdir -p "$dir"; then + if [ "$DO_CHOWN" = "1" ]; then + mkdir="mkdir" + else + mkdir="$gosu mkdir" + fi + if ! $mkdir -p "$dir"; then echo "Couldn't create necessary directory: $dir" exit 1 fi diff --git a/docs/en/engines/database-engines/materialized-mysql.md b/docs/en/engines/database-engines/materialized-mysql.md index cdc904f1e94..bcb026aa0dc 100644 --- a/docs/en/engines/database-engines/materialized-mysql.md +++ b/docs/en/engines/database-engines/materialized-mysql.md @@ -78,15 +78,21 @@ When working with the `MaterializedMySQL` database engine, [ReplacingMergeTree]( | DATE, NEWDATE | [Date](../../sql-reference/data-types/date.md) | | DATETIME, TIMESTAMP | [DateTime](../../sql-reference/data-types/datetime.md) | | DATETIME2, TIMESTAMP2 | [DateTime64](../../sql-reference/data-types/datetime64.md) | +| YEAR | [UInt16](../../sql-reference/data-types/int-uint.md) | +| TIME | [Int64](../../sql-reference/data-types/int-uint.md) | | ENUM | [Enum](../../sql-reference/data-types/enum.md) | | STRING | [String](../../sql-reference/data-types/string.md) | | VARCHAR, VAR_STRING | [String](../../sql-reference/data-types/string.md) | | BLOB | [String](../../sql-reference/data-types/string.md) | +| GEOMETRY | [String](../../sql-reference/data-types/string.md) | | BINARY | [FixedString](../../sql-reference/data-types/fixedstring.md) | | BIT | [UInt64](../../sql-reference/data-types/int-uint.md) | +| SET | [UInt64](../../sql-reference/data-types/int-uint.md) | [Nullable](../../sql-reference/data-types/nullable.md) is supported. +The data of TIME type in MySQL is converted to microseconds in ClickHouse. + Other types are not supported. 
If MySQL table contains a column of such type, ClickHouse throws exception "Unhandled data type" and stops replication. ## Specifics and Recommendations {#specifics-and-recommendations} diff --git a/docs/en/interfaces/third-party/client-libraries.md b/docs/en/interfaces/third-party/client-libraries.md index a116c8e2222..8d1ff12cf0a 100644 --- a/docs/en/interfaces/third-party/client-libraries.md +++ b/docs/en/interfaces/third-party/client-libraries.md @@ -27,6 +27,7 @@ toc_title: Client Libraries - Go - [clickhouse](https://github.com/kshvakov/clickhouse/) - [go-clickhouse](https://github.com/roistat/go-clickhouse) + - [chconn](https://github.com/vahid-sohrabloo/chconn) - [mailrugo-clickhouse](https://github.com/mailru/go-clickhouse) - [golang-clickhouse](https://github.com/leprosus/golang-clickhouse) - Swift diff --git a/docs/en/introduction/adopters.md b/docs/en/introduction/adopters.md index b67e373be35..5efa1b971bc 100644 --- a/docs/en/introduction/adopters.md +++ b/docs/en/introduction/adopters.md @@ -105,10 +105,13 @@ toc_title: Adopters | MindsDB | Machine Learning | Main Product | — | — | [Official Website](https://www.mindsdb.com/blog/machine-learning-models-as-tables-in-ch) | | MUX | Online Video | Video Analytics | — | — | [Talk in English, August 2019](https://altinity.com/presentations/2019/8/13/how-clickhouse-became-the-default-analytics-database-for-mux/) | | MGID | Ad network | Web-analytics | — | — | [Blog post in Russian, April 2020](http://gs-studio.com/news-about-it/32777----clickhouse---c) | +| Muse Group | Music Software | Performance Monitoring | — | — | [Blog post in Russian, January 2021](https://habr.com/en/post/647079/) | | Netskope | Network Security | — | — | — | [Job advertisement, March 2021](https://www.mendeley.com/careers/job/senior-software-developer-backend-developer-1346348) | | NIC Labs | Network Monitoring | RaTA-DNS | — | — | [Blog post, March 2021](https://niclabs.cl/ratadns/2021/03/Clickhouse) | +| NLMK | Steel | Monitoring | — | — | [Article in Russian, Jan 2022](https://habr.com/en/company/nlmk/blog/645943/) | | NOC Project | Network Monitoring | Analytics | Main Product | — | [Official Website](https://getnoc.com/features/big-data/) | | Noction | Network Technology | Main Product | — | — | [Official Website](https://www.noction.com/news/irp-3-11-remote-triggered-blackholing-capability) +| ntop | Network Monitoning | Monitoring | — | — | [Official website, Jan 2022](https://www.ntop.org/ntop/historical-traffic-analysis-at-scale-using-clickhouse-with-ntopng/) | | Nuna Inc. 
| Health Data Analytics | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=170) | | Ok.ru | Social Network | — | 72 servers | 810 TB compressed, 50bn rows/day, 1.5 TB/day | [SmartData conference, October 2021](https://assets.ctfassets.net/oxjq45e8ilak/4JPHkbJenLgZhBGGyyonFP/57472ec6987003ec4078d0941740703b/____________________ClickHouse_______________________.pdf) | | Omnicomm | Transportation Monitoring | — | — | — | [Facebook post, October 2021](https://www.facebook.com/OmnicommTeam/posts/2824479777774500) | @@ -190,5 +193,6 @@ toc_title: Adopters | Цифровой Рабочий | Industrial IoT, Analytics | — | — | — | [Blog post in Russian, March 2021](https://habr.com/en/company/croc/blog/548018/) | | ООО «МПЗ Богородский» | Agriculture | — | — | — | [Article in Russian, November 2020](https://cloud.yandex.ru/cases/okraina) | | ДомКлик | Real Estate | — | — | — | [Article in Russian, October 2021](https://habr.com/ru/company/domclick/blog/585936/) | +| АС "Стрела" | Transportation | — | — | — | [Job posting, Jan 2022](https://vk.com/topic-111905078_35689124?post=3553) | [Original article](https://clickhouse.com/docs/en/introduction/adopters/) diff --git a/docs/en/operations/tips.md b/docs/en/operations/tips.md index 477d3b52965..64e65575f3f 100644 --- a/docs/en/operations/tips.md +++ b/docs/en/operations/tips.md @@ -129,6 +129,10 @@ If you want to divide an existing ZooKeeper cluster into two, the correct way is Do not run ZooKeeper on the same servers as ClickHouse. Because ZooKeeper is very sensitive for latency and ClickHouse may utilize all available system resources. +You can have ZooKeeper observers in an ensemble but ClickHouse servers should not interact with observers. + +Do not change `minSessionTimeout` setting, large values may affect ClickHouse restart stability. + With the default settings, ZooKeeper is a time bomb: > The ZooKeeper server won’t delete files from old snapshots and logs when using the default configuration (see autopurge), and this is the responsibility of the operator. diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md index bee77a382d7..0d1c4535b28 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md @@ -159,7 +159,7 @@ Configuration fields: | Tag | Description | Required | |------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------| | `name` | Column name. 
| Yes | -| `type` | ClickHouse data type: [UInt8](../../../sql-reference/data-types/int-uint.md), [UInt16](../../../sql-reference/data-types/int-uint.md), [UInt32](../../../sql-reference/data-types/int-uint.md), [UInt64](../../../sql-reference/data-types/int-uint.md), [Int8](../../../sql-reference/data-types/int-uint.md), [Int16](../../../sql-reference/data-types/int-uint.md), [Int32](../../../sql-reference/data-types/int-uint.md), [Int64](../../../sql-reference/data-types/int-uint.md), [Float32](../../../sql-reference/data-types/float.md), [Float64](../../../sql-reference/data-types/float.md), [UUID](../../../sql-reference/data-types/uuid.md), [Decimal32](../../../sql-reference/data-types/decimal.md), [Decimal64](../../../sql-reference/data-types/decimal.md), [Decimal128](../../../sql-reference/data-types/decimal.md), [Decimal256](../../../sql-reference/data-types/decimal.md), [String](../../../sql-reference/data-types/string.md), [Array](../../../sql-reference/data-types/array.md).
ClickHouse tries to cast value from dictionary to the specified data type. For example, for MySQL, the field might be `TEXT`, `VARCHAR`, or `BLOB` in the MySQL source table, but it can be uploaded as `String` in ClickHouse.
[Nullable](../../../sql-reference/data-types/nullable.md) is currently supported for [Flat](external-dicts-dict-layout.md#flat), [Hashed](external-dicts-dict-layout.md#dicts-external_dicts_dict_layout-hashed), [ComplexKeyHashed](external-dicts-dict-layout.md#complex-key-hashed), [Direct](external-dicts-dict-layout.md#direct), [ComplexKeyDirect](external-dicts-dict-layout.md#complex-key-direct), [RangeHashed](external-dicts-dict-layout.md#range-hashed), [Polygon](external-dicts-dict-polygon.md), [Cache](external-dicts-dict-layout.md#cache), [ComplexKeyCache](external-dicts-dict-layout.md#complex-key-cache), [SSDCache](external-dicts-dict-layout.md#ssd-cache), [SSDComplexKeyCache](external-dicts-dict-layout.md#complex-key-ssd-cache) dictionaries. In [IPTrie](external-dicts-dict-layout.md#ip-trie) dictionaries `Nullable` types are not supported. | Yes | +| `type` | ClickHouse data type: [UInt8](../../../sql-reference/data-types/int-uint.md), [UInt16](../../../sql-reference/data-types/int-uint.md), [UInt32](../../../sql-reference/data-types/int-uint.md), [UInt64](../../../sql-reference/data-types/int-uint.md), [Int8](../../../sql-reference/data-types/int-uint.md), [Int16](../../../sql-reference/data-types/int-uint.md), [Int32](../../../sql-reference/data-types/int-uint.md), [Int64](../../../sql-reference/data-types/int-uint.md), [Float32](../../../sql-reference/data-types/float.md), [Float64](../../../sql-reference/data-types/float.md), [UUID](../../../sql-reference/data-types/uuid.md), [Decimal32](../../../sql-reference/data-types/decimal.md), [Decimal64](../../../sql-reference/data-types/decimal.md), [Decimal128](../../../sql-reference/data-types/decimal.md), [Decimal256](../../../sql-reference/data-types/decimal.md),[Date](../../../sql-reference/data-types/date.md), [Date32](../../../sql-reference/data-types/date32.md), [DateTime](../../../sql-reference/data-types/datetime.md), [DateTime64](../../../sql-reference/data-types/datetime64.md), [String](../../../sql-reference/data-types/string.md), [Array](../../../sql-reference/data-types/array.md).
ClickHouse tries to cast value from dictionary to the specified data type. For example, for MySQL, the field might be `TEXT`, `VARCHAR`, or `BLOB` in the MySQL source table, but it can be uploaded as `String` in ClickHouse.
[Nullable](../../../sql-reference/data-types/nullable.md) is currently supported for [Flat](external-dicts-dict-layout.md#flat), [Hashed](external-dicts-dict-layout.md#dicts-external_dicts_dict_layout-hashed), [ComplexKeyHashed](external-dicts-dict-layout.md#complex-key-hashed), [Direct](external-dicts-dict-layout.md#direct), [ComplexKeyDirect](external-dicts-dict-layout.md#complex-key-direct), [RangeHashed](external-dicts-dict-layout.md#range-hashed), [Polygon](external-dicts-dict-polygon.md), [Cache](external-dicts-dict-layout.md#cache), [ComplexKeyCache](external-dicts-dict-layout.md#complex-key-cache), [SSDCache](external-dicts-dict-layout.md#ssd-cache), [SSDComplexKeyCache](external-dicts-dict-layout.md#complex-key-ssd-cache) dictionaries. In [IPTrie](external-dicts-dict-layout.md#ip-trie) dictionaries `Nullable` types are not supported. | Yes | | `null_value` | Default value for a non-existing element.
In the example, it is an empty string. [NULL](../../syntax.md#null-literal) value can be used only for the `Nullable` types (see the previous line with types description). | Yes | | `expression` | [Expression](../../../sql-reference/syntax.md#syntax-expressions) that ClickHouse executes on the value.
The expression can be a column name in the remote SQL database. Thus, you can use it to create an alias for the remote column.

Default value: no expression. | No | | `hierarchical` | If `true`, the attribute contains the value of a parent key for the current key. See [Hierarchical Dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md).

Default value: `false`. | No | diff --git a/docs/en/sql-reference/functions/encoding-functions.md b/docs/en/sql-reference/functions/encoding-functions.md index 69dd14da1bf..ec1524f1fa3 100644 --- a/docs/en/sql-reference/functions/encoding-functions.md +++ b/docs/en/sql-reference/functions/encoding-functions.md @@ -93,6 +93,8 @@ For [String](../../sql-reference/data-types/string.md) and [FixedString](../../s Values of [Float](../../sql-reference/data-types/float.md) and [Decimal](../../sql-reference/data-types/decimal.md) types are encoded as their representation in memory. As we support little-endian architecture, they are encoded in little-endian. Zero leading/trailing bytes are not omitted. +Values of [UUID](../data-types/uuid.md) type are encoded as big-endian order string. + **Arguments** - `arg` — A value to convert to hexadecimal. Types: [String](../../sql-reference/data-types/string.md), [UInt](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md), [Decimal](../../sql-reference/data-types/decimal.md), [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). @@ -147,6 +149,21 @@ Result: └──────────────────┘ ``` +Query: + +``` sql +SELECT lower(hex(toUUID('61f0c404-5cb3-11e7-907b-a6006ad3dba0'))) as uuid_hex +``` + +Result: + +``` text +┌─uuid_hex─────────────────────────┐ +│ 61f0c4045cb311e7907ba6006ad3dba0 │ +└──────────────────────────────────┘ +``` + + ## unhex {#unhexstr} Performs the opposite operation of [hex](#hex). It interprets each pair of hexadecimal digits (in the argument) as a number and converts it to the byte represented by the number. The return value is a binary string (BLOB). @@ -224,6 +241,8 @@ For [String](../../sql-reference/data-types/string.md) and [FixedString](../../s Values of [Float](../../sql-reference/data-types/float.md) and [Decimal](../../sql-reference/data-types/decimal.md) types are encoded as their representation in memory. As we support little-endian architecture, they are encoded in little-endian. Zero leading/trailing bytes are not omitted. +Values of [UUID](../data-types/uuid.md) type are encoded as big-endian order string. + **Arguments** - `arg` — A value to convert to binary. [String](../../sql-reference/data-types/string.md), [FixedString](../../sql-reference/data-types/fixedstring.md), [UInt](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md), [Decimal](../../sql-reference/data-types/decimal.md), [Date](../../sql-reference/data-types/date.md), or [DateTime](../../sql-reference/data-types/datetime.md). @@ -280,6 +299,21 @@ Result: └──────────────────────────────────────────────────────────────────┘ ``` +Query: + +``` sql +SELECT bin(toUUID('61f0c404-5cb3-11e7-907b-a6006ad3dba0')) as bin_uuid +``` + +Result: + +``` text +┌─bin_uuid─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ 01100001111100001100010000000100010111001011001100010001111001111001000001111011101001100000000001101010110100111101101110100000 │ +└──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +``` + + ## unbin {#unbinstr} Interprets each pair of binary digits (in the argument) as a number and converts it to the byte represented by the number. The functions performs the opposite operation to [bin](#bin). 
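The encoding-functions changes above state that UUID values are hex- and bin-encoded in big-endian byte order, and that `unhex`/`unbin` perform the reverse conversion. A minimal round-trip sketch (illustrative only, using functions already documented on that page and the UUID from its examples) shows the inverse relationship:

``` sql
-- unhex turns the 32 hex characters back into 16 raw bytes;
-- re-encoding them with hex reproduces the original string (case aside).
SELECT
    length(unhex(lower(hex(toUUID('61f0c404-5cb3-11e7-907b-a6006ad3dba0'))))) AS byte_length,
    lower(hex(unhex('61f0c4045cb311e7907ba6006ad3dba0'))) AS round_trip;
```

Expected output: `byte_length` is 16 and `round_trip` equals the original lowercase hex string.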
diff --git a/docs/en/sql-reference/functions/geo/h3.md b/docs/en/sql-reference/functions/geo/h3.md index 1aa5a3b739a..ecbe00adfd7 100644 --- a/docs/en/sql-reference/functions/geo/h3.md +++ b/docs/en/sql-reference/functions/geo/h3.md @@ -156,6 +156,40 @@ Result: └─────────────┘ ``` +## h3EdgeLengthKm {#h3edgelengthkm} + +Calculates the average length of the [H3](#h3index) hexagon edge in kilometers. + +**Syntax** + +``` sql +h3EdgeLengthKm(resolution) +``` + +**Parameter** + +- `resolution` — Index resolution. Type: [UInt8](../../../sql-reference/data-types/int-uint.md). Range: `[0, 15]`. + +**Returned values** + +- The average length of the [H3](#h3index) hexagon edge in kilometers. Type: [Float64](../../../sql-reference/data-types/float.md). + +**Example** + +Query: + +``` sql +SELECT h3EdgeLengthKm(15) AS edgeLengthKm; +``` + +Result: + +``` text +┌─edgeLengthKm─┐ +│ 0.000509713 │ +└──────────────┘ +``` + ## geoToH3 {#geotoh3} Returns [H3](#h3index) point index `(lon, lat)` with specified resolution. @@ -849,4 +883,147 @@ Result: └────────────────────┘ ``` +## h3ExactEdgeLengthM {#h3exactedgelengthm} + +Returns the exact edge length of the unidirectional edge represented by the input h3 index in meters. + +**Syntax** + +``` sql +h3ExactEdgeLengthM(index) +``` + +**Parameter** + +- `index` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Exact edge length in meters. + +Type: [Float64](../../../sql-reference/data-types/float.md). + +**Example** + +Query: + +``` sql +SELECT h3ExactEdgeLengthM(1310277011704381439) AS exactEdgeLengthM;; +``` + +Result: + +``` text +┌───exactEdgeLengthM─┐ +│ 195449.63163407316 │ +└────────────────────┘ +``` + +## h3ExactEdgeLengthKm {#h3exactedgelengthkm} + +Returns the exact edge length of the unidirectional edge represented by the input h3 index in kilometers. + +**Syntax** + +``` sql +h3ExactEdgeLengthKm(index) +``` + +**Parameter** + +- `index` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Exact edge length in kilometers. + +Type: [Float64](../../../sql-reference/data-types/float.md). + +**Example** + +Query: + +``` sql +SELECT h3ExactEdgeLengthKm(1310277011704381439) AS exactEdgeLengthKm;; +``` + +Result: + +``` text +┌──exactEdgeLengthKm─┐ +│ 195.44963163407317 │ +└────────────────────┘ +``` + +## h3ExactEdgeLengthRads {#h3exactedgelengthrads} + +Returns the exact edge length of the unidirectional edge represented by the input h3 index in radians. + +**Syntax** + +``` sql +h3ExactEdgeLengthRads(index) +``` + +**Parameter** + +- `index` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Exact edge length in radians. + +Type: [Float64](../../../sql-reference/data-types/float.md). + +**Example** + +Query: + +``` sql +SELECT h3ExactEdgeLengthRads(1310277011704381439) AS exactEdgeLengthRads;; +``` + +Result: + +``` text +┌──exactEdgeLengthRads─┐ +│ 0.030677980118976447 │ +└──────────────────────┘ +``` + +## h3NumHexagons {#h3numhexagons} + +Returns the number of unique H3 indices at the given resolution. + +**Syntax** + +``` sql +h3NumHexagons(resolution) +``` + +**Parameter** + +- `resolution` — Index resolution. Range: `[0, 15]`. Type: [UInt8](../../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Number of H3 indices. + +Type: [Int64](../../../sql-reference/data-types/int-uint.md). 
+ +**Example** + +Query: + +``` sql +SELECT h3NumHexagons(3) AS numHexagons; +``` + +Result: + +``` text +┌─numHexagons─┐ +│ 41162 │ +└─────────────┘ +``` [Original article](https://clickhouse.com/docs/en/sql-reference/functions/geo/h3) diff --git a/docs/en/sql-reference/statements/check-table.md b/docs/en/sql-reference/statements/check-table.md index bc89b11ae4d..c9ad40860f7 100644 --- a/docs/en/sql-reference/statements/check-table.md +++ b/docs/en/sql-reference/statements/check-table.md @@ -46,7 +46,7 @@ CHECK TABLE test_table; └───────────┴───────────┴─────────┘ ``` -If `check_query_single_value_result` = 0, the `CHECK TABLE` query shows the general table check status. +If `check_query_single_value_result` = 1, the `CHECK TABLE` query shows the general table check status. ```sql SET check_query_single_value_result = 1; diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md index c3e54545549..7bbbb6f32bd 100644 --- a/docs/en/sql-reference/statements/create/table.md +++ b/docs/en/sql-reference/statements/create/table.md @@ -252,7 +252,6 @@ CREATE TABLE codec_example ENGINE = MergeTree() ``` - ### Encryption Codecs {#create-query-encryption-codecs} These codecs don't actually compress data, but instead encrypt data on disk. These are only available when an encryption key is specified by [encryption](../../../operations/server-configuration-parameters/settings.md#server-settings-encryption) settings. Note that encryption only makes sense at the end of codec pipelines, because encrypted data usually can't be compressed in any meaningful way. @@ -260,6 +259,7 @@ These codecs don't actually compress data, but instead encrypt data on disk. The Encryption codecs: - `CODEC('AES-128-GCM-SIV')` — Encrypts data with AES-128 in [RFC 8452](https://tools.ietf.org/html/rfc8452) GCM-SIV mode. + - `CODEC('AES-256-GCM-SIV')` — Encrypts data with AES-256 in GCM-SIV mode. These codecs use a fixed nonce and encryption is therefore deterministic. This makes it compatible with deduplicating engines such as [ReplicatedMergeTree](../../../engines/table-engines/mergetree-family/replication.md) but has a weakness: when the same data block is encrypted twice, the resulting ciphertext will be exactly the same so an adversary who can read the disk can see this equivalence (although only the equivalence, without getting its content). @@ -269,7 +269,7 @@ These codecs use a fixed nonce and encryption is therefore deterministic. This m !!! attention "Attention" If you perform a SELECT query mentioning a specific value in an encrypted column (such as in its WHERE clause), the value may appear in [system.query_log](../../../operations/system-tables/query_log.md). You may want to disable the logging. - + **Example** ```sql diff --git a/docs/en/sql-reference/statements/create/user.md b/docs/en/sql-reference/statements/create/user.md index dfa065f5d0a..5dfcf891439 100644 --- a/docs/en/sql-reference/statements/create/user.md +++ b/docs/en/sql-reference/statements/create/user.md @@ -43,7 +43,7 @@ User host is a host from which a connection to ClickHouse server could be establ - `HOST ANY` — User can connect from any location. This is a default option. - `HOST LOCAL` — User can connect only locally. - `HOST NAME 'fqdn'` — User host can be specified as FQDN. For example, `HOST NAME 'mysite.com'`. -- `HOST NAME REGEXP 'regexp'` — You can use [pcre](http://www.pcre.org/) regular expressions when specifying user hosts. For example, `HOST NAME REGEXP '.*\.mysite\.com'`. 
+- `HOST REGEXP 'regexp'` — You can use [pcre](http://www.pcre.org/) regular expressions when specifying user hosts. For example, `HOST REGEXP '.*\.mysite\.com'`. - `HOST LIKE 'template'` — Allows you to use the [LIKE](../../../sql-reference/functions/string-search-functions.md#function-like) operator to filter the user hosts. For example, `HOST LIKE '%'` is equivalent to `HOST ANY`, `HOST LIKE '%.mysite.com'` filters all the hosts in the `mysite.com` domain. Another way of specifying host is to use `@` syntax following the username. Examples: diff --git a/docs/en/sql-reference/statements/use.md b/docs/en/sql-reference/statements/use.md index 41cba58bb9d..841c23d333d 100644 --- a/docs/en/sql-reference/statements/use.md +++ b/docs/en/sql-reference/statements/use.md @@ -3,14 +3,14 @@ toc_priority: 53 toc_title: USE --- -# USE 语句 {#use} +# USE Statement {#use} ``` sql USE db ``` -用于设置会话的当前数据库。 +Lets you set the current database for the session. -如果查询语句中没有在表名前面以加点的方式指明数据库名, 则用当前数据库进行搜索。 +The current database is used for searching for tables if the database is not explicitly defined in the query with a dot before the table name. -使用 HTTP 协议时无法进行此查询,因为没有会话的概念。 +This query can’t be made when using the HTTP protocol, since there is no concept of a session. diff --git a/docs/ko/images/column-oriented.gif b/docs/ko/images/column-oriented.gif new file mode 100644 index 00000000000..d5ac7c82848 Binary files /dev/null and b/docs/ko/images/column-oriented.gif differ diff --git a/docs/ko/images/logo.svg b/docs/ko/images/logo.svg new file mode 100644 index 00000000000..b5ab923ff65 --- /dev/null +++ b/docs/ko/images/logo.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/ko/images/play.png b/docs/ko/images/play.png new file mode 100644 index 00000000000..b75aebe4089 Binary files /dev/null and b/docs/ko/images/play.png differ diff --git a/docs/ko/images/row-oriented.gif b/docs/ko/images/row-oriented.gif new file mode 100644 index 00000000000..41395b5693e Binary files /dev/null and b/docs/ko/images/row-oriented.gif differ diff --git a/docs/ko/index.md b/docs/ko/index.md new file mode 100644 index 00000000000..f2a6396c069 --- /dev/null +++ b/docs/ko/index.md @@ -0,0 +1,94 @@ +--- +toc_priority: 0 +toc_title: 목차 +--- + +# ClickHouse란? {#what-is-clickhouse} + +ClickHouse® 는 query의 온라인 분석 처리(OLAP)를 위한 열 지향(column-oriented) 데이터베이스 관리 시스템(DBMS)입니다. + +"보통의" 행 지향(row-oriented) DMBS에서는 데이터가 다음과 같은 순서로 저장됩니다. + +| row | WatchID | JavaEnable | Title | GoodEvent | EventTime | +|-----|-------------|------------|--------------------|-----------|---------------------| +| #0 | 89354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 | +| #1 | 90329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 | +| #2 | 89953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 | +| #N | … | … | … | … | … | + +즉, 행과 관련된 모든 값들은 물리적으로 나란히 저장됩니다. + +행 지향(row-oriented) DMBS의 예시로는 MySQL, Postgres, 그리고 MS SQL 서버 등이 있습니다. + +열 지향 (column-oriented) DBMS에서는 데이터가 아래와 같은 방식으로 저장됩니다: + +| Row: | #0 | #1 | #2 | #N | +|-------------|---------------------|---------------------|---------------------|-----| +| WatchID: | 89354350662 | 90329509958 | 89953706054 | … | +| JavaEnable: | 1 | 0 | 1 | … | +| Title: | Investor Relations | Contact us | Mission | … | +| GoodEvent: | 1 | 1 | 1 | … | +| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | … | + +이 예에서는 데이터가 정렬된 순서만을 보여줍니다. 다른 열의 값들은 서로 분리되어 저장되고, 같은 열의 정보들은 함께 저장됩니다. 
+ +열 지향(column-oriented) DBMS 의 종류는 Vertica, Paraccel (Actian Matrix and Amazon Redshift), Sybase IQ, Exasol, Infobright, InfiniDB, MonetDB (VectorWise and Actian Vector), LucidDB, SAP HANA, Google Dremel, Google PowerDrill, Druid, 그리고 kdb+ 등이 있습니다. + +데이터를 저장하기 위한 서로 다른 순서는 다른 시나리오에 더 적합합니다. 데이터 접근 시나리오는 쿼리가 수행되는 빈도, 비율 및 비율을 나타내거나, 각 쿼리 유형(행, 열 및 바이트)에 대해 읽은 데이터의 양 데이터 읽기와 업데이트 사이의 관계, 데이터의 작업 크기 및 로컬에서 사용되는 방법 트랜잭션이 사용되는지 여부, 트랜잭션이 얼마나 격리되어 있는지, 데이터 복제 및 논리적 무결성에 대한 요구 사항, 각 쿼리 유형에 대한 대기 시간 및 처리량 요구 사항 등이 있습니다. + +시스템의 부하가 높을수록 사용 시나리오의 요구 사항에 맞게 시스템 설정을 사용자 지정하는 것이 더 중요하며 이 사용자 지정은 더욱 세분화됩니다. 상당히 다른 시나리오에 똑같이 적합한 시스템은 없습니다. 만약 높은 부하에서 시스템이 넓은 시나리오 집합에 대해 적응한다면 시스템은 모든 시나리오를 모두 제대로 처리하지 못하거나 가능한 시나리오 중 하나 또는 몇 개에 대해서만 잘 작동할 것입니다. + +## OLAP 시나리오의 중요 속성들 {#key-properties-of-olap-scenario} + +- 요청(request)의 대부분은 읽기 접근에 관한 것입니다. +- 데이터는 단일 행이 아니라 상당히 큰 일괄 처리(\> 1000개 행)로 업데이트됩니다. 또는 전혀 업데이트되지 않습니다. +- 데이터는 DB에 추가되지만 수정되지는 않습니다. +- 읽기의 경우 DB에서 상당히 많은 수의 행이 추출되지만 열은 일부만 추출됩니다. +- 테이블은 "넓습니다". 이는 열의 수가 많다는 것을 의미합니다. +- 쿼리는 상대적으로 드뭅니다(일반적으로 서버당 수백 또는 초당 쿼리 미만). +- 간단한 쿼리의 경우 약 50ms의 대기 시간이 허용됩니다. +- 열 값은 숫자와 짧은 문자열(예: URL당 60바이트)과 같이 상당히 작습니다 +- 단일 쿼리를 처리할 때 높은 처리량이 필요합니다(서버당 초당 최대 수십억 행). +- 트랜잭션이 필요하지 않습니다. +- 데이터 일관성에 대한 요구 사항이 낮습니다. +- 쿼리당 하나의 큰 테이블이 존재하고 하나를 제외한 모든 테이블은 작습니다. +- 쿼리 결과가 원본 데이터보다 훨씬 작습니다. 즉, 데이터가 필터링되거나 집계되므로 결과가 단일 서버의 RAM에 꼭 들어맞습니다. + +OLAP 시나리오가 다른 일반적인 시나리오(OLTP 또는 키-값 액세스와 같은)와 매우 다르다는 것을 쉽게 알 수 있습니다. 따라서 적절한 성능을 얻으려면 분석 쿼리를 처리하기 위해 OLTP 또는 키-값 DB를 사용하는 것은 의미가 없습니다. 예를 들어 분석에 MongoDB나 Redis를 사용하려고 하면 OLAP 데이터베이스에 비해 성능이 매우 저하됩니다. + +## 왜 열 지향 데이터베이스가 OLAP 시나리오에 적합한가{#why-column-oriented-databases-work-better-in-the-olap-scenario} + +열 지향(column-oriented) 데이터베이스는 OLAP 시나리오에 더 적합합니다. 대부분의 쿼리를 처리하는 데 있어서 행 지향(row-oriented) 데이터베이스보다 100배 이상 빠릅니다. 그 이유는 아래에 자세히 설명되어 있지만 사실은 시각적으로 더 쉽게 설명할 수 있습니다. + +**행 지향 DBMS** + +![Row-oriented](images/row-oriented.gif#) + +**열 지향 DBMS** + +![Column-oriented](images/column-oriented.gif#) + +차이가 보이시나요? + +### 입출력 {#inputoutput} + +1. 분석 쿼리의 경우 적은 수의 테이블 열만 읽어야 합니다. 열 지향 데이터베이스에서는 필요한 데이터만 읽을 수 있습니다. 예를 들어 100개 중 5개의 열이 필요한 경우 I/O가 20배 감소할 것으로 예상할 수 있습니다. +2. 데이터는 패킷으로 읽히므로 압축하기가 더 쉽습니다. 열의 데이터도 압축하기 쉽습니다. 이것은 I/O의 볼륨을 더욱 감소시킵니다. +3. 감소된 I/O로 인해 시스템 캐시에 더 많은 데이터가 들어갑니다. + +예를 들어, "각 광고 플랫폼에 대한 레코드 수 계산" 쿼리는 압축되지 않은 1바이트를 차지하는 하나의 "광고 플랫폼 ID" 열을 읽어야 합니다. 트래픽의 대부분이 광고 플랫폼에서 발생하지 않은 경우 이 열의 최소 10배 압축을 기대할 수 있습니다. 빠른 압축 알고리즘을 사용하면 초당 최소 몇 기가바이트의 압축되지 않은 데이터의 속도로 데이터 압축 해제가 가능합니다. 즉, 이 쿼리는 단일 서버에서 초당 약 수십억 행의 속도로 처리될 수 있습니다. 이 속도는 정말 실제로 달성됩니다. + +### CPU {#cpu} + +쿼리를 수행하려면 많은 행을 처리해야 하므로 별도의 행이 아닌 전체 벡터에 대한 모든 연산을 디스패치하거나 쿼리 엔진을 구현하여 디스패치 비용이 거의 들지 않습니다. 반쯤 괜찮은 디스크 하위 시스템에서 이렇게 하지 않으면 쿼리 인터프리터가 불가피하게 CPU를 정지시킵니다. 데이터를 열에 저장하고 가능한 경우 열별로 처리하는 것이 좋습니다. + +이를 수행하기위한 두가지 방법이 있습니다. + +1. 벡터 엔진. 모든 연산은 별도의 값 대신 벡터에 대해 작성됩니다. 즉, 작업을 자주 호출할 필요가 없으며 파견 비용도 무시할 수 있습니다. 작업 코드에는 최적화된 내부 주기가 포함되어 있습니다. +2. 코드 생성. 쿼리에 대해 생성된 코드에는 모든 간접 호출이 있습니다. + +이것은 단순한 쿼리를 실행할 때 의미가 없기 때문에 "일반" 데이터베이스에서는 수행되지 않습니다. 그러나 예외가 있습니다. 예를 들어 MemSQL은 코드 생성을 사용하여 SQL 쿼리를 처리할 때 대기 시간을 줄입니다. (비교되게, 분석 DBMS는 대기 시간이 아닌 처리량 최적화가 필요합니다.) + +CPU 효율성을 위해 쿼리 언어는 선언적(SQL 또는 MDX)이거나 최소한 벡터(J, K)여야 합니다. 쿼리는 최적화를 허용하는 암시적 루프만 포함해야 합니다. 
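As a concrete illustration of the single-column aggregation discussed in the I/O section above ("count records per advertising platform"), a query of the following shape reads just one small column; the table and column names (`hits`, `AdvEngineID`) are hypothetical and used only for this sketch:

``` sql
-- A column-oriented engine only has to read AdvEngineID for this query,
-- so I/O is proportional to one small column rather than to whole rows.
SELECT AdvEngineID, count() AS records
FROM hits
GROUP BY AdvEngineID
ORDER BY records DESC;
```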
+ +{## [원문](https://clickhouse.com/docs/en/) ##} diff --git a/docs/ru/operations/server-configuration-parameters/settings.md b/docs/ru/operations/server-configuration-parameters/settings.md index d2cc133e0c9..5d667ef8238 100644 --- a/docs/ru/operations/server-configuration-parameters/settings.md +++ b/docs/ru/operations/server-configuration-parameters/settings.md @@ -105,7 +105,7 @@ ClickHouse проверяет условия для `min_part_size` и `min_part ```xml - + ``` @@ -118,7 +118,7 @@ ClickHouse проверяет условия для `min_part_size` и `min_part 00112233445566778899aabbccddeeff - + 1 diff --git a/docs/ru/sql-reference/functions/geo/h3.md b/docs/ru/sql-reference/functions/geo/h3.md index 8f7b98f0a45..78e7bf2fa86 100644 --- a/docs/ru/sql-reference/functions/geo/h3.md +++ b/docs/ru/sql-reference/functions/geo/h3.md @@ -4,11 +4,11 @@ toc_title: "Функции для работы с индексами H3" # Функции для работы с индексами H3 {#h3index} -[H3](https://eng.uber.com/h3/) — это система геокодирования, которая делит поверхность Земли на равные шестигранные ячейки. Система поддерживает иерархию (вложенность) ячеек, т.е. каждый "родительский" шестигранник может быть поделен на семь одинаковых вложенных "дочерних" шестигранников, и так далее. +[H3](https://eng.uber.com/h3/) — это система геокодирования, которая делит поверхность Земли на равные шестиугольные ячейки. Система поддерживает иерархию (вложенность) ячеек, т.е. каждый "родительский" шестиугольник может быть поделен на семь одинаковых вложенных "дочерних" шестиугольников, и так далее. Уровень вложенности называется "разрешением" и может принимать значение от `0` до `15`, где `0` соответствует "базовым" ячейкам самого верхнего уровня (наиболее крупным). -Для каждой точки, имеющей широту и долготу, можно получить 64-битный индекс H3, соответствующий номеру шестигранной ячейки, где эта точка находится. +Для каждой точки, имеющей широту и долготу, можно получить 64-битный индекс H3, соответствующий номеру шестриугольной ячейки, где эта точка находится. Индексы H3 используются, в основном, для геопозиционирования и расчета расстояний. @@ -24,7 +24,7 @@ h3IsValid(h3index) **Параметр** -- `h3index` — идентификатор шестигранника. Тип данных: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `h3index` — идентификатор шестриугольника. Тип данных: [UInt64](../../../sql-reference/data-types/int-uint.md). **Возвращаемые значения** @@ -61,7 +61,7 @@ h3GetResolution(h3index) **Параметр** -- `h3index` — идентификатор шестигранника. Тип данных: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `h3index` — идентификатор шестиугольника. Тип данных: [UInt64](../../../sql-reference/data-types/int-uint.md). **Возвращаемые значения** @@ -88,7 +88,7 @@ SELECT h3GetResolution(639821929606596015) AS resolution; ## h3EdgeAngle {#h3edgeangle} -Рассчитывает средний размер стороны шестигранника [H3](#h3index) в градусах. +Рассчитывает средний размер стороны шестиугольника [H3](#h3index) в градусах. **Синтаксис** @@ -102,7 +102,7 @@ h3EdgeAngle(resolution) **Возвращаемое значение** -- Средняя длина стороны шестигранника [H3](#h3index) в градусах. Тип данных: [Float64](../../../sql-reference/data-types/float.md). +- Средняя длина стороны шестиугольника [H3](#h3index) в градусах. Тип данных: [Float64](../../../sql-reference/data-types/float.md). **Пример** @@ -122,7 +122,7 @@ SELECT h3EdgeAngle(10) AS edgeAngle; ## h3EdgeLengthM {#h3edgelengthm} -Рассчитывает средний размер стороны шестигранника [H3](#h3index) в метрах. 
+Рассчитывает средний размер стороны шестиугольника [H3](#h3index) в метрах. **Синтаксис** @@ -136,7 +136,7 @@ h3EdgeLengthM(resolution) **Возвращаемое значение** -- Средняя длина стороны шестигранника H3 в метрах, тип — [Float64](../../../sql-reference/data-types/float.md). +- Средняя длина стороны шестиугольника H3 в метрах, тип — [Float64](../../../sql-reference/data-types/float.md). **Пример** @@ -172,7 +172,7 @@ geoToH3(lon, lat, resolution) **Возвращаемые значения** -- Порядковый номер шестигранника. +- Порядковый номер шестиугольника. - 0 в случае ошибки. Тип данных: [UInt64](../../../sql-reference/data-types/int-uint.md). @@ -195,7 +195,7 @@ SELECT geoToH3(37.79506683, 55.71290588, 15) AS h3Index; ## h3ToGeo {#h3togeo} -Возвращает географические координаты долготы и широты, соответствующие указанному [H3](#h3index)-индексу. +Возвращает географические координаты долготы и широты центра шестиугольника, соответствующие указанному [H3](#h3index)-индексу. **Синтаксис** @@ -265,7 +265,7 @@ SELECT h3ToGeoBoundary(644325524701193974) AS coordinates; ## h3kRing {#h3kring} -Возвращает [H3](#h3index)-индексы шестигранников в радиусе `k` от данного в произвольном порядке. +Возвращает [H3](#h3index)-индексы шестиугольника в радиусе `k` от данного в произвольном порядке. **Синтаксис** @@ -275,7 +275,7 @@ h3kRing(h3index, k) **Аргументы** -- `h3index` — идентификатор шестигранника. Тип данных: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `h3index` — идентификатор шестиугольника. Тип данных: [UInt64](../../../sql-reference/data-types/int-uint.md). - `k` — радиус. Тип данных: [целое число](../../../sql-reference/data-types/int-uint.md) **Возвращаемые значения** @@ -607,7 +607,7 @@ h3IsResClassIII(index) **Параметр** -- `index` — порядковый номер шестигранника. Тип: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — порядковый номер шестиугольника. Тип: [UInt64](../../../sql-reference/data-types/int-uint.md). **Возвращаемые значения** @@ -644,7 +644,7 @@ h3IsPentagon(index) **Параметр** -- `index` — порядковый номер шестигранника. Тип: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — порядковый номер шестиугольника. Тип: [UInt64](../../../sql-reference/data-types/int-uint.md). **Возвращаемые значения** diff --git a/docs/ru/sql-reference/functions/logical-functions.md b/docs/ru/sql-reference/functions/logical-functions.md index 6ba55dca30f..ac4e226b2d2 100644 --- a/docs/ru/sql-reference/functions/logical-functions.md +++ b/docs/ru/sql-reference/functions/logical-functions.md @@ -70,7 +70,7 @@ SELECT and(NULL, 1, 10, -2); **Синтаксис** ``` sql -and(val1, val2...) +or(val1, val2...) ``` Чтобы вычислять функцию `or` по короткой схеме, используйте настройку [short_circuit_function_evaluation](../../operations/settings/settings.md#short-circuit-function-evaluation). Если настройка включена, то выражение `vali` вычисляется только для строк, где условие `((NOT val1) AND (NOT val2) AND ... AND (NOT val{i-1}))` верно. Например, при выполнении запроса `SELECT or(number = 0, intDiv(1, number) != 0) FROM numbers(10)` не будет сгенерировано исключение из-за деления на ноль. 
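As a usage sketch of the short-circuit behaviour documented above for `or` (the query and the setting name come from the surrounding text; the setting value shown is assumed to be one of its allowed options):

```sql
-- With short-circuit evaluation enabled, intDiv(1, number) is evaluated
-- only for rows where the first condition (number = 0) is false, so no
-- division-by-zero exception is thrown for the row with number = 0.
SET short_circuit_function_evaluation = 'enable';

SELECT or(number = 0, intDiv(1, number) != 0)
FROM numbers(10);
```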
diff --git a/docs/ru/sql-reference/statements/create/table.md b/docs/ru/sql-reference/statements/create/table.md index 6601276d573..8567a0ff2db 100644 --- a/docs/ru/sql-reference/statements/create/table.md +++ b/docs/ru/sql-reference/statements/create/table.md @@ -246,6 +246,46 @@ CREATE TABLE codec_example ENGINE = MergeTree() ``` +### Кодеки шифрования {#create-query-encryption-codecs} + +Эти кодеки не сжимают данные, вместо этого они зашифровывают данные на диске. Воспользоваться кодеками можно, только когда ключ шифрования задан параметрами [шифрования](../../../operations/server-configuration-parameters/settings.md#server-settings-encryption). Обратите внимание: ставить кодеки шифрования имеет смысл в самый конец цепочки кодеков, потому что зашифрованные данные, как правило, нельзя сжать релевантным образом. + +Кодеки шифрования: + +- `CODEC('AES-128-GCM-SIV')` — Зашифровывает данные с помощью AES-128 в режиме [RFC 8452](https://tools.ietf.org/html/rfc8452) GCM-SIV. +- `CODEC('AES-256-GCM-SIV')` — Зашифровывает данные с помощью AES-256 в режиме GCM-SIV. + +Эти кодеки используют фиксированный одноразовый ключ шифрования. Таким образом, это детерминированное шифрование. Оно совместимо с поддерживающими дедупликацию движками, в частности, [ReplicatedMergeTree](../../../engines/table-engines/mergetree-family/replication.md). Однако у шифрования имеется недостаток: если дважды зашифровать один и тот же блок данных, текст на выходе получится одинаковым, и злоумышленник, у которого есть доступ к диску, заметит эту эквивалентность (при этом доступа к содержимому он не получит). + +!!! attention "Внимание" + Большинство движков, включая семейство `MergeTree`, создают на диске индексные файлы, не применяя кодеки. А значит, в том случае, если зашифрованный столбец индексирован, на диске отобразится незашифрованный текст. + +!!! attention "Внимание" + Если вы выполняете запрос SELECT с упоминанием конкретного значения в зашифрованном столбце (например, при использовании секции WHERE), это значение может появиться в [system.query_log](../../../operations/system-tables/query_log.md). Рекомендуем отключить логирование. + +**Пример** + +```sql +CREATE TABLE mytable +( + x String Codec(AES_128_GCM_SIV) +) +ENGINE = MergeTree ORDER BY x; +``` + +!!!note "Замечание" + Если необходимо применить сжатие, это нужно явно прописать в запросе. Без этого будет выполнено только шифрование данных. + +**Пример** + +```sql +CREATE TABLE mytable +( + x String Codec(Delta, LZ4, AES_128_GCM_SIV) +) +ENGINE = MergeTree ORDER BY x; +``` + ## Временные таблицы {#temporary-tables} ClickHouse поддерживает временные таблицы со следующими характеристиками: diff --git a/docs/ru/sql-reference/statements/create/user.md b/docs/ru/sql-reference/statements/create/user.md index f6248d97ba9..5c82424892b 100644 --- a/docs/ru/sql-reference/statements/create/user.md +++ b/docs/ru/sql-reference/statements/create/user.md @@ -43,7 +43,7 @@ CREATE USER [IF NOT EXISTS | OR REPLACE] name1 [ON CLUSTER cluster_name1] - `HOST ANY` — Пользователь может подключиться с любого хоста. Используется по умолчанию. - `HOST LOCAL` — Пользователь может подключиться только локально. - `HOST NAME 'fqdn'` — Хост задается через FQDN. Например, `HOST NAME 'mysite.com'`. -- `HOST NAME REGEXP 'regexp'` — Позволяет использовать регулярные выражения [pcre](http://www.pcre.org/), чтобы задать хосты. Например, `HOST NAME REGEXP '.*\.mysite\.com'`. 
+- `HOST REGEXP 'regexp'` — Позволяет использовать регулярные выражения [pcre](http://www.pcre.org/), чтобы задать хосты. Например, `HOST REGEXP '.*\.mysite\.com'`. - `HOST LIKE 'template'` — Позволяет использовать оператор [LIKE](../../functions/string-search-functions.md#function-like) для фильтрации хостов. Например, `HOST LIKE '%'` эквивалентен `HOST ANY`; `HOST LIKE '%.mysite.com'` разрешает подключение со всех хостов в домене `mysite.com`. Также, чтобы задать хост, вы можете использовать `@` вместе с именем пользователя. Примеры: diff --git a/docs/tools/blog.py b/docs/tools/blog.py index bfc8c0908e9..e4fb6f77865 100644 --- a/docs/tools/blog.py +++ b/docs/tools/blog.py @@ -62,7 +62,7 @@ def build_for_lang(lang, args): strict=True, theme=theme_cfg, nav=blog_nav, - copyright='©2016–2021 ClickHouse, Inc.', + copyright='©2016–2022 ClickHouse, Inc.', use_directory_urls=True, repo_name='ClickHouse/ClickHouse', repo_url='https://github.com/ClickHouse/ClickHouse/', @@ -97,10 +97,6 @@ def build_for_lang(lang, args): with open(os.path.join(args.blog_output_dir, lang, 'rss.xml'), 'w') as f: f.write(rss_template.render({'config': raw_config})) - # TODO: AMP for blog - # if not args.skip_amp: - # amp.build_amp(lang, args, cfg) - logging.info(f'Finished building {lang} blog') except exceptions.ConfigurationError as e: diff --git a/docs/zh/faq/general/columnar-database.md b/docs/zh/faq/general/columnar-database.md deleted file mode 120000 index b7557b62010..00000000000 --- a/docs/zh/faq/general/columnar-database.md +++ /dev/null @@ -1 +0,0 @@ -../../../en/faq/general/columnar-database.md \ No newline at end of file diff --git a/docs/zh/faq/general/columnar-database.md b/docs/zh/faq/general/columnar-database.md new file mode 100644 index 00000000000..185deaa7406 --- /dev/null +++ b/docs/zh/faq/general/columnar-database.md @@ -0,0 +1,25 @@ +--- +title: 什么是列存储数据库? +toc_hidden: true +toc_priority: 101 +--- + +# 什么是列存储数据库? {#what-is-a-columnar-database} + +列存储数据库独立存储每个列的数据。这只允许从磁盘读取任何给定查询中使用的列的数据。其代价是,影响整行的操作会按比例变得更昂贵。列存储数据库的同义词是面向列的数据库管理系统。ClickHouse就是这样一个典型的例子。 + +列存储数据库的主要优点是: + +- 查询只使用许多列其中的少数列。 +— 聚合对大量数据的查询。 +— 按列压缩。 + +下面是构建报表时传统的面向行系统和柱状数据库之间的区别: + +**传统行存储** +!(传统行存储)(https://clickhouse.com/docs/en/images/row-oriented.gif) + +**列存储** +!(列存储)(https://clickhouse.com/docs/en/images/column-oriented.gif) + +列存储数据库是分析应用程序的首选,因为它允许在一个表中有许多列以防万一,但不会在读取查询执行时为未使用的列付出代价。面向列的数据库是为大数据处理而设计的,因为和数据仓库一样,它们通常使用分布式的低成本硬件集群来提高吞吐量。ClickHouse结合了[分布式](../../engines/table-engines/special/distributed.md)和[复制式](../../engines/table-engines/mergetree-family/replication.md)两类表。 \ No newline at end of file diff --git a/docs/zh/faq/general/dbms-naming.md b/docs/zh/faq/general/dbms-naming.md deleted file mode 120000 index 0df856af0ca..00000000000 --- a/docs/zh/faq/general/dbms-naming.md +++ /dev/null @@ -1 +0,0 @@ -../../../en/faq/general/dbms-naming.md \ No newline at end of file diff --git a/docs/zh/faq/general/dbms-naming.md b/docs/zh/faq/general/dbms-naming.md new file mode 100644 index 00000000000..8d4353f9322 --- /dev/null +++ b/docs/zh/faq/general/dbms-naming.md @@ -0,0 +1,17 @@ +--- +title: "\u201CClickHouse\u201D 有什么含义?" +toc_hidden: true +toc_priority: 10 +--- + +# “ClickHouse” 有什么含义? 
{#what-does-clickhouse-mean} + +它是“**点击**流”和“数据**仓库**”的组合。它来自于Yandex最初的用例。在Metrica网站上,ClickHouse本应该保存人们在互联网上的所有点击记录,现在它仍然在做这项工作。你可以在[ClickHouse history](../../introduction/history.md)页面上阅读更多关于这个用例的信息。 + +这个由两部分组成的意思有两个结果: + +- 唯一正确的写“Click**H** house”的方式是用大写H。 +- 如果需要缩写,请使用“**CH**”。由于一些历史原因,缩写CK在中国也很流行,主要是因为中文中最早的一个关于ClickHouse的演讲使用了这种形式。 + +!!! info “有趣的事实” + 多年后ClickHouse闻名于世, 这种命名方法:结合各有深意的两个词被赞扬为最好的数据库命名方式, 卡内基梅隆大学数据库副教授[Andy Pavlo做的研究](https://www.cs.cmu.edu/~pavlo/blog/2020/03/on-naming-a-database-management-system.html) 。ClickHouse与Postgres共同获得“史上最佳数据库名”奖。 diff --git a/docs/zh/faq/general/index.md b/docs/zh/faq/general/index.md deleted file mode 120000 index 5ff33ccb360..00000000000 --- a/docs/zh/faq/general/index.md +++ /dev/null @@ -1 +0,0 @@ -../../../en/faq/general/index.md \ No newline at end of file diff --git a/docs/zh/faq/general/index.md b/docs/zh/faq/general/index.md new file mode 100644 index 00000000000..65312a48d8f --- /dev/null +++ b/docs/zh/faq/general/index.md @@ -0,0 +1,27 @@ +--- +title: ClickHouse 有关常见问题 +toc_hidden_folder: true +toc_priority: 1 +toc_title: General +--- + +# ClickHouse 有关常见问题 {#general-questions} + +常见问题: + +- [什么是 ClickHouse?](../../index.md#what-is-clickhouse) +- [为何 ClickHouse 如此迅捷?](../../faq/general/why-clickhouse-is-so-fast.md) +- [谁在使用 ClickHouse?](../../faq/general/who-is-using-clickhouse.md) +- [“ClickHouse” 有什么含义?](../../faq/general/dbms-naming.md) +- [ “Не тормозит” 有什么含义?](../../faq/general/ne-tormozit.md) +- [什么是 OLAP?](../../faq/general/olap.md) +- [什么是列存储数据库?](../../faq/general/columnar-database.md) +- [为何不使用 MapReduce等技术?](../../faq/general/mapreduce.md) +- [我如何为 ClickHouse贡献代码?](../../faq/general/how-do-i-contribute-code-to-clickhouse.md) + + + +!!! info "没找到您需要的内容?" + 请查阅 [其他 F.A.Q. 类别](../../faq/index.md) 或者从左侧导航栏浏览其他文档 + +{## [原始文档](https://clickhouse.com/docs/en/faq/general/) ##} diff --git a/docs/zh/faq/general/mapreduce.md b/docs/zh/faq/general/mapreduce.md deleted file mode 120000 index 49b79ad4841..00000000000 --- a/docs/zh/faq/general/mapreduce.md +++ /dev/null @@ -1 +0,0 @@ -../../../en/faq/general/mapreduce.md \ No newline at end of file diff --git a/docs/zh/faq/general/mapreduce.md b/docs/zh/faq/general/mapreduce.md new file mode 100644 index 00000000000..f70ca8a2583 --- /dev/null +++ b/docs/zh/faq/general/mapreduce.md @@ -0,0 +1,13 @@ +--- +title: 为何不使用 MapReduce等技术? +toc_hidden: true +toc_priority: 110 +--- + +# 为何不使用 MapReduce等技术? 
{#why-not-use-something-like-mapreduce} + +我们可以将MapReduce这样的系统称为分布式计算系统,其中的reduce操作是基于分布式排序的。这个领域中最常见的开源解决方案是[Apache Hadoop](http://hadoop.apache.org)。Yandex使用其内部解决方案YT。 + +这些系统不适合用于在线查询,因为它们的延迟很大。换句话说,它们不能被用作网页界面的后端。这些类型的系统对于实时数据更新并不是很有用。如果操作的结果和所有中间结果(如果有的话)都位于单个服务器的内存中,那么分布式排序就不是执行reduce操作的最佳方式,这通常是在线查询的情况。在这种情况下,哈希表是执行reduce操作的最佳方式。优化map-reduce任务的一种常见方法是使用内存中的哈希表进行预聚合(部分reduce)。用户手动执行此优化。在运行简单的map-reduce任务时,分布式排序是导致性能下降的主要原因之一。 + +大多数MapReduce实现允许你在集群中执行任意代码。但是声明性查询语言更适合于OLAP,以便快速运行实验。例如,Hadoop有Hive和Pig。还可以考虑使用Cloudera Impala或Shark(已经过时了)来支持Spark,以及Spark SQL、Presto和Apache Drill。与专门的系统相比,运行这些任务的性能是非常不理想的,但是相对较高的延迟使得使用这些系统作为web界面的后端是不现实的。 diff --git a/docs/zh/faq/index.md b/docs/zh/faq/index.md index dd29d73a013..9887d2c6c0a 100644 --- a/docs/zh/faq/index.md +++ b/docs/zh/faq/index.md @@ -19,6 +19,7 @@ toc_priority: 76 - [什么是 OLAP?](../faq/general/olap.md) - [什么是列存储数据库?](../faq/general/columnar-database.md) - [为何不使用 MapReduce等技术?](../faq/general/mapreduce.md) + - [我如何为 ClickHouse贡献代码?](../faq/general/how-do-i-contribute-code-to-clickhouse.md) - **[应用案例](../faq/use-cases/index.md)** - [我能把 ClickHouse 作为时序数据库来使用吗?](../faq/use-cases/time-series.md) - [我能把 ClickHouse 作为 key-value 键值存储吗?](../faq/use-cases/key-value.md) diff --git a/docs/zh/faq/use-cases/time-series.md b/docs/zh/faq/use-cases/time-series.md deleted file mode 120000 index 55cbcfc243f..00000000000 --- a/docs/zh/faq/use-cases/time-series.md +++ /dev/null @@ -1 +0,0 @@ -../../../en/faq/use-cases/time-series.md \ No newline at end of file diff --git a/docs/zh/faq/use-cases/time-series.md b/docs/zh/faq/use-cases/time-series.md new file mode 100644 index 00000000000..045179c8c6e --- /dev/null +++ b/docs/zh/faq/use-cases/time-series.md @@ -0,0 +1,21 @@ +--- +title: 我能把 ClickHouse 当做时序数据库来使用吗? +toc_hidden: true +toc_priority: 101 +--- + +# 我能把 ClickHouse 当做时序数据库来使用吗? 
{#can-i-use-clickhouse-as-a-time-series-database} + +ClickHouse是一个通用的数据存储解决方案[OLAP](../../faq/general/olap.md)的工作负载,而有许多专门的时间序列数据库管理系统。然而,ClickHouse的[专注于查询执行速度](../../faq/general/why-clickhouse-is-so-fast.md)使得它在许多情况下的性能优于专门的系统。关于这个话题有很多独立的基准,所以我们不打算在这里进行论述。相反,让我们将重点放在ClickHouse的重要功能(如果这是你的用例)上。 + + + +首先,有 **[specialized codecs](../../sql-reference/statements/create/table.md#create-query-specialized-codecs)**,这是典型的时间序列。无论是常见的算法,如“DoubleDelta”和“Gorilla”,或特定的ClickHouse 数据类型如“T64”。 + + + +其次,时间序列查询通常只访问最近的数据,比如一天或一周以前的数据。使用具有快速nVME/SSD驱动器和高容量HDD驱动器的服务器是有意义的。ClickHouse [TTL](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes)特性允许配置在快速硬盘上保持新鲜的热数据,并随着数据的老化逐渐移动到较慢的硬盘上。如果您的需求需要,也可以汇总或删除更旧的数据。 + + + +尽管这与ClickHouse存储和处理原始数据的理念相违背,但你可以使用[materialized views](../../sql-reference/statements/create/view.md)来适应更紧迫的延迟或成本需求。 \ No newline at end of file diff --git a/docs/zh/operations/requirements.md b/docs/zh/operations/requirements.md index c3013f738a2..964d7aa34f4 100644 --- a/docs/zh/operations/requirements.md +++ b/docs/zh/operations/requirements.md @@ -1,59 +1,59 @@ --- toc_priority: 44 -toc_title: "要求" +toc_title: "必备条件" --- -# 要求 {#requirements} +# 必备条件 {#requirements} ## CPU {#cpu} -对于从预构建的deb包进行安装,请使用具有x86_64架构并支持SSE4.2指令的CPU。 要使用不支持SSE4.2或具有AArch64或PowerPC64LE体系结构的处理器运行ClickHouse,您应该从源代码构建ClickHouse。 +如果您使用预编译的DEB/RPM包安装ClickHouse,请使用支持SSE4.2指令集的x86_64架构的CPU。如果需要在不支持SSE4.2指令集的CPU上,或者在AArch64(ARM)和PowerPC64LE(IBM Power)架构上运行ClickHouse,您应该从源码编译ClickHouse。 -ClickHouse实现并行数据处理并使用所有可用的硬件资源。 在选择处理器时,考虑到ClickHouse在具有大量内核但时钟速率较低的配置中的工作效率要高于具有较少内核和较高时钟速率的配置。 例如,具有2600MHz的16核心优于具有3600MHz的8核心。 +ClickHouse实现了并行数据处理,处理时会使用所有的可用资源。在选择处理器时,请注意:ClickHouse在具有大量计算核、时钟频率稍低的平台上比计算核少、时钟频率高的平台上效率更高。例如,ClickHouse在16核 2.6GHz的CPU上运行速度高于8核 3.6GHz的CPU。 -建议使用 **睿频加速** 和 **超线程** 技术。 它显着提高了典型工作负载的性能。 +建议使用 **睿频加速** 和 **超线程** 技术。 它显着提高了正常工作负载的性能。 ## RAM {#ram} -我们建议使用至少4GB的RAM来执行重要的查询。 ClickHouse服务器可以使用少得多的RAM运行,但它需要处理查询的内存。 +我们建议使用至少4GB的内存来执行重要的查询。 ClickHouse服务器可以使用很少的内存运行,但它需要一定量的内存用于处理查询。 -RAM所需的体积取决于: +ClickHouse所需内存取决于: -- 查询的复杂性。 -- 查询中处理的数据量。 +- 查询的复杂程度。 +- 查询处理的数据量。 -要计算所需的RAM体积,您应该估计临时数据的大小 [GROUP BY](../sql-reference/statements/select/group-by.md#select-group-by-clause), [DISTINCT](../sql-reference/statements/select/distinct.md#select-distinct), [JOIN](../sql-reference/statements/select/join.md#select-join) 和您使用的其他操作。 +要计算所需的内存大小,您应该考虑用于[GROUP BY](../sql-reference/statements/select/group-by.md#select-group-by-clause)、[DISTINCT](../sql-reference/statements/select/distinct.md#select-distinct)、[JOIN](../sql-reference/statements/select/join.md#select-join) 和其他操作所需的临时数据量。 -ClickHouse可以使用外部存储器来存储临时数据。看 [在外部存储器中分组](../sql-reference/statements/select/group-by.md#select-group-by-in-external-memory) 有关详细信息。 +ClickHouse可以使用外部存储器来存储临时数据。详情请见[在外部存储器中分组](../sql-reference/statements/select/group-by.md#select-group-by-in-external-memory)。 ## 交换文件 {#swap-file} -禁用生产环境的交换文件。 +请在生产环境禁用交换文件。 ## 存储子系统 {#storage-subsystem} 您需要有2GB的可用磁盘空间来安装ClickHouse。 -数据所需的存储量应单独计算。 评估应包括: +数据所需的存储空间应单独计算。预估存储容量时请考虑: -- 估计数据量。 +- 数据量 - 您可以采取数据的样本并从中获取行的平均大小。 然后将该值乘以计划存储的行数。 + 您可以对数据进行采样并计算每行的平均占用空间。然后将该值乘以计划存储的行数。 -- 数据压缩系数。 +- 数据压缩比 - 要估计数据压缩系数,请将数据的样本加载到ClickHouse中,并将数据的实际大小与存储的表的大小进行比较。 例如,点击流数据通常被压缩6-10倍。 + 要计算数据压缩比,请将样本数据写入ClickHouse,并将原始数据大小与ClickHouse实际存储的数据进行比较。例如,用户点击行为的原始数据压缩比通常为6-10。 -要计算要存储的最终数据量,请将压缩系数应用于估计的数据量。 如果计划将数据存储在多个副本中,则将估计的量乘以副本数。 +请将原始数据的大小除以压缩比来获得实际所需存储的大小。如果您打算将数据存放于几个副本中,请将存储容量乘上副本数。 ## 网络 {#network} -如果可能的话,使用10G或更高级别的网络。 +如果可能的话,请使用10G或更高级别的网络。 -网络带宽对于处理具有大量中间结果数据的分布式查询至关重要。 
此外,网络速度会影响复制过程。 +网络带宽对于处理具有大量中间结果数据的分布式查询至关重要。此外,网络速度会影响复制过程。 ## 软件 {#software} -ClickHouse主要是为Linux系列操作系统开发的。 推荐的Linux发行版是Ubuntu。 `tzdata` 软件包应安装在系统中。 +ClickHouse主要是为Linux系列操作系统开发的。推荐的Linux发行版是Ubuntu。您需要检查`tzdata`(对于Ubuntu)软件包是否在安装ClickHouse之前已经安装。 -ClickHouse也可以在其他操作系统系列中工作。 查看详细信息 [开始](../getting-started/index.md) 文档的部分。 +ClickHouse也可以在其他操作系统系列中工作。详情请查看[开始](../getting-started/index.md)。 diff --git a/docs/zh/operations/settings/settings-users.md b/docs/zh/operations/settings/settings-users.md index ae75dddab58..d89b880328a 100644 --- a/docs/zh/operations/settings/settings-users.md +++ b/docs/zh/operations/settings/settings-users.md @@ -1,5 +1,5 @@ --- -machine_translated: true +machine_translated: false machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd toc_priority: 63 toc_title: "\u7528\u6237\u8BBE\u7F6E" @@ -7,12 +7,12 @@ toc_title: "\u7528\u6237\u8BBE\u7F6E" # 用户设置 {#user-settings} -该 `users` 一节 `user.xml` 配置文件包含用户设置。 +`user.xml` 中的 `users` 配置段包含了用户配置 -!!! note "信息" +!!! note "提示" ClickHouse还支持 [SQL驱动的工作流](../access-rights.md#access-control) 用于管理用户。 我们建议使用它。 -的结构 `users` 科: +`users` 配置段的结构: ``` xml @@ -43,21 +43,21 @@ toc_title: "\u7528\u6237\u8BBE\u7F6E" ``` -### 用户名称/密码 {#user-namepassword} +### user_name/password {#user-namepassword} 密码可以以明文或SHA256(十六进制格式)指定。 -- 以明文形式分配密码 (**不推荐**),把它放在一个 `password` 元素。 +- 以明文形式分配密码 (**不推荐**),把它放在一个 `password` 配置段中。 例如, `qwerty`. 密码可以留空。 -- 要使用其SHA256散列分配密码,请将其放置在 `password_sha256_hex` 元素。 +- 要使用SHA256加密后的密码,请将其放置在 `password_sha256_hex` 配置段。 例如, `65e84be33532fb784c48129675f9eff3a682b27168c0ea744b2cf58ee02337c5`. - 如何从shell生成密码的示例: + 从shell生成加密密码的示例: PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha256sum | tr -d '-' @@ -65,19 +65,19 @@ toc_title: "\u7528\u6237\u8BBE\u7F6E" -- 为了与MySQL客户端兼容,密码可以在双SHA1哈希中指定。 放进去 `password_double_sha1_hex` 元素。 +- 为了与MySQL客户端兼容,密码可以设置为双SHA1哈希加密, 请将其放置在 `password_double_sha1_hex` 配置段。 例如, `08b4a0f1de6ad37da17359e592c8d74788a83eb0`. - 如何从shell生成密码的示例: + 从shell生成密码的示例: PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha1sum | tr -d '-' | xxd -r -p | sha1sum | tr -d '-' 结果的第一行是密码。 第二行是相应的双SHA1哈希。 -### 访问管理 {#access_management-user-setting} +### access_management {#access_management-user-setting} -此设置启用禁用使用SQL驱动 [访问控制和帐户管理](../access-rights.md#access-control) 对于用户。 +此设置可为用户启用或禁用 SQL-driven [访问控制和帐户管理](../access-rights.md#access-control) 。 可能的值: @@ -86,42 +86,42 @@ toc_title: "\u7528\u6237\u8BBE\u7F6E" 默认值:0。 -### 用户名称/网络 {#user-namenetworks} +### user_name/networks {#user-namenetworks} -用户可以从中连接到ClickHouse服务器的网络列表。 +用户访问来源列表 列表中的每个元素都可以具有以下形式之一: -- `` — IP address or network mask. +- `` — IP地址或网络掩码 例: `213.180.204.3`, `10.0.0.1/8`, `10.0.0.1/255.255.255.0`, `2a02:6b8::3`, `2a02:6b8::3/64`, `2a02:6b8::3/ffff:ffff:ffff:ffff::`. -- `` — Hostname. +- `` — 域名 示例: `example01.host.ru`. - 要检查访问,将执行DNS查询,并将所有返回的IP地址与对等地址进行比较。 + 为检查访问,将执行DNS查询,并将所有返回的IP地址与对端地址进行比较。 -- `` — Regular expression for hostnames. +- `` — 域名的正则表达式. 示例, `^example\d\d-\d\d-\d\.host\.ru$` - 要检查访问,a [DNS PTR查询](https://en.wikipedia.org/wiki/Reverse_DNS_lookup) 对对等体地址执行,然后应用指定的正则表达式。 然后,对PTR查询的结果执行另一个DNS查询,并将所有接收到的地址与对等地址进行比较。 我们强烈建议正则表达式以$结尾。 + 为检查访问,[DNS PTR查询](https://en.wikipedia.org/wiki/Reverse_DNS_lookup) 对对端地址执行,然后应用指定的正则表达式。 然后,以PTR查询的结果执行另一个DNS查询,并将所有接收到的地址与对端地址进行比较. 我们强烈建议正则表达式以$结尾. DNS请求的所有结果都将被缓存,直到服务器重新启动。 **例** -要从任何网络打开用户的访问权限,请指定: +要开启任意来源网络的访问, 请指定: ``` xml ::/0 ``` !!! 
warning "警告" - 从任何网络开放访问是不安全的,除非你有一个防火墙正确配置或服务器没有直接连接到互联网。 + 从任何网络开放访问是不安全的,除非你有一个正确配置的防火墙, 或者服务器没有直接连接到互联网。 -若要仅从本地主机打开访问权限,请指定: +若要限定本机访问, 请指定: ``` xml ::1 @@ -130,22 +130,21 @@ DNS请求的所有结果都将被缓存,直到服务器重新启动。 ### user_name/profile {#user-nameprofile} -您可以为用户分配设置配置文件。 设置配置文件在单独的部分配置 `users.xml` 文件 有关详细信息,请参阅 [设置配置文件](settings-profiles.md). +您可以为用户分配设置配置文件。 设置配置文件在`users.xml` 中有单独的配置段. 有关详细信息,请参阅 [设置配置文件](settings-profiles.md). -### 用户名称/配额 {#user-namequota} +### user_name/quota {#user-namequota} -配额允许您在一段时间内跟踪或限制资源使用情况。 配额在配置 `quotas` -一节 `users.xml` 配置文件。 +配额允许您在一段时间内跟踪或限制资源使用情况。 配额在`users.xml` 中的 `quotas` 配置段下. 您可以为用户分配配额。 有关配额配置的详细说明,请参阅 [配额](../quotas.md#quotas). -### 用户名/数据库 {#user-namedatabases} +### user_name/databases {#user-namedatabases} -在本节中,您可以限制ClickHouse返回的行 `SELECT` 由当前用户进行的查询,从而实现基本的行级安全性。 +在本配置段中,您可以限制ClickHouse中由当前用户进行的 `SELECT` 查询所返回的行,从而实现基本的行级安全性。 **示例** -以下配置强制该用户 `user1` 只能看到的行 `table1` 作为结果 `SELECT` 查询,其中的值 `id` 场是1000。 +以下配置使用户 `user1` 通过SELECT查询只能得到table1中id为1000的行 ``` xml @@ -159,6 +158,6 @@ DNS请求的所有结果都将被缓存,直到服务器重新启动。 ``` -该 `filter` 可以是导致任何表达式 [UInt8](../../sql-reference/data-types/int-uint.md)-键入值。 它通常包含比较和逻辑运算符。 从行 `database_name.table1` 其中,不会为此用户返回为0的筛选结果。 过滤是不兼容的 `PREWHERE` 操作和禁用 `WHERE→PREWHERE` 优化。 +该 `filter` 可以是[UInt8](../../sql-reference/data-types/int-uint.md)编码的任何表达式。 它通常包含比较和逻辑运算符, 当filter返回0时, database_name.table1 的该行结果将不会返回给用户.过滤不兼容 `PREWHERE` 操作并禁用 `WHERE→PREWHERE` 优化。 [原始文章](https://clickhouse.com/docs/en/operations/settings/settings_users/) diff --git a/docs/zh/sql-reference/statements/alter/index.md b/docs/zh/sql-reference/statements/alter/index.md index 2f60dbb262e..f7d983cab4e 100644 --- a/docs/zh/sql-reference/statements/alter/index.md +++ b/docs/zh/sql-reference/statements/alter/index.md @@ -1,23 +1,74 @@ --- -toc_hidden_folder: true -toc_priority: 42 -toc_title: INDEX +toc_priority: 35 +toc_title: ALTER --- -# 操作数据跳过索引 {#manipulations-with-data-skipping-indices} +## ALTER {#query_language_queries_alter} -可以使用以下操作: +大多数 `ALTER TABLE` 查询修改表设置或数据: -- `ALTER TABLE [db].name ADD INDEX name expression TYPE type GRANULARITY value [FIRST|AFTER name]` - 向表元数据添加索引描述。 +- [COLUMN](../../../sql-reference/statements/alter/column.md) +- [PARTITION](../../../sql-reference/statements/alter/partition.md) +- [DELETE](../../../sql-reference/statements/alter/delete.md) +- [UPDATE](../../../sql-reference/statements/alter/update.md) +- [ORDER BY](../../../sql-reference/statements/alter/order-by.md) +- [INDEX](../../../sql-reference/statements/alter/index/index.md) +- [CONSTRAINT](../../../sql-reference/statements/alter/constraint.md) +- [TTL](../../../sql-reference/statements/alter/ttl.md) -- `ALTER TABLE [db].name DROP INDEX name` - 从表元数据中删除索引描述并从磁盘中删除索引文件。 +!!! note "备注" + 大多数 `ALTER TABLE` 查询只支持[\*MergeTree](../../../engines/table-engines/mergetree-family/index.md)表,以及[Merge](../../../engines/table-engines/special/merge.md)和[Distributed](../../../engines/table-engines/special/distributed.md)。 -- `ALTER TABLE [db.]table MATERIALIZE INDEX name IN PARTITION partition_name` - 查询在分区`partition_name`中重建二级索引`name`。 操作为[mutation](../../../sql-reference/statements/alter/index.md#mutations). +这些 `ALTER` 语句操作视图: -前两个命令是轻量级的,它们只更改元数据或删除文件。 +- [ALTER TABLE ... MODIFY QUERY](../../../sql-reference/statements/alter/view.md) — 修改一个 [Materialized view](../create/view.md#materialized) 结构. +- [ALTER LIVE VIEW](../../../sql-reference/statements/alter/view.md#alter-live-view) — 刷新一个 [Live view](../create/view.md#live-view). 
-Also, they are replicated, syncing indices metadata via ZooKeeper. -此外,它们会被复制,会通过ZooKeeper同步索引元数据。 +这些 `ALTER` 语句修改与基于角色的访问控制相关的实体: -!!! note "注意" -索引操作仅支持具有以下特征的表 [`*MergeTree`](../../../engines/table-engines/mergetree-family/mergetree.md)引擎 (包括[replicated](../../../engines/table-engines/mergetree-family/replication.md)). +- [USER](../../../sql-reference/statements/alter/user.md) +- [ROLE](../../../sql-reference/statements/alter/role.md) +- [QUOTA](../../../sql-reference/statements/alter/quota.md) +- [ROW POLICY](../../../sql-reference/statements/alter/row-policy.md) +- [SETTINGS PROFILE](../../../sql-reference/statements/alter/settings-profile.md) + +[ALTER TABLE ... MODIFY COMMENT](../../../sql-reference/statements/alter/comment.md) 语句添加、修改或删除表中的注释,无论之前是否设置过。 + +## Mutations 突变 {#mutations} + +用来操作表数据的ALTER查询是通过一种叫做“突变”的机制来实现的,最明显的是[ALTER TABLE … DELETE](../../../sql-reference/statements/alter/delete.md)和[ALTER TABLE … UPDATE](../../../sql-reference/statements/alter/update.md)。它们是异步的后台进程,类似于[MergeTree](../../../engines/table-engines/mergetree-family/index.md)表的合并,产生新的“突变”版本的部件。 + + + +对于 `*MergeTree` 表,通过重写整个数据部分来执行突变。没有原子性——一旦突变的部件准备好,部件就会被替换,并且在突变期间开始执行的 `SELECT` 查询将看到来自已经突变的部件的数据,以及来自尚未突变的部件的数据。 + + + +突变完全按照它们的产生顺序排列,并按此顺序应用于每个部分。突变还与“INSERT INTO”查询进行部分排序:在提交突变之前插入表中的数据将被突变,而在此之后插入的数据将不会被突变。注意,突变不会以任何方式阻止插入。 + + + +突变查询在添加突变条目后立即返回(对于复制表到ZooKeeper,对于非复制表到文件系统)。突变本身使用系统配置文件设置异步执行。要跟踪突变的进程,可以使用[`system.mutations`](../../../operations/system-tables/mutations.md#system_tables-mutations) 表。成功提交的变异将继续执行,即使ClickHouse服务器重新启动。没有办法回滚突变一旦提交,但如果突变卡住了,它可以取消与[`KILL MUTATION`](../../../sql-reference/statements/misc.md#kill-mutation) 查询。 + + + +完成突变的条目不会立即删除(保留条目的数量由 `finished_mutations_to_keep` 存储引擎参数决定)。删除旧的突变条目。 + +## ALTER 查询的同步性 {#synchronicity-of-alter-queries} + + +对于非复制表,所有的 `ALTER` 查询都是同步执行的。对于复制表,查询只是向“ZooKeeper”添加相应动作的指令,动作本身会尽快执行。但是,查询可以等待所有副本上的这些操作完成。 + +对于所有的“ALTER”查询,您可以使用[replication_alter_partitions_sync](../../../operations/settings/settings.md#replication-alter-partitions-sync)设置等待。 + +通过[replication_wait_for_inactive_replica_timeout](../../../operations/settings/settings.md#replication-wait-for-inactive-replica-timeout]设置,可以指定不活动的副本执行所有 `ALTER` 查询的等待时间(以秒为单位)。 + + + +!!! info "备注" + + 对于所有的 `ALTER` 查询,如果 `replication_alter_partitions_sync = 2` 和一些副本的不激活时间超过时间(在 `replication_wait_for_inactive_replica_timeout` 设置中指定),那么将抛出一个异常 `UNFINISHED`。 + + + +对于 `ALTER TABLE ... 
UPDATE|DELETE` 查询由 [mutations_sync](../../../operations/settings/settings.md#mutations_sync) 设置定义的同步度。 diff --git a/docs/zh/sql-reference/statements/alter/index/index.md b/docs/zh/sql-reference/statements/alter/index/index.md deleted file mode 120000 index b754fa71b83..00000000000 --- a/docs/zh/sql-reference/statements/alter/index/index.md +++ /dev/null @@ -1 +0,0 @@ -../../../../../en/sql-reference/statements/alter/index/index.md \ No newline at end of file diff --git a/docs/zh/sql-reference/statements/alter/index/index.md b/docs/zh/sql-reference/statements/alter/index/index.md new file mode 100644 index 00000000000..16f48e55b2f --- /dev/null +++ b/docs/zh/sql-reference/statements/alter/index/index.md @@ -0,0 +1,23 @@ +--- +toc_hidden_folder: true +toc_priority: 42 +toc_title: INDEX +--- + +# 操作数据跳过索引 {#manipulations-with-data-skipping-indices} + +可以使用以下操作: + +- `ALTER TABLE [db].name ADD INDEX name expression TYPE type GRANULARITY value [FIRST|AFTER name]` - 向表元数据添加索引描述。 + +- `ALTER TABLE [db].name DROP INDEX name` - 从表元数据中删除索引描述并从磁盘中删除索引文件。 + +- `ALTER TABLE [db.]table MATERIALIZE INDEX name IN PARTITION partition_name` - 查询在分区`partition_name`中重建二级索引`name`。 操作为[mutation](../../../../sql-reference/statements/alter/index.md#mutations). + +前两个命令是轻量级的,它们只更改元数据或删除文件。 + +Also, they are replicated, syncing indices metadata via ZooKeeper. +此外,它们会被复制,会通过ZooKeeper同步索引元数据。 + +!!! note "注意" + 索引操作仅支持具有以下特征的表 [`*MergeTree`](../../../../engines/table-engines/mergetree-family/mergetree.md)引擎 (包括[replicated](../../../../engines/table-engines/mergetree-family/replication.md)). diff --git a/docs/zh/sql-reference/statements/exists.md b/docs/zh/sql-reference/statements/exists.md deleted file mode 120000 index d69e8224fe6..00000000000 --- a/docs/zh/sql-reference/statements/exists.md +++ /dev/null @@ -1 +0,0 @@ -../../../en/sql-reference/statements/exists.md \ No newline at end of file diff --git a/docs/zh/sql-reference/statements/exists.md b/docs/zh/sql-reference/statements/exists.md new file mode 100644 index 00000000000..69b26fea918 --- /dev/null +++ b/docs/zh/sql-reference/statements/exists.md @@ -0,0 +1,12 @@ +--- +toc_priority: 45 +toc_title: EXISTS +--- + +# EXISTS 语句 {#exists-statement} + +``` sql +EXISTS [TEMPORARY] [TABLE|DICTIONARY] [db.]name [INTO OUTFILE filename] [FORMAT format] +``` + +返回一个单独的 `UInt8`类型的列,如果表或数据库不存在,则包含一个值 `0`,如果表在指定的数据库中存在,则包含一个值 `1`。 \ No newline at end of file diff --git a/docs/zh/sql-reference/statements/set.md b/docs/zh/sql-reference/statements/set.md deleted file mode 120000 index 02e106afc9f..00000000000 --- a/docs/zh/sql-reference/statements/set.md +++ /dev/null @@ -1 +0,0 @@ -../../../en/sql-reference/statements/set.md \ No newline at end of file diff --git a/docs/zh/sql-reference/statements/set.md b/docs/zh/sql-reference/statements/set.md new file mode 100644 index 00000000000..a9888a7080e --- /dev/null +++ b/docs/zh/sql-reference/statements/set.md @@ -0,0 +1,23 @@ +--- +toc_priority: 50 +toc_title: SET +--- + +# SET 语句 {#query-set} + +``` sql +SET param = value +``` + +给当前会话的 `param` [配置项](../../operations/settings/index.md)赋值。你不能用这样的方式修改[服务器相关设置](../../operations/server-configuration-parameters/index.md)。 + + +您还可以在单个查询中设置指定设置配置文件中的所有值。 + + + +``` sql +SET profile = 'profile-name-from-the-settings-file' +``` + +更多详情, 详见 [配置项](../../operations/settings/settings.md). 
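A short usage sketch of the `EXISTS` and `SET` statements documented above; the specific table, setting value, and profile name are hypothetical examples rather than values from the original text:

```sql
-- Returns a single UInt8 column: 1 if the table exists, otherwise 0.
EXISTS TABLE default.visits;

-- Change a setting for the current session only.
SET max_threads = 8;

-- Apply every value from a settings profile defined in the settings file.
SET profile = 'web';
```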
diff --git a/docs/zh/sql-reference/statements/use.md b/docs/zh/sql-reference/statements/use.md deleted file mode 120000 index 7bdbf049326..00000000000 --- a/docs/zh/sql-reference/statements/use.md +++ /dev/null @@ -1 +0,0 @@ -../../../en/sql-reference/statements/use.md \ No newline at end of file diff --git a/docs/zh/sql-reference/statements/use.md b/docs/zh/sql-reference/statements/use.md new file mode 100644 index 00000000000..41cba58bb9d --- /dev/null +++ b/docs/zh/sql-reference/statements/use.md @@ -0,0 +1,16 @@ +--- +toc_priority: 53 +toc_title: USE +--- + +# USE 语句 {#use} + +``` sql +USE db +``` + +用于设置会话的当前数据库。 + +如果查询语句中没有在表名前面以加点的方式指明数据库名, 则用当前数据库进行搜索。 + +使用 HTTP 协议时无法进行此查询,因为没有会话的概念。 diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt index f897e9812a8..8906d186bfc 100644 --- a/programs/CMakeLists.txt +++ b/programs/CMakeLists.txt @@ -160,7 +160,7 @@ else() message(STATUS "ClickHouse keeper-converter mode: OFF") endif() -if(NOT (MAKE_STATIC_LIBRARIES OR SPLIT_SHARED_LIBRARIES)) +if(NOT (USE_STATIC_LIBRARIES OR SPLIT_SHARED_LIBRARIES)) set(CLICKHOUSE_ONE_SHARED ON) endif() diff --git a/programs/install/Install.cpp b/programs/install/Install.cpp index dd93e0b49ab..f8df823ecb7 100644 --- a/programs/install/Install.cpp +++ b/programs/install/Install.cpp @@ -364,7 +364,9 @@ int mainEntryClickHouseInstall(int argc, char ** argv) "clickhouse-git-import", "clickhouse-compressor", "clickhouse-format", - "clickhouse-extract-from-config" + "clickhouse-extract-from-config", + "clickhouse-keeper", + "clickhouse-keeper-converter", }; for (const auto & tool : tools) diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index d144b4d332e..636ce129d63 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -330,8 +330,6 @@ int Keeper::main(const std::vector & /*args*/) DB::ServerUUID::load(path + "/uuid", log); - const Settings & settings = global_context->getSettingsRef(); - std::string include_from_path = config().getString("include_from", "/etc/metrika.xml"); GlobalThreadPool::initialize( @@ -377,8 +375,8 @@ int Keeper::main(const std::vector & /*args*/) { Poco::Net::ServerSocket socket; auto address = socketBindListen(socket, listen_host, port); - socket.setReceiveTimeout(settings.receive_timeout); - socket.setSendTimeout(settings.send_timeout); + socket.setReceiveTimeout(config().getUInt64("keeper_server.socket_receive_timeout_sec", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC)); + socket.setSendTimeout(config().getUInt64("keeper_server.socket_send_timeout_sec", DBMS_DEFAULT_SEND_TIMEOUT_SEC)); servers->emplace_back( listen_host, port_name, @@ -393,8 +391,8 @@ int Keeper::main(const std::vector & /*args*/) #if USE_SSL Poco::Net::SecureServerSocket socket; auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); - socket.setReceiveTimeout(settings.receive_timeout); - socket.setSendTimeout(settings.send_timeout); + socket.setReceiveTimeout(config().getUInt64("keeper_server.socket_receive_timeout_sec", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC)); + socket.setSendTimeout(config().getUInt64("keeper_server.socket_send_timeout_sec", DBMS_DEFAULT_SEND_TIMEOUT_SEC)); servers->emplace_back( listen_host, secure_port_name, diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index a294857ace8..a6c9a6a4524 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -319,20 +320,26 @@ std::string 
LocalServer::getInitialCreateTableQuery() auto table_name = backQuoteIfNeed(config().getString("table-name", "table")); auto table_structure = config().getString("table-structure", "auto"); - auto data_format = backQuoteIfNeed(config().getString("table-data-format", "TSV")); String table_file; + String format_from_file_name; if (!config().has("table-file") || config().getString("table-file") == "-") { /// Use Unix tools stdin naming convention table_file = "stdin"; + format_from_file_name = FormatFactory::instance().getFormatFromFileDescriptor(STDIN_FILENO); } else { /// Use regular file - table_file = quoteString(config().getString("table-file")); + auto file_name = config().getString("table-file"); + table_file = quoteString(file_name); + format_from_file_name = FormatFactory::instance().getFormatFromFileName(file_name, false); } + auto data_format + = backQuoteIfNeed(config().getString("table-data-format", format_from_file_name.empty() ? "TSV" : format_from_file_name)); + if (table_structure == "auto") table_structure = ""; else diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index a49ccc79b63..98838cb2337 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -967,6 +967,83 @@ if (ThreadFuzzer::instance().isEffective()) }, /* already_loaded = */ false); /// Reload it right now (initial loading) + const auto listen_hosts = getListenHosts(config()); + const auto listen_try = getListenTry(config()); + + if (config().has("keeper_server")) + { +#if USE_NURAFT + //// If we don't have configured connection probably someone trying to use clickhouse-server instead + //// of clickhouse-keeper, so start synchronously. + bool can_initialize_keeper_async = false; + + if (has_zookeeper) /// We have configured connection to some zookeeper cluster + { + /// If we cannot connect to some other node from our cluster then we have to wait our Keeper start + /// synchronously. + can_initialize_keeper_async = global_context->tryCheckClientConnectionToMyKeeperCluster(); + } + /// Initialize keeper RAFT. 
+ global_context->initializeKeeperDispatcher(can_initialize_keeper_async); + FourLetterCommandFactory::registerCommands(*global_context->getKeeperDispatcher()); + + for (const auto & listen_host : listen_hosts) + { + /// TCP Keeper + const char * port_name = "keeper_server.tcp_port"; + createServer( + config(), listen_host, port_name, listen_try, /* start_server: */ false, + servers_to_start_before_tables, + [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(socket, listen_host, port); + socket.setReceiveTimeout(config().getUInt64("keeper_server.socket_receive_timeout_sec", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC)); + socket.setSendTimeout(config().getUInt64("keeper_server.socket_send_timeout_sec", DBMS_DEFAULT_SEND_TIMEOUT_SEC)); + return ProtocolServerAdapter( + listen_host, + port_name, + "Keeper (tcp): " + address.toString(), + std::make_unique( + new KeeperTCPHandlerFactory(*this, false), server_pool, socket)); + }); + + const char * secure_port_name = "keeper_server.tcp_port_secure"; + createServer( + config(), listen_host, secure_port_name, listen_try, /* start_server: */ false, + servers_to_start_before_tables, + [&](UInt16 port) -> ProtocolServerAdapter + { +#if USE_SSL + Poco::Net::SecureServerSocket socket; + auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); + socket.setReceiveTimeout(config().getUInt64("keeper_server.socket_receive_timeout_sec", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC)); + socket.setSendTimeout(config().getUInt64("keeper_server.socket_send_timeout_sec", DBMS_DEFAULT_SEND_TIMEOUT_SEC)); + return ProtocolServerAdapter( + listen_host, + secure_port_name, + "Keeper with secure protocol (tcp_secure): " + address.toString(), + std::make_unique( + new KeeperTCPHandlerFactory(*this, true), server_pool, socket)); +#else + UNUSED(port); + throw Exception{"SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.", + ErrorCodes::SUPPORT_IS_DISABLED}; +#endif + }); + } +#else + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "ClickHouse server built without NuRaft library. Cannot use internal coordination."); +#endif + + } + + for (auto & server : servers_to_start_before_tables) + { + server.start(); + LOG_INFO(log, "Listening for {}", server.getDescription()); + } + auto & access_control = global_context->getAccessControl(); if (config().has("custom_settings_prefixes")) access_control.setCustomSettingsPrefixes(config().getString("custom_settings_prefixes")); @@ -1075,83 +1152,6 @@ if (ThreadFuzzer::instance().isEffective()) /// try set up encryption. There are some errors in config, error will be printed and server wouldn't start. CompressionCodecEncrypted::Configuration::instance().load(config(), "encryption_codecs"); - const auto listen_hosts = getListenHosts(config()); - const auto listen_try = getListenTry(config()); - - if (config().has("keeper_server")) - { -#if USE_NURAFT - //// If we don't have configured connection probably someone trying to use clickhouse-server instead - //// of clickhouse-keeper, so start synchronously. - bool can_initialize_keeper_async = false; - - if (has_zookeeper) /// We have configured connection to some zookeeper cluster - { - /// If we cannot connect to some other node from our cluster then we have to wait our Keeper start - /// synchronously. - can_initialize_keeper_async = global_context->tryCheckClientConnectionToMyKeeperCluster(); - } - /// Initialize keeper RAFT. 
- global_context->initializeKeeperDispatcher(can_initialize_keeper_async); - FourLetterCommandFactory::registerCommands(*global_context->getKeeperDispatcher()); - - for (const auto & listen_host : listen_hosts) - { - /// TCP Keeper - const char * port_name = "keeper_server.tcp_port"; - createServer( - config(), listen_host, port_name, listen_try, /* start_server: */ false, - servers_to_start_before_tables, - [&](UInt16 port) -> ProtocolServerAdapter - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(socket, listen_host, port); - socket.setReceiveTimeout(settings.receive_timeout); - socket.setSendTimeout(settings.send_timeout); - return ProtocolServerAdapter( - listen_host, - port_name, - "Keeper (tcp): " + address.toString(), - std::make_unique( - new KeeperTCPHandlerFactory(*this, false), server_pool, socket)); - }); - - const char * secure_port_name = "keeper_server.tcp_port_secure"; - createServer( - config(), listen_host, secure_port_name, listen_try, /* start_server: */ false, - servers_to_start_before_tables, - [&](UInt16 port) -> ProtocolServerAdapter - { -#if USE_SSL - Poco::Net::SecureServerSocket socket; - auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); - socket.setReceiveTimeout(settings.receive_timeout); - socket.setSendTimeout(settings.send_timeout); - return ProtocolServerAdapter( - listen_host, - secure_port_name, - "Keeper with secure protocol (tcp_secure): " + address.toString(), - std::make_unique( - new KeeperTCPHandlerFactory(*this, true), server_pool, socket)); -#else - UNUSED(port); - throw Exception{"SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.", - ErrorCodes::SUPPORT_IS_DISABLED}; -#endif - }); - } -#else - throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "ClickHouse server built without NuRaft library. Cannot use internal coordination."); -#endif - - } - - for (auto & server : servers_to_start_before_tables) - { - server.start(); - LOG_INFO(log, "Listening for {}", server.getDescription()); - } - SCOPE_EXIT({ /// Stop reloading of the main config. This must be done before `global_context->shutdown()` because /// otherwise the reloading may pass a changed config to some destroyed parts of ContextSharedPart. 
diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h index 4472e975878..0ccf5e85624 100644 --- a/src/Access/Common/AccessType.h +++ b/src/Access/Common/AccessType.h @@ -145,14 +145,14 @@ enum class AccessType M(SYSTEM_RELOAD_EMBEDDED_DICTIONARIES, "RELOAD EMBEDDED DICTIONARIES", GLOBAL, SYSTEM_RELOAD) /* implicitly enabled by the grant SYSTEM_RELOAD_DICTIONARY ON *.* */\ M(SYSTEM_RELOAD, "", GROUP, SYSTEM) \ M(SYSTEM_RESTART_DISK, "SYSTEM RESTART DISK", GLOBAL, SYSTEM) \ - M(SYSTEM_MERGES, "SYSTEM STOP MERGES, SYSTEM START MERGES, STOP_MERGES, START MERGES", TABLE, SYSTEM) \ + M(SYSTEM_MERGES, "SYSTEM STOP MERGES, SYSTEM START MERGES, STOP MERGES, START MERGES", TABLE, SYSTEM) \ M(SYSTEM_TTL_MERGES, "SYSTEM STOP TTL MERGES, SYSTEM START TTL MERGES, STOP TTL MERGES, START TTL MERGES", TABLE, SYSTEM) \ M(SYSTEM_FETCHES, "SYSTEM STOP FETCHES, SYSTEM START FETCHES, STOP FETCHES, START FETCHES", TABLE, SYSTEM) \ M(SYSTEM_MOVES, "SYSTEM STOP MOVES, SYSTEM START MOVES, STOP MOVES, START MOVES", TABLE, SYSTEM) \ M(SYSTEM_DISTRIBUTED_SENDS, "SYSTEM STOP DISTRIBUTED SENDS, SYSTEM START DISTRIBUTED SENDS, STOP DISTRIBUTED SENDS, START DISTRIBUTED SENDS", TABLE, SYSTEM_SENDS) \ - M(SYSTEM_REPLICATED_SENDS, "SYSTEM STOP REPLICATED SENDS, SYSTEM START REPLICATED SENDS, STOP_REPLICATED_SENDS, START REPLICATED SENDS", TABLE, SYSTEM_SENDS) \ + M(SYSTEM_REPLICATED_SENDS, "SYSTEM STOP REPLICATED SENDS, SYSTEM START REPLICATED SENDS, STOP REPLICATED SENDS, START REPLICATED SENDS", TABLE, SYSTEM_SENDS) \ M(SYSTEM_SENDS, "SYSTEM STOP SENDS, SYSTEM START SENDS, STOP SENDS, START SENDS", GROUP, SYSTEM) \ - M(SYSTEM_REPLICATION_QUEUES, "SYSTEM STOP REPLICATION QUEUES, SYSTEM START REPLICATION QUEUES, STOP_REPLICATION_QUEUES, START REPLICATION QUEUES", TABLE, SYSTEM) \ + M(SYSTEM_REPLICATION_QUEUES, "SYSTEM STOP REPLICATION QUEUES, SYSTEM START REPLICATION QUEUES, STOP REPLICATION QUEUES, START REPLICATION QUEUES", TABLE, SYSTEM) \ M(SYSTEM_DROP_REPLICA, "DROP REPLICA", TABLE, SYSTEM) \ M(SYSTEM_SYNC_REPLICA, "SYNC REPLICA", TABLE, SYSTEM) \ M(SYSTEM_RESTART_REPLICA, "RESTART REPLICA", TABLE, SYSTEM) \ diff --git a/src/AggregateFunctions/AggregateFunctionFactory.cpp b/src/AggregateFunctions/AggregateFunctionFactory.cpp index eac761c1a82..347f4607dbf 100644 --- a/src/AggregateFunctions/AggregateFunctionFactory.cpp +++ b/src/AggregateFunctions/AggregateFunctionFactory.cpp @@ -70,11 +70,11 @@ static DataTypes convertLowCardinalityTypesToNested(const DataTypes & types) AggregateFunctionPtr AggregateFunctionFactory::get( const String & name, const DataTypes & argument_types, const Array & parameters, AggregateFunctionProperties & out_properties) const { - auto type_without_low_cardinality = convertLowCardinalityTypesToNested(argument_types); + auto types_without_low_cardinality = convertLowCardinalityTypesToNested(argument_types); /// If one of the types is Nullable, we apply aggregate function combinator "Null". 
- if (std::any_of(type_without_low_cardinality.begin(), type_without_low_cardinality.end(), + if (std::any_of(types_without_low_cardinality.begin(), types_without_low_cardinality.end(), [](const auto & type) { return type->isNullable(); })) { AggregateFunctionCombinatorPtr combinator = AggregateFunctionCombinatorFactory::instance().tryFindSuffix("Null"); @@ -82,10 +82,10 @@ AggregateFunctionPtr AggregateFunctionFactory::get( throw Exception("Logical error: cannot find aggregate function combinator to apply a function to Nullable arguments.", ErrorCodes::LOGICAL_ERROR); - DataTypes nested_types = combinator->transformArguments(type_without_low_cardinality); + DataTypes nested_types = combinator->transformArguments(types_without_low_cardinality); Array nested_parameters = combinator->transformParameters(parameters); - bool has_null_arguments = std::any_of(type_without_low_cardinality.begin(), type_without_low_cardinality.end(), + bool has_null_arguments = std::any_of(types_without_low_cardinality.begin(), types_without_low_cardinality.end(), [](const auto & type) { return type->onlyNull(); }); AggregateFunctionPtr nested_function = getImpl( @@ -97,13 +97,10 @@ AggregateFunctionPtr AggregateFunctionFactory::get( // that are rewritten to AggregateFunctionNothing, in this case // nested_function is nullptr. if (!nested_function || !nested_function->isOnlyWindowFunction()) - { - return combinator->transformAggregateFunction(nested_function, - out_properties, type_without_low_cardinality, parameters); - } + return combinator->transformAggregateFunction(nested_function, out_properties, types_without_low_cardinality, parameters); } - auto with_original_arguments = getImpl(name, type_without_low_cardinality, parameters, out_properties, false); + auto with_original_arguments = getImpl(name, types_without_low_cardinality, parameters, out_properties, false); if (!with_original_arguments) throw Exception("Logical error: AggregateFunctionFactory returned nullptr", ErrorCodes::LOGICAL_ERROR); diff --git a/src/AggregateFunctions/AggregateFunctionIf.cpp b/src/AggregateFunctions/AggregateFunctionIf.cpp index d752900c018..ce71e76de43 100644 --- a/src/AggregateFunctions/AggregateFunctionIf.cpp +++ b/src/AggregateFunctions/AggregateFunctionIf.cpp @@ -40,28 +40,6 @@ public: } }; -/** Given an array of flags, checks if it's all zeros - * When the buffer is all zeros, this is slightly faster than doing a memcmp since doesn't require allocating memory - * When the buffer has values, this is much faster since it avoids visiting all memory (and the allocation and function calls) - */ -static bool ALWAYS_INLINE inline is_all_zeros(const UInt8 * flags, size_t size) -{ - size_t unroll_size = size - size % 8; - size_t i = 0; - while (i < unroll_size) - { - UInt64 v = *reinterpret_cast(&flags[i]); - if (v) - return false; - i += 8; - } - - for (; i < size; ++i) - if (flags[i]) - return false; - - return true; -} /** There are two cases: for single argument and variadic. * Code for single argument is much more efficient. @@ -73,6 +51,7 @@ class AggregateFunctionIfNullUnary final { private: size_t num_arguments; + bool filter_is_nullable = false; /// The name of the nested function, including combinators (i.e. 
*If) /// @@ -92,8 +71,26 @@ private: using Base = AggregateFunctionNullBase>; -public: + inline bool singleFilter(const IColumn ** columns, size_t row_num) const + { + const IColumn * filter_column = columns[num_arguments - 1]; + + if (filter_is_nullable) + { + const ColumnNullable * nullable_column = assert_cast(filter_column); + filter_column = nullable_column->getNestedColumnPtr().get(); + const UInt8 * filter_null_map = nullable_column->getNullMapData().data(); + + return assert_cast(*filter_column).getData()[row_num] && !filter_null_map[row_num]; + } + else + { + return assert_cast(*filter_column).getData()[row_num]; + } + } + +public: String getName() const override { return name; @@ -105,24 +102,17 @@ public: , name(name_) { if (num_arguments == 0) - throw Exception("Aggregate function " + getName() + " require at least one argument", - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - } + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Aggregate function {} require at least one argument", getName()); - static inline bool singleFilter(const IColumn ** columns, size_t row_num, size_t num_arguments) - { - const IColumn * filter_column = columns[num_arguments - 1]; - if (const ColumnNullable * nullable_column = typeid_cast(filter_column)) - filter_column = nullable_column->getNestedColumnPtr().get(); - - return assert_cast(*filter_column).getData()[row_num]; + filter_is_nullable = arguments[num_arguments - 1]->isNullable(); } void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override { const ColumnNullable * column = assert_cast(columns[0]); const IColumn * nested_column = &column->getNestedColumn(); - if (!column->isNullAt(row_num) && singleFilter(columns, row_num, num_arguments)) + if (!column->isNullAt(row_num) && singleFilter(columns, row_num)) { this->setFlag(place); this->nested_function->add(this->nestedPlace(place), &nested_column, row_num, arena); @@ -136,29 +126,41 @@ public: const IColumn * columns_param[] = {&column->getNestedColumn()}; const IColumn * filter_column = columns[num_arguments - 1]; - if (const ColumnNullable * nullable_column = typeid_cast(filter_column)) - filter_column = nullable_column->getNestedColumnPtr().get(); - if constexpr (result_is_nullable) + + const UInt8 * filter_values = nullptr; + const UInt8 * filter_null_map = nullptr; + + if (filter_is_nullable) { - /// We need to check if there is work to do as otherwise setting the flag would be a mistake, - /// it would mean that the return value would be the default value of the nested type instead of NULL - if (is_all_zeros(assert_cast(filter_column)->getData().data(), batch_size)) - return; + const ColumnNullable * nullable_column = assert_cast(filter_column); + filter_column = nullable_column->getNestedColumnPtr().get(); + filter_null_map = nullable_column->getNullMapData().data(); } + filter_values = assert_cast(filter_column)->getData().data(); + /// Combine the 2 flag arrays so we can call a simplified version (one check vs 2) /// Note that now the null map will contain 0 if not null and not filtered, or 1 for null or filtered (or both) - const auto * filter_flags = assert_cast(filter_column)->getData().data(); + auto final_nulls = std::make_unique(batch_size); - for (size_t i = 0; i < batch_size; ++i) - final_nulls[i] = (!!null_map[i]) | (!filter_flags[i]); + + if (filter_null_map) + for (size_t i = 0; i < batch_size; ++i) + final_nulls[i] = (!!null_map[i]) | (!filter_values[i]) | (!!filter_null_map[i]); + else + for (size_t 
i = 0; i < batch_size; ++i) + final_nulls[i] = (!!null_map[i]) | (!filter_values[i]); + + if constexpr (result_is_nullable) + { + if (!memoryIsByte(final_nulls.get(), batch_size, 1)) + this->setFlag(place); + else + return; /// No work to do. + } this->nested_function->addBatchSinglePlaceNotNull( batch_size, this->nestedPlace(place), columns_param, final_nulls.get(), arena, -1); - - if constexpr (result_is_nullable) - if (!memoryIsByte(null_map, batch_size, 1)) - this->setFlag(place); } #if USE_EMBEDDED_COMPILER @@ -367,10 +369,14 @@ AggregateFunctionPtr AggregateFunctionIf::getOwnNullAdapter( const AggregateFunctionPtr & nested_function, const DataTypes & arguments, const Array & params, const AggregateFunctionProperties & properties) const { - bool return_type_is_nullable = !properties.returns_default_when_only_null && getReturnType()->canBeInsideNullable(); - size_t nullable_size = std::count_if(arguments.begin(), arguments.end(), [](const auto & element) { return element->isNullable(); }); - return_type_is_nullable &= nullable_size != 1 || !arguments.back()->isNullable(); /// If only condition is nullable. we should non-nullable type. - bool serialize_flag = return_type_is_nullable || properties.returns_default_when_only_null; + assert(!arguments.empty()); + + /// Nullability of the last argument (condition) does not affect the nullability of the result (NULL is processed as false). + /// For other arguments it is as usual (at least one is NULL then the result is NULL if possible). + bool return_type_is_nullable = !properties.returns_default_when_only_null && getReturnType()->canBeInsideNullable() + && std::any_of(arguments.begin(), arguments.end() - 1, [](const auto & element) { return element->isNullable(); }); + + bool need_to_serialize_flag = return_type_is_nullable || properties.returns_default_when_only_null; if (arguments.size() <= 2 && arguments.front()->isNullable()) { @@ -380,7 +386,7 @@ AggregateFunctionPtr AggregateFunctionIf::getOwnNullAdapter( } else { - if (serialize_flag) + if (need_to_serialize_flag) return std::make_shared>(nested_function->getName(), nested_func, arguments, params); else return std::make_shared>(nested_function->getName(), nested_func, arguments, params); @@ -394,7 +400,7 @@ AggregateFunctionPtr AggregateFunctionIf::getOwnNullAdapter( } else { - if (serialize_flag) + if (need_to_serialize_flag) return std::make_shared>(nested_function, arguments, params); else return std::make_shared>(nested_function, arguments, params); diff --git a/src/AggregateFunctions/AggregateFunctionSimpleState.h b/src/AggregateFunctions/AggregateFunctionSimpleState.h index d32d9a4f806..d63d8b71b8c 100644 --- a/src/AggregateFunctions/AggregateFunctionSimpleState.h +++ b/src/AggregateFunctions/AggregateFunctionSimpleState.h @@ -17,15 +17,11 @@ class AggregateFunctionSimpleState final : public IAggregateFunctionHelper(arguments_, params_) , nested_func(nested_) - , arguments(arguments_) - , params(params_) { } @@ -35,18 +31,19 @@ public: { DataTypeCustomSimpleAggregateFunction::checkSupportedFunctions(nested_func); - // Need to make a clone because it'll be customized. - auto storage_type = DataTypeFactory::instance().get(nested_func->getReturnType()->getName()); - + // Need to make a clone to avoid recursive reference. + auto storage_type_out = DataTypeFactory::instance().get(nested_func->getReturnType()->getName()); // Need to make a new function with promoted argument types because SimpleAggregates requires arg_type = return_type. 
AggregateFunctionProperties properties; auto function - = AggregateFunctionFactory::instance().get(nested_func->getName(), {storage_type}, nested_func->getParameters(), properties); + = AggregateFunctionFactory::instance().get(nested_func->getName(), {storage_type_out}, nested_func->getParameters(), properties); + // Need to make a clone because it'll be customized. + auto storage_type_arg = DataTypeFactory::instance().get(nested_func->getReturnType()->getName()); DataTypeCustomNamePtr custom_name - = std::make_unique(function, DataTypes{nested_func->getReturnType()}, params); - storage_type->setCustomization(std::make_unique(std::move(custom_name), nullptr)); - return storage_type; + = std::make_unique(function, DataTypes{nested_func->getReturnType()}, parameters); + storage_type_arg->setCustomization(std::make_unique(std::move(custom_name), nullptr)); + return storage_type_arg; } bool isVersioned() const override diff --git a/src/AggregateFunctions/AggregateFunctionState.h b/src/AggregateFunctions/AggregateFunctionState.h index 98fcfa83d67..f4f55835c93 100644 --- a/src/AggregateFunctions/AggregateFunctionState.h +++ b/src/AggregateFunctions/AggregateFunctionState.h @@ -20,13 +20,12 @@ class AggregateFunctionState final : public IAggregateFunctionHelper(arguments_, params_) - , nested_func(nested_), arguments(arguments_), params(params_) {} + , nested_func(nested_) + {} String getName() const override { diff --git a/src/AggregateFunctions/AggregateFunctionSumMap.h b/src/AggregateFunctions/AggregateFunctionSumMap.h index 7e661a92c5b..295258cd8cf 100644 --- a/src/AggregateFunctions/AggregateFunctionSumMap.h +++ b/src/AggregateFunctions/AggregateFunctionSumMap.h @@ -226,7 +226,7 @@ public: { // FIXME why is storing NearestFieldType not enough, and we // have to check for decimals again here? - UInt32 scale = static_cast &>(key_column).getData().getScale(); + UInt32 scale = static_cast &>(key_column).getScale(); it = merged_maps.find(DecimalField(key, scale)); } else @@ -251,7 +251,7 @@ public: if constexpr (is_decimal) { - UInt32 scale = static_cast &>(key_column).getData().getScale(); + UInt32 scale = static_cast &>(key_column).getScale(); merged_maps.emplace(DecimalField(key, scale), std::move(new_values)); } else diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3432f97b168..1fbb29868b0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -11,7 +11,7 @@ if(COMPILER_PIPE) else() set(MAX_COMPILER_MEMORY 1500) endif() -if(MAKE_STATIC_LIBRARIES) +if(USE_STATIC_LIBRARIES) set(MAX_LINKER_MEMORY 3500) else() set(MAX_LINKER_MEMORY 2500) @@ -193,7 +193,7 @@ add_subdirectory(Common/Config) set (all_modules) macro(add_object_library name common_path) - if (MAKE_STATIC_LIBRARIES OR NOT SPLIT_SHARED_LIBRARIES) + if (USE_STATIC_LIBRARIES OR NOT SPLIT_SHARED_LIBRARIES) add_headers_and_sources(dbms ${common_path}) else () list (APPEND all_modules ${name}) @@ -253,15 +253,10 @@ if (TARGET ch_contrib::nuraft) endif() set (DBMS_COMMON_LIBRARIES) -# libgcc_s does not provide an implementation of an atomics library. Instead, -# GCC’s libatomic library can be used to supply these when using libgcc_s. 
-if ((NOT USE_LIBCXX) AND COMPILER_CLANG AND OS_LINUX) - list (APPEND DBMS_COMMON_LIBRARIES atomic) -endif() -if (MAKE_STATIC_LIBRARIES OR NOT SPLIT_SHARED_LIBRARIES) +if (USE_STATIC_LIBRARIES OR NOT SPLIT_SHARED_LIBRARIES) add_library (dbms STATIC ${dbms_headers} ${dbms_sources}) - target_link_libraries (dbms PRIVATE libdivide ${DBMS_COMMON_LIBRARIES}) + target_link_libraries (dbms PRIVATE ch_contrib::libdivide ${DBMS_COMMON_LIBRARIES}) if (TARGET ch_contrib::jemalloc) target_link_libraries (dbms PRIVATE ch_contrib::jemalloc) endif() @@ -269,7 +264,7 @@ if (MAKE_STATIC_LIBRARIES OR NOT SPLIT_SHARED_LIBRARIES) else() add_library (dbms SHARED ${dbms_headers} ${dbms_sources}) target_link_libraries (dbms PUBLIC ${all_modules} ${DBMS_COMMON_LIBRARIES}) - target_link_libraries (clickhouse_interpreters PRIVATE libdivide) + target_link_libraries (clickhouse_interpreters PRIVATE ch_contrib::libdivide) if (TARGET ch_contrib::jemalloc) target_link_libraries (clickhouse_interpreters PRIVATE ch_contrib::jemalloc) endif() @@ -316,7 +311,7 @@ target_link_libraries (clickhouse_common_io PUBLIC common ch_contrib::double_conversion - dragonbox_to_chars + ch_contrib::dragonbox_to_chars ) # Use X86 AVX2/AVX512 instructions to accelerate filter operations @@ -325,6 +320,7 @@ set_source_files_properties( Columns/ColumnsCommon.cpp Columns/ColumnVector.cpp Columns/ColumnDecimal.cpp + Columns/ColumnString.cpp PROPERTIES COMPILE_FLAGS "${X86_INTRINSICS_FLAGS}") target_link_libraries(clickhouse_common_io PUBLIC ch_contrib::re2_st) @@ -346,9 +342,11 @@ if (TARGET ch_contrib::cpuid) target_link_libraries(clickhouse_common_io PRIVATE ch_contrib::cpuid) endif() +dbms_target_link_libraries(PUBLIC ch_contrib::abseil_swiss_tables) + # Make dbms depend on roaring instead of clickhouse_common_io so that roaring itself can depend on clickhouse_common_io # That way we we can redirect malloc/free functions avoiding circular dependencies -dbms_target_link_libraries(PUBLIC roaring) +dbms_target_link_libraries(PUBLIC ch_contrib::roaring) if (TARGET ch_contrib::rdkafka) dbms_target_link_libraries(PRIVATE ch_contrib::rdkafka ch_contrib::cppkafka) @@ -508,6 +506,7 @@ if (ENABLE_NLP) dbms_target_link_libraries (PUBLIC ch_contrib::stemmer) dbms_target_link_libraries (PUBLIC ch_contrib::wnb) dbms_target_link_libraries (PUBLIC ch_contrib::lemmagen) + dbms_target_link_libraries (PUBLIC ch_contrib::nlp_data) endif() if (TARGET ch_contrib::bzip2) @@ -522,7 +521,7 @@ if (TARGET ch_contrib::rapidjson) dbms_target_link_libraries(PRIVATE ch_contrib::rapidjson) endif() -dbms_target_link_libraries(PUBLIC consistent-hashing) +dbms_target_link_libraries(PUBLIC ch_contrib::consistent_hashing) include ("${ClickHouse_SOURCE_DIR}/cmake/add_check.cmake") @@ -568,3 +567,4 @@ if (ENABLE_TESTS) add_check(unit_tests_dbms) endif () + diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 766cd09883a..840ed3d1012 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -552,6 +553,25 @@ void ClientBase::initLogsOutputStream() } } +void ClientBase::updateSuggest(const ASTCreateQuery & ast_create) +{ + std::vector new_words; + + if (ast_create.database) + new_words.push_back(ast_create.getDatabase()); + new_words.push_back(ast_create.getTable()); + + if (ast_create.columns_list && ast_create.columns_list->columns) + { + for (const auto & elem : ast_create.columns_list->columns->children) + { + if (const auto * column = elem->as()) + 
new_words.push_back(column->name); + } + } + + suggest->addWords(std::move(new_words)); +} void ClientBase::processTextAsSingleQuery(const String & full_query) { @@ -565,6 +585,18 @@ void ClientBase::processTextAsSingleQuery(const String & full_query) String query_to_execute; + /// The query is parsed before checking the result because an error does not + /// always mean a problem (e.g. the table may already exist), and it is not a + /// big deal if a suggestion is added even on error, since it is + /// just a suggestion. + if (auto * create = parsed_query->as()) + { + /// Do not update the suggest list until suggestions are ready + /// (this avoids extra complexity) + if (suggest) + updateSuggest(*create); + } + + // An INSERT query may have the data that follow query text. Remove the /// Send part of query without data, because data will be sent separately. auto * insert = parsed_query->as(); @@ -1464,7 +1496,6 @@ void ClientBase::runInteractive() /// Initialize DateLUT here to avoid counting time spent here as query execution time. const auto local_tz = DateLUT::instance().getTimeZone(); - std::optional suggest; suggest.emplace(); if (load_suggestions) { diff --git a/src/Client/ClientBase.h b/src/Client/ClientBase.h index 1926df5afea..89e0770182b 100644 --- a/src/Client/ClientBase.h +++ b/src/Client/ClientBase.h @@ -136,6 +136,8 @@ private: void readArguments(int argc, char ** argv, Arguments & common_arguments, std::vector & external_tables_arguments); void parseAndCheckOptions(OptionsDescription & options_description, po::variables_map & options, Arguments & arguments); + void updateSuggest(const ASTCreateQuery & ast_create); + protected: bool is_interactive = false; /// Use either interactive line editing interface or batch mode. bool is_multiquery = false; @@ -144,6 +146,8 @@ protected: bool echo_queries = false; /// Print queries before execution in batch mode. bool ignore_error = false; /// In case of errors, don't print error message, continue to next query. Only applicable for non-interactive mode. bool print_time_to_stderr = false; /// Output execution time to stderr in batch mode. + + std::optional suggest; bool load_suggestions = false; std::vector queries_files; /// If not empty, queries will be read from these files diff --git a/src/Client/Suggest.cpp b/src/Client/Suggest.cpp index b14af7ba8e9..738c98d2119 100644 --- a/src/Client/Suggest.cpp +++ b/src/Client/Suggest.cpp @@ -29,19 +29,21 @@ namespace ErrorCodes Suggest::Suggest() { /// Keywords may be not up to date with ClickHouse parser.
- words = {"CREATE", "DATABASE", "IF", "NOT", "EXISTS", "TEMPORARY", "TABLE", "ON", "CLUSTER", "DEFAULT", - "MATERIALIZED", "ALIAS", "ENGINE", "AS", "VIEW", "POPULATE", "SETTINGS", "ATTACH", "DETACH", "DROP", - "RENAME", "TO", "ALTER", "ADD", "MODIFY", "CLEAR", "COLUMN", "AFTER", "COPY", "PROJECT", - "PRIMARY", "KEY", "CHECK", "PARTITION", "PART", "FREEZE", "FETCH", "FROM", "SHOW", "INTO", - "OUTFILE", "FORMAT", "TABLES", "DATABASES", "LIKE", "PROCESSLIST", "CASE", "WHEN", "THEN", "ELSE", - "END", "DESCRIBE", "DESC", "USE", "SET", "OPTIMIZE", "FINAL", "DEDUPLICATE", "INSERT", "VALUES", - "SELECT", "DISTINCT", "SAMPLE", "ARRAY", "JOIN", "GLOBAL", "LOCAL", "ANY", "ALL", "INNER", - "LEFT", "RIGHT", "FULL", "OUTER", "CROSS", "USING", "PREWHERE", "WHERE", "GROUP", "BY", - "WITH", "TOTALS", "HAVING", "ORDER", "COLLATE", "LIMIT", "UNION", "AND", "OR", "ASC", - "IN", "KILL", "QUERY", "SYNC", "ASYNC", "TEST", "BETWEEN", "TRUNCATE", "USER", "ROLE", - "PROFILE", "QUOTA", "POLICY", "ROW", "GRANT", "REVOKE", "OPTION", "ADMIN", "EXCEPT", "REPLACE", - "IDENTIFIED", "HOST", "NAME", "READONLY", "WRITABLE", "PERMISSIVE", "FOR", "RESTRICTIVE", "RANDOMIZED", - "INTERVAL", "LIMITS", "ONLY", "TRACKING", "IP", "REGEXP", "ILIKE"}; + addWords({ + "CREATE", "DATABASE", "IF", "NOT", "EXISTS", "TEMPORARY", "TABLE", "ON", "CLUSTER", "DEFAULT", + "MATERIALIZED", "ALIAS", "ENGINE", "AS", "VIEW", "POPULATE", "SETTINGS", "ATTACH", "DETACH", "DROP", + "RENAME", "TO", "ALTER", "ADD", "MODIFY", "CLEAR", "COLUMN", "AFTER", "COPY", "PROJECT", + "PRIMARY", "KEY", "CHECK", "PARTITION", "PART", "FREEZE", "FETCH", "FROM", "SHOW", "INTO", + "OUTFILE", "FORMAT", "TABLES", "DATABASES", "LIKE", "PROCESSLIST", "CASE", "WHEN", "THEN", "ELSE", + "END", "DESCRIBE", "DESC", "USE", "SET", "OPTIMIZE", "FINAL", "DEDUPLICATE", "INSERT", "VALUES", + "SELECT", "DISTINCT", "SAMPLE", "ARRAY", "JOIN", "GLOBAL", "LOCAL", "ANY", "ALL", "INNER", + "LEFT", "RIGHT", "FULL", "OUTER", "CROSS", "USING", "PREWHERE", "WHERE", "GROUP", "BY", + "WITH", "TOTALS", "HAVING", "ORDER", "COLLATE", "LIMIT", "UNION", "AND", "OR", "ASC", + "IN", "KILL", "QUERY", "SYNC", "ASYNC", "TEST", "BETWEEN", "TRUNCATE", "USER", "ROLE", + "PROFILE", "QUOTA", "POLICY", "ROW", "GRANT", "REVOKE", "OPTION", "ADMIN", "EXCEPT", "REPLACE", + "IDENTIFIED", "HOST", "NAME", "READONLY", "WRITABLE", "PERMISSIVE", "FOR", "RESTRICTIVE", "RANDOMIZED", + "INTERVAL", "LIMITS", "ONLY", "TRACKING", "IP", "REGEXP", "ILIKE", + }); } static String getLoadSuggestionQuery(Int32 suggestion_limit, bool basic_suggestion) @@ -124,18 +126,6 @@ void Suggest::load(ContextPtr context, const ConnectionParameters & connection_p } /// Note that keyword suggestions are available even if we cannot load data from server. 
- - std::sort(words.begin(), words.end()); - words_no_case = words; - std::sort(words_no_case.begin(), words_no_case.end(), [](const std::string & str1, const std::string & str2) - { - return std::lexicographical_compare(begin(str1), end(str1), begin(str2), end(str2), [](const char char1, const char char2) - { - return std::tolower(char1) < std::tolower(char2); - }); - }); - - ready = true; }); } @@ -190,8 +180,14 @@ void Suggest::fillWordsFromBlock(const Block & block) const ColumnString & column = typeid_cast(*block.getByPosition(0).column); size_t rows = block.rows(); + + Words new_words; + new_words.reserve(rows); for (size_t i = 0; i < rows; ++i) - words.emplace_back(column.getDataAt(i).toString()); + { + new_words.emplace_back(column.getDataAt(i).toString()); + } + addWords(std::move(new_words)); } template diff --git a/src/Columns/ColumnArray.cpp b/src/Columns/ColumnArray.cpp index c3d326c816f..7a3d2052d4a 100644 --- a/src/Columns/ColumnArray.cpp +++ b/src/Columns/ColumnArray.cpp @@ -1,5 +1,3 @@ -#include // memcpy - #include #include #include @@ -9,12 +7,7 @@ #include #include #include - -#include -#include - #include - #include #include #include @@ -22,6 +15,8 @@ #include #include #include +#include +#include // memcpy namespace DB @@ -127,18 +122,8 @@ size_t ColumnArray::size() const Field ColumnArray::operator[](size_t n) const { - size_t offset = offsetAt(n); - size_t size = sizeAt(n); - - if (size > max_array_size_as_field) - throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Array of size {} is too large to be manipulated as single field, maximum size {}", - size, max_array_size_as_field); - - Array res(size); - - for (size_t i = 0; i < size; ++i) - res[i] = getData()[offset + i]; - + Field res; + get(n, res); return res; } @@ -152,11 +137,12 @@ void ColumnArray::get(size_t n, Field & res) const throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Array of size {} is too large to be manipulated as single field, maximum size {}", size, max_array_size_as_field); - res = Array(size); + res = Array(); Array & res_arr = DB::get(res); + res_arr.reserve(size); for (size_t i = 0; i < size; ++i) - getData().get(offset + i, res_arr[i]); + res_arr.push_back(getData()[offset + i]); } diff --git a/src/Columns/ColumnDecimal.cpp b/src/Columns/ColumnDecimal.cpp index 99085f0f976..4941585f8dd 100644 --- a/src/Columns/ColumnDecimal.cpp +++ b/src/Columns/ColumnDecimal.cpp @@ -32,12 +32,6 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -template class DecimalPaddedPODArray; -template class DecimalPaddedPODArray; -template class DecimalPaddedPODArray; -template class DecimalPaddedPODArray; -template class DecimalPaddedPODArray; - template int ColumnDecimal::compareAt(size_t n, size_t m, const IColumn & rhs_, int) const { @@ -131,19 +125,6 @@ void ColumnDecimal::updateHashFast(SipHash & hash) const template void ColumnDecimal::getPermutation(bool reverse, size_t limit, int , IColumn::Permutation & res) const { -#if 1 /// TODO: perf test - if (data.size() <= std::numeric_limits::max()) - { - PaddedPODArray tmp_res; - permutation(reverse, limit, tmp_res); - - res.resize(tmp_res.size()); - for (size_t i = 0; i < tmp_res.size(); ++i) - res[i] = tmp_res[i]; - return; - } -#endif - permutation(reverse, limit, res); } diff --git a/src/Columns/ColumnDecimal.h b/src/Columns/ColumnDecimal.h index b55083cd671..1a4b06b46e4 100644 --- a/src/Columns/ColumnDecimal.h +++ b/src/Columns/ColumnDecimal.h @@ -1,66 +1,21 @@ #pragma once +#include + +#include +#include +#include +#include +#include +#include 
#include #include #include -#include -#include -#include -#include -#include -#include - -#include namespace DB { -/// PaddedPODArray extended by Decimal scale -template -class DecimalPaddedPODArray : public PaddedPODArray -{ -public: - using Base = PaddedPODArray; - using Base::operator[]; - - DecimalPaddedPODArray(size_t size, UInt32 scale_) - : Base(size), - scale(scale_) - {} - - DecimalPaddedPODArray(const DecimalPaddedPODArray & other) - : Base(other.begin(), other.end()), - scale(other.scale) - {} - - DecimalPaddedPODArray(DecimalPaddedPODArray && other) - { - this->swap(other); - std::swap(scale, other.scale); - } - - DecimalPaddedPODArray & operator=(DecimalPaddedPODArray && other) - { - this->swap(other); - std::swap(scale, other.scale); - return *this; - } - - UInt32 getScale() const { return scale; } - -private: - UInt32 scale; -}; - -/// Prevent implicit template instantiation of DecimalPaddedPODArray for common decimal types - -extern template class DecimalPaddedPODArray; -extern template class DecimalPaddedPODArray; -extern template class DecimalPaddedPODArray; -extern template class DecimalPaddedPODArray; -extern template class DecimalPaddedPODArray; - /// A ColumnVector for Decimals template class ColumnDecimal final : public COWHelper> @@ -72,16 +27,16 @@ private: public: using ValueType = T; using NativeT = typename T::NativeType; - using Container = DecimalPaddedPODArray; + using Container = PaddedPODArray; private: ColumnDecimal(const size_t n, UInt32 scale_) - : data(n, scale_), + : data(n), scale(scale_) {} ColumnDecimal(const ColumnDecimal & src) - : data(src.data), + : data(src.data.begin(), src.data.end()), scale(src.scale) {} @@ -195,7 +150,7 @@ public: const T & getElement(size_t n) const { return data[n]; } T & getElement(size_t n) { return data[n]; } - UInt32 getScale() const {return scale;} + UInt32 getScale() const { return scale; } protected: Container data; @@ -206,8 +161,8 @@ protected: { size_t s = data.size(); res.resize(s); - for (U i = 0; i < s; ++i) - res[i] = i; + for (size_t i = 0; i < s; ++i) + res[i] = static_cast(i); auto sort_end = res.end(); if (limit && limit < s) diff --git a/src/Columns/ColumnMap.cpp b/src/Columns/ColumnMap.cpp index e595525d9e8..ef5d96da0f7 100644 --- a/src/Columns/ColumnMap.cpp +++ b/src/Columns/ColumnMap.cpp @@ -4,8 +4,6 @@ #include #include #include -#include -#include #include #include #include @@ -64,8 +62,9 @@ MutableColumnPtr ColumnMap::cloneResized(size_t new_size) const Field ColumnMap::operator[](size_t n) const { - auto array = DB::get((*nested)[n]); - return Map(std::make_move_iterator(array.begin()), std::make_move_iterator(array.end())); + Field res; + get(n, res); + return res; } void ColumnMap::get(size_t n, Field & res) const @@ -74,11 +73,12 @@ void ColumnMap::get(size_t n, Field & res) const size_t offset = offsets[n - 1]; size_t size = offsets[n] - offsets[n - 1]; - res = Map(size); + res = Map(); auto & map = DB::get(res); + map.reserve(size); for (size_t i = 0; i < size; ++i) - getNestedData().get(offset + i, map[i]); + map.push_back(getNestedData()[offset + i]); } bool ColumnMap::isDefaultAt(size_t n) const diff --git a/src/Columns/ColumnTuple.cpp b/src/Columns/ColumnTuple.cpp index d667b264d55..0310eca7adc 100644 --- a/src/Columns/ColumnTuple.cpp +++ b/src/Columns/ColumnTuple.cpp @@ -9,9 +9,6 @@ #include #include #include -#include -#include -#include #include @@ -101,17 +98,21 @@ MutableColumnPtr ColumnTuple::cloneResized(size_t new_size) const Field ColumnTuple::operator[](size_t n) const { - 
return collections::map(columns, [n] (const auto & column) { return (*column)[n]; }); + Field res; + get(n, res); + return res; } void ColumnTuple::get(size_t n, Field & res) const { const size_t tuple_size = columns.size(); - Tuple tuple(tuple_size); - for (const auto i : collections::range(0, tuple_size)) - columns[i]->get(n, tuple[i]); - res = tuple; + res = Tuple(); + Tuple & res_tuple = DB::get(res); + res_tuple.reserve(tuple_size); + + for (size_t i = 0; i < tuple_size; ++i) + res_tuple.push_back((*columns[i])[n]); } bool ColumnTuple::isDefaultAt(size_t n) const @@ -483,7 +484,7 @@ void ColumnTuple::getExtremes(Field & min, Field & max) const Tuple min_tuple(tuple_size); Tuple max_tuple(tuple_size); - for (const auto i : collections::range(0, tuple_size)) + for (size_t i = 0; i < tuple_size; ++i) columns[i]->getExtremes(min_tuple[i], max_tuple[i]); min = min_tuple; @@ -504,7 +505,7 @@ bool ColumnTuple::structureEquals(const IColumn & rhs) const if (tuple_size != rhs_tuple->columns.size()) return false; - for (const auto i : collections::range(0, tuple_size)) + for (size_t i = 0; i < tuple_size; ++i) if (!columns[i]->structureEquals(*rhs_tuple->columns[i])) return false; diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 4fdd4fd8142..2e68ec3e3cb 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -609,7 +609,8 @@ M(638, SNAPPY_UNCOMPRESS_FAILED) \ M(639, SNAPPY_COMPRESS_FAILED) \ M(640, NO_HIVEMETASTORE) \ - M(641, NUMBER_OF_DIMENSIONS_MISMATHED) \ + M(641, CANNOT_APPEND_TO_FILE) \ + M(642, NUMBER_OF_DIMENSIONS_MISMATHED) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Common/FrequencyHolder.h b/src/Common/FrequencyHolder.h new file mode 100644 index 00000000000..a98ae0452d3 --- /dev/null +++ b/src/Common/FrequencyHolder.h @@ -0,0 +1,252 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int FILE_DOESNT_EXIST; +} + +/// FrequencyHolder class is responsible for storing and loading dictionaries +/// needed for text classification functions: +/// +/// 1. detectLanguageUnknown +/// 2. detectCharset +/// 3. detectTonality +/// 4. 
detectProgrammingLanguage + +class FrequencyHolder +{ + +public: + struct Language + { + String name; + HashMap map; + }; + + struct Encoding + { + String name; + String lang; + HashMap map; + }; + +public: + using Map = HashMap; + using Container = std::vector; + using EncodingMap = HashMap; + using EncodingContainer = std::vector; + + static FrequencyHolder & getInstance() + { + static FrequencyHolder instance; + return instance; + } + + void loadEncodingsFrequency() + { + Poco::Logger * log = &Poco::Logger::get("EncodingsFrequency"); + + LOG_TRACE(log, "Loading embedded charset frequencies"); + + auto resource = getResource("charset.zst"); + if (resource.empty()) + throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded charset frequencies"); + + String line; + UInt16 bigram; + Float64 frequency; + String charset_name; + + auto buf = std::make_unique(resource.data(), resource.size()); + ZstdInflatingReadBuffer in(std::move(buf)); + + while (!in.eof()) + { + readString(line, in); + in.ignore(); + + if (line.empty()) + continue; + + ReadBufferFromString buf_line(line); + + // Start loading a new charset + if (line.starts_with("// ")) + { + // Skip "// " + buf_line.ignore(3); + readString(charset_name, buf_line); + + /* In our dictionary we have lines with form: _ + * If we need to find language of data, we return + * If we need to find charset of data, we return . + */ + size_t sep = charset_name.find('_'); + + Encoding enc; + enc.lang = charset_name.substr(0, sep); + enc.name = charset_name.substr(sep + 1); + encodings_freq.push_back(std::move(enc)); + } + else + { + readIntText(bigram, buf_line); + buf_line.ignore(); + readFloatText(frequency, buf_line); + + encodings_freq.back().map[bigram] = frequency; + } + } + LOG_TRACE(log, "Charset frequencies was added, charsets count: {}", encodings_freq.size()); + } + + + void loadEmotionalDict() + { + Poco::Logger * log = &Poco::Logger::get("EmotionalDict"); + LOG_TRACE(log, "Loading embedded emotional dictionary"); + + auto resource = getResource("tonality_ru.zst"); + if (resource.empty()) + throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded emotional dictionary"); + + String line; + String word; + Float64 tonality; + size_t count = 0; + + auto buf = std::make_unique(resource.data(), resource.size()); + ZstdInflatingReadBuffer in(std::move(buf)); + + while (!in.eof()) + { + readString(line, in); + in.ignore(); + + if (line.empty()) + continue; + + ReadBufferFromString buf_line(line); + + readStringUntilWhitespace(word, buf_line); + buf_line.ignore(); + readFloatText(tonality, buf_line); + + StringRef ref{string_pool.insert(word.data(), word.size()), word.size()}; + emotional_dict[ref] = tonality; + ++count; + } + LOG_TRACE(log, "Emotional dictionary was added. 
Word count: {}", std::to_string(count)); + } + + + void loadProgrammingFrequency() + { + Poco::Logger * log = &Poco::Logger::get("ProgrammingFrequency"); + + LOG_TRACE(log, "Loading embedded programming languages frequencies loading"); + + auto resource = getResource("programming.zst"); + if (resource.empty()) + throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded programming languages frequencies"); + + String line; + String bigram; + Float64 frequency; + String programming_language; + + auto buf = std::make_unique(resource.data(), resource.size()); + ZstdInflatingReadBuffer in(std::move(buf)); + + while (!in.eof()) + { + readString(line, in); + in.ignore(); + + if (line.empty()) + continue; + + ReadBufferFromString buf_line(line); + + // Start loading a new language + if (line.starts_with("// ")) + { + // Skip "// " + buf_line.ignore(3); + readString(programming_language, buf_line); + + Language lang; + lang.name = programming_language; + programming_freq.push_back(std::move(lang)); + } + else + { + readStringUntilWhitespace(bigram, buf_line); + buf_line.ignore(); + readFloatText(frequency, buf_line); + + StringRef ref{string_pool.insert(bigram.data(), bigram.size()), bigram.size()}; + programming_freq.back().map[ref] = frequency; + } + } + LOG_TRACE(log, "Programming languages frequencies was added"); + } + + const Map & getEmotionalDict() + { + std::lock_guard lock(mutex); + if (emotional_dict.empty()) + loadEmotionalDict(); + + return emotional_dict; + } + + + const EncodingContainer & getEncodingsFrequency() + { + std::lock_guard lock(mutex); + if (encodings_freq.empty()) + loadEncodingsFrequency(); + + return encodings_freq; + } + + const Container & getProgrammingFrequency() + { + std::lock_guard lock(mutex); + if (programming_freq.empty()) + loadProgrammingFrequency(); + + return programming_freq; + } + + +private: + Arena string_pool; + + Map emotional_dict; + Container programming_freq; + EncodingContainer encodings_freq; + + std::mutex mutex; +}; +} diff --git a/src/Common/IntervalTree.h b/src/Common/IntervalTree.h index dc2987247d8..608bd5cf12c 100644 --- a/src/Common/IntervalTree.h +++ b/src/Common/IntervalTree.h @@ -291,6 +291,15 @@ public: size_t getIntervalsSize() const { return intervals_size; } + size_t getSizeInBytes() const + { + size_t nodes_size_in_bytes = nodes.size() * sizeof(Node); + size_t intervals_size_in_bytes = sorted_intervals.size() * sizeof(IntervalWithValue); + size_t result = nodes_size_in_bytes + intervals_size_in_bytes; + + return result; + } + private: struct Node { diff --git a/src/Common/LockMemoryExceptionInThread.h b/src/Common/LockMemoryExceptionInThread.h index dc2bccf257b..ec8f69806d7 100644 --- a/src/Common/LockMemoryExceptionInThread.h +++ b/src/Common/LockMemoryExceptionInThread.h @@ -1,5 +1,6 @@ #pragma once +#include #include /// To be able to avoid MEMORY_LIMIT_EXCEEDED Exception in destructors: diff --git a/src/Common/MemoryTrackerBlockerInThread.h b/src/Common/MemoryTrackerBlockerInThread.h index caad28f636e..381eb80df0c 100644 --- a/src/Common/MemoryTrackerBlockerInThread.h +++ b/src/Common/MemoryTrackerBlockerInThread.h @@ -1,5 +1,6 @@ #pragma once +#include #include /// To be able to temporarily stop memory tracking from current thread. 
diff --git a/src/Common/PoolBase.h b/src/Common/PoolBase.h index 85d4e84abca..a82a6efc4c1 100644 --- a/src/Common/PoolBase.h +++ b/src/Common/PoolBase.h @@ -41,6 +41,7 @@ private: ObjectPtr object; bool in_use = false; + std::atomic is_expired = false; PoolBase & pool; }; @@ -87,6 +88,14 @@ public: Object & operator*() & { return *data->data.object; } const Object & operator*() const & { return *data->data.object; } + /** + * Expire an object to make it reallocated later. + */ + void expire() + { + data->data.is_expired = true; + } + bool isNull() const { return data == nullptr; } PoolBase * getPool() const @@ -112,9 +121,22 @@ public: while (true) { for (auto & item : items) + { if (!item->in_use) - return Entry(*item); - + { + if (likely(!item->is_expired)) + { + return Entry(*item); + } + else + { + expireObject(item->object); + item->object = allocObject(); + item->is_expired = false; + return Entry(*item); + } + } + } if (items.size() < max_items) { ObjectPtr object = allocObject(); @@ -139,6 +161,12 @@ public: items.emplace_back(std::make_shared(allocObject(), *this)); } + inline size_t size() + { + std::unique_lock lock(mutex); + return items.size(); + } + private: /** The maximum size of the pool. */ unsigned max_items; @@ -162,4 +190,5 @@ protected: /** Creates a new object to put into the pool. */ virtual ObjectPtr allocObject() = 0; + virtual void expireObject(ObjectPtr) {} }; diff --git a/src/Common/StringSearcher.h b/src/Common/StringSearcher.h index f34bc6f7322..40629838284 100644 --- a/src/Common/StringSearcher.h +++ b/src/Common/StringSearcher.h @@ -24,7 +24,6 @@ namespace DB namespace ErrorCodes { - extern const int UNSUPPORTED_PARAMETER; extern const int BAD_ARGUMENTS; } @@ -34,9 +33,12 @@ namespace ErrorCodes */ -struct StringSearcherBase +class StringSearcherBase { +public: + bool force_fallback = false; #ifdef __SSE2__ +protected: static constexpr auto n = sizeof(__m128i); const int page_size = ::getPageSize(); @@ -53,7 +55,7 @@ template class StringSearcher; /// Case-insensitive UTF-8 searcher template <> -class StringSearcher : private StringSearcherBase +class StringSearcher : public StringSearcherBase { private: using UTF8SequenceBuffer = uint8_t[6]; @@ -119,11 +121,14 @@ public: size_t length_u = UTF8::convertCodePointToUTF8(first_u_u32, u_seq, sizeof(u_seq)); if (length_l != length_u) - throw Exception{"UTF8 sequences with different lowercase and uppercase lengths are not supported", ErrorCodes::UNSUPPORTED_PARAMETER}; + force_fallback = true; } l = l_seq[0]; u = u_seq[0]; + + if (force_fallback) + return; } #ifdef __SSE4_1__ @@ -158,7 +163,10 @@ public: /// @note Unicode standard states it is a rare but possible occasion if (!(dst_l_len == dst_u_len && dst_u_len == src_len)) - throw Exception{"UTF8 sequences with different lowercase and uppercase lengths are not supported", ErrorCodes::UNSUPPORTED_PARAMETER}; + { + force_fallback = true; + return; + } } cache_actual_len += src_len; @@ -199,9 +207,10 @@ public: if (Poco::Unicode::toLower(*haystack_code_point) != Poco::Unicode::toLower(*needle_code_point)) break; - /// @note assuming sequences for lowercase and uppercase have exact same length (that is not always true) - const auto len = UTF8::seqLength(*haystack_pos); + auto len = UTF8::seqLength(*haystack_pos); haystack_pos += len; + + len = UTF8::seqLength(*needle_pos); needle_pos += len; } @@ -213,7 +222,7 @@ public: { #ifdef __SSE4_1__ - if (pageSafe(pos)) + if (pageSafe(pos) && !force_fallback) { const auto v_haystack = 
_mm_loadu_si128(reinterpret_cast(pos)); const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel); @@ -262,7 +271,7 @@ public: while (haystack < haystack_end) { #ifdef __SSE4_1__ - if (haystack + n <= haystack_end && pageSafe(haystack)) + if (haystack + n <= haystack_end && pageSafe(haystack) && !force_fallback) { const auto v_haystack = _mm_loadu_si128(reinterpret_cast(haystack)); const auto v_against_l = _mm_cmpeq_epi8(v_haystack, patl); @@ -339,7 +348,7 @@ public: /// Case-insensitive ASCII searcher template <> -class StringSearcher : private StringSearcherBase +class StringSearcher : public StringSearcherBase { private: /// string to be searched for @@ -541,7 +550,7 @@ public: /// Case-sensitive searcher (both ASCII and UTF-8) template -class StringSearcher : private StringSearcherBase +class StringSearcher : public StringSearcherBase { private: /// string to be searched for @@ -725,7 +734,7 @@ public: // Any value outside of basic ASCII (>=128) is considered a non-separator symbol, hence UTF-8 strings // should work just fine. But any Unicode whitespace is not considered a token separtor. template -class TokenSearcher +class TokenSearcher : public StringSearcherBase { StringSearcher searcher; size_t needle_size; @@ -809,7 +818,7 @@ using ASCIICaseInsensitiveTokenSearcher = TokenSearcher +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int TIMEOUT_EXCEEDED; +} + +namespace +{ + constexpr size_t DBMS_SYSTEM_LOG_QUEUE_SIZE = 1048576; +} + +void ISystemLog::stopFlushThread() +{ + { + std::lock_guard lock(mutex); + + if (!saving_thread.joinable()) + { + return; + } + + if (is_shutdown) + { + return; + } + + is_shutdown = true; + + /// Tell thread to shutdown. + flush_event.notify_all(); + } + + saving_thread.join(); +} + +void ISystemLog::startup() +{ + std::lock_guard lock(mutex); + saving_thread = ThreadFromGlobalPool([this] { savingThreadFunction(); }); +} + +static thread_local bool recursive_add_call = false; + +template +void SystemLogBase::add(const LogElement & element) +{ + /// It is possible that the method will be called recursively. + /// Better to drop these events to avoid complications. + if (recursive_add_call) + return; + recursive_add_call = true; + SCOPE_EXIT({ recursive_add_call = false; }); + + /// Memory can be allocated while resizing on queue.push_back. + /// The size of allocation can be in order of a few megabytes. + /// But this should not be accounted for query memory usage. + /// Otherwise the tests like 01017_uniqCombined_memory_usage.sql will be flacky. + MemoryTrackerBlockerInThread temporarily_disable_memory_tracker(VariableContext::Global); + + /// Should not log messages under mutex. + bool queue_is_half_full = false; + + { + std::unique_lock lock(mutex); + + if (is_shutdown) + return; + + if (queue.size() == DBMS_SYSTEM_LOG_QUEUE_SIZE / 2) + { + queue_is_half_full = true; + + // The queue more than half full, time to flush. + // We only check for strict equality, because messages are added one + // by one, under exclusive lock, so we will see each message count. + // It is enough to only wake the flushing thread once, after the message + // count increases past half available size. 
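The queue thresholds described in the comment above reduce to a small standalone idea: wake the flushing thread once, when the queue becomes exactly half full, and drop further entries (reporting the overflow separately) once it is completely full. The class below is a simplified illustration of that policy only, not the real SystemLogBase:

#include <condition_variable>
#include <cstddef>
#include <deque>
#include <iostream>
#include <mutex>
#include <string>

class BoundedLogQueue
{
public:
    explicit BoundedLogQueue(std::size_t max_size_) : max_size(max_size_) {}

    // Returns false when the element was dropped because the queue is full.
    bool add(std::string element)
    {
        std::lock_guard<std::mutex> lock(mutex);

        if (queue.size() >= max_size)
            return false; // ignore further entries until the flusher catches up

        queue.push_back(std::move(element));

        // Strict equality is enough: elements are added one by one under the
        // lock, so the half-full boundary is observed exactly once per cycle.
        if (queue.size() == max_size / 2)
            flush_event.notify_all();

        return true;
    }

    // Called by the flushing thread; takes everything accumulated so far.
    std::deque<std::string> drain()
    {
        std::lock_guard<std::mutex> lock(mutex);
        std::deque<std::string> out;
        out.swap(queue);
        return out;
    }

private:
    const std::size_t max_size;
    std::mutex mutex;
    std::condition_variable flush_event;
    std::deque<std::string> queue;
};

int main()
{
    BoundedLogQueue queue(4);
    for (int i = 0; i < 6; ++i)
        std::cout << queue.add("entry " + std::to_string(i)) << ' ';
    std::cout << '\n' << queue.drain().size() << '\n'; // prints: 1 1 1 1 0 0, then 4
}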
+ const uint64_t queue_end = queue_front_index + queue.size(); + if (requested_flush_up_to < queue_end) + requested_flush_up_to = queue_end; + + flush_event.notify_all(); + } + + if (queue.size() >= DBMS_SYSTEM_LOG_QUEUE_SIZE) + { + // Ignore all further entries until the queue is flushed. + // Log a message about that. Don't spam it -- this might be especially + // problematic in case of trace log. Remember what the front index of the + // queue was when we last logged the message. If it changed, it means the + // queue was flushed, and we can log again. + if (queue_front_index != logged_queue_full_at_index) + { + logged_queue_full_at_index = queue_front_index; + + // TextLog sets its logger level to 0, so this log is a noop and + // there is no recursive logging. + lock.unlock(); + LOG_ERROR(log, "Queue is full for system log '{}' at {}", demangle(typeid(*this).name()), queue_front_index); + } + + return; + } + + queue.push_back(element); + } + + if (queue_is_half_full) + LOG_INFO(log, "Queue is half full for system log '{}'.", demangle(typeid(*this).name())); +} + +template +void SystemLogBase::flush(bool force) +{ + uint64_t this_thread_requested_offset; + + { + std::unique_lock lock(mutex); + + if (is_shutdown) + return; + + this_thread_requested_offset = queue_front_index + queue.size(); + + // Publish our flush request, taking care not to overwrite the requests + // made by other threads. + is_force_prepare_tables |= force; + requested_flush_up_to = std::max(requested_flush_up_to, this_thread_requested_offset); + + flush_event.notify_all(); + } + + LOG_DEBUG(log, "Requested flush up to offset {}", this_thread_requested_offset); + + // Use an arbitrary timeout to avoid endless waiting. 60s proved to be + // too fast for our parallel functional tests, probably because they + // heavily load the disk. + const int timeout_seconds = 180; + std::unique_lock lock(mutex); + bool result = flush_event.wait_for(lock, std::chrono::seconds(timeout_seconds), [&] + { + return flushed_up_to >= this_thread_requested_offset && !is_force_prepare_tables; + }); + + if (!result) + { + throw Exception( + "Timeout exceeded (" + toString(timeout_seconds) + " s) while flushing system log '" + demangle(typeid(*this).name()) + "'.", + ErrorCodes::TIMEOUT_EXCEEDED); + } +} + +#define INSTANTIATE_SYSTEM_LOG_BASE(ELEMENT) template class SystemLogBase; +SYSTEM_LOG_ELEMENTS(INSTANTIATE_SYSTEM_LOG_BASE) + +} diff --git a/src/Common/SystemLogBase.h b/src/Common/SystemLogBase.h new file mode 100644 index 00000000000..4b3ec5fe379 --- /dev/null +++ b/src/Common/SystemLogBase.h @@ -0,0 +1,109 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define SYSTEM_LOG_ELEMENTS(M) \ + M(AsynchronousMetricLogElement) \ + M(CrashLogElement) \ + M(MetricLogElement) \ + M(OpenTelemetrySpanLogElement) \ + M(PartLogElement) \ + M(QueryLogElement) \ + M(QueryThreadLogElement) \ + M(QueryViewsLogElement) \ + M(SessionLogElement) \ + M(TraceLogElement) \ + M(ZooKeeperLogElement) \ + M(TextLogElement) + +namespace Poco +{ +class Logger; +namespace Util +{ + class AbstractConfiguration; +} +} + +namespace DB +{ + +struct StorageID; + +class ISystemLog +{ +public: + virtual String getName() = 0; + //// force -- force table creation (used for SYSTEM FLUSH LOGS) + virtual void flush(bool force = false) = 0; + virtual void prepareTable() = 0; + + /// Start the background thread. + virtual void startup(); + + /// Stop the background flush thread before destructor. 
No more data will be written. + virtual void shutdown() = 0; + + virtual ~ISystemLog() = default; + + virtual void savingThreadFunction() = 0; + +protected: + ThreadFromGlobalPool saving_thread; + + /// Data shared between callers of add()/flush()/shutdown(), and the saving thread + std::mutex mutex; + + bool is_shutdown = false; + std::condition_variable flush_event; + + void stopFlushThread(); +}; + +template +class SystemLogBase : public ISystemLog +{ +public: + using Self = SystemLogBase; + + /** Append a record into log. + * Writing to table will be done asynchronously and in case of failure, record could be lost. + */ + void add(const LogElement & element); + + /// Flush data in the buffer to disk + void flush(bool force) override; + + String getName() override { return LogElement::name(); } + +protected: + Poco::Logger * log; + + // Queue is bounded. But its size is quite large to not block in all normal cases. + std::vector queue; + // An always-incrementing index of the first message currently in the queue. + // We use it to give a global sequential index to every message, so that we + // can wait until a particular message is flushed. This is used to implement + // synchronous log flushing for SYSTEM FLUSH LOGS. + uint64_t queue_front_index = 0; + // A flag that says we must create the tables even if the queue is empty. + bool is_force_prepare_tables = false; + // Requested to flush logs up to this index, exclusive + uint64_t requested_flush_up_to = 0; + // Flushed log up to this index, exclusive + uint64_t flushed_up_to = 0; + // Logged overflow message at this queue front index + uint64_t logged_queue_full_at_index = -1; +}; + +} diff --git a/src/Common/Volnitsky.h b/src/Common/Volnitsky.h index f08172c8a77..881817f33be 100644 --- a/src/Common/Volnitsky.h +++ b/src/Common/Volnitsky.h @@ -372,7 +372,7 @@ public: , fallback{VolnitskyTraits::isFallbackNeedle(needle_size, haystack_size_hint)} , fallback_searcher{needle_, needle_size} { - if (fallback) + if (fallback || fallback_searcher.force_fallback) return; hash = std::unique_ptr(new VolnitskyTraits::Offset[VolnitskyTraits::hash_size]{}); @@ -393,7 +393,7 @@ public: const auto haystack_end = haystack + haystack_size; - if (fallback || haystack_size <= needle_size) + if (fallback || haystack_size <= needle_size || fallback_searcher.force_fallback) return fallback_searcher.search(haystack, haystack_end); /// Let's "apply" the needle to the haystack and compare the n-gram from the end of the needle. diff --git a/src/Common/ZooKeeper/CMakeLists.txt b/src/Common/ZooKeeper/CMakeLists.txt index 8a705eb46e6..34ebad9bb50 100644 --- a/src/Common/ZooKeeper/CMakeLists.txt +++ b/src/Common/ZooKeeper/CMakeLists.txt @@ -3,12 +3,7 @@ include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake") add_headers_and_sources(clickhouse_common_zookeeper .) 
# for clickhouse server -# -# NOTE: this library depends from Interpreters (DB::SystemLog::add), -# and so it should be STATIC because otherwise: -# - it will either fail to compile with -Wl,--unresolved-symbols=report-all -# - or it will report errors at runtime -add_library(clickhouse_common_zookeeper STATIC ${clickhouse_common_zookeeper_headers} ${clickhouse_common_zookeeper_sources}) +add_library(clickhouse_common_zookeeper ${clickhouse_common_zookeeper_headers} ${clickhouse_common_zookeeper_sources}) target_compile_definitions (clickhouse_common_zookeeper PRIVATE -DZOOKEEPER_LOG) target_link_libraries (clickhouse_common_zookeeper PUBLIC diff --git a/src/Common/examples/CMakeLists.txt b/src/Common/examples/CMakeLists.txt index 14b5a6e60eb..9e551f3aa54 100644 --- a/src/Common/examples/CMakeLists.txt +++ b/src/Common/examples/CMakeLists.txt @@ -41,10 +41,10 @@ add_executable (space_saving space_saving.cpp) target_link_libraries (space_saving PRIVATE clickhouse_common_io) add_executable (integer_hash_tables_and_hashes integer_hash_tables_and_hashes.cpp) -target_link_libraries (integer_hash_tables_and_hashes PRIVATE dbms abseil_swiss_tables ch_contrib::sparsehash) +target_link_libraries (integer_hash_tables_and_hashes PRIVATE dbms ch_contrib::abseil_swiss_tables ch_contrib::sparsehash) add_executable (integer_hash_tables_benchmark integer_hash_tables_benchmark.cpp) -target_link_libraries (integer_hash_tables_benchmark PRIVATE dbms abseil_swiss_tables ch_contrib::sparsehash) +target_link_libraries (integer_hash_tables_benchmark PRIVATE dbms ch_contrib::abseil_swiss_tables ch_contrib::sparsehash) add_executable (cow_columns cow_columns.cpp) target_link_libraries (cow_columns PRIVATE clickhouse_common_io) @@ -78,8 +78,10 @@ target_link_libraries (shell_command_inout PRIVATE clickhouse_common_io) add_executable (executable_udf executable_udf.cpp) target_link_libraries (executable_udf PRIVATE dbms) -add_executable (hive_metastore_client hive_metastore_client.cpp) -target_link_libraries (hive_metastore_client PUBLIC ch_contrib::hivemetastore ch_contrib::thrift) +if (ENABLE_HIVE) + add_executable (hive_metastore_client hive_metastore_client.cpp) + target_link_libraries (hive_metastore_client PUBLIC ch_contrib::hivemetastore ch_contrib::thrift) +endif() add_executable (interval_tree interval_tree.cpp) target_link_libraries (interval_tree PRIVATE dbms) diff --git a/src/Common/getNumberOfPhysicalCPUCores.cpp b/src/Common/getNumberOfPhysicalCPUCores.cpp index 4c09f3d1ea0..2fc1dbf9669 100644 --- a/src/Common/getNumberOfPhysicalCPUCores.cpp +++ b/src/Common/getNumberOfPhysicalCPUCores.cpp @@ -1,27 +1,80 @@ #include "getNumberOfPhysicalCPUCores.h" #include +#if defined(OS_LINUX) +# include +# include +#endif #if USE_CPUID # include #endif #include +#if defined(OS_LINUX) +unsigned getCGroupLimitedCPUCores(unsigned default_cpu_count) +{ + // Try to look at cgroups limit if it is available. + auto read_from = [](const char * filename, int default_value) -> int { + std::ifstream infile(filename); + if (!infile.is_open()) + { + return default_value; + } + int idata; + if (infile >> idata) + return idata; + else + return default_value; + }; + + unsigned quota_count = default_cpu_count; + // Return the number of milliseconds per period process is guaranteed to run. 
+ // -1 for no quota + int cgroup_quota = read_from("/sys/fs/cgroup/cpu/cpu.cfs_quota_us", -1); + int cgroup_period = read_from("/sys/fs/cgroup/cpu/cpu.cfs_period_us", -1); + if (cgroup_quota > -1 && cgroup_period > 0) + { + quota_count = ceil(static_cast(cgroup_quota) / static_cast(cgroup_period)); + } + + // Share number (typically a number relative to 1024) (2048 typically expresses 2 CPUs worth of processing) + // -1 for no share setup + int cgroup_share = read_from("/sys/fs/cgroup/cpu/cpu.shares", -1); + // Convert 1024 to no shares setup + if (cgroup_share == 1024) + cgroup_share = -1; + +# define PER_CPU_SHARES 1024 + unsigned share_count = default_cpu_count; + if (cgroup_share > -1) + { + share_count = ceil(static_cast(cgroup_share) / static_cast(PER_CPU_SHARES)); + } + + return std::min(default_cpu_count, std::min(share_count, quota_count)); +} +#endif // OS_LINUX unsigned getNumberOfPhysicalCPUCores() { - static const unsigned number = [] - { -# if USE_CPUID + static const unsigned number = [] { + unsigned cpu_count = 0; // start with an invalid num +#if USE_CPUID + do + { cpu_raw_data_t raw_data; cpu_id_t data; /// On Xen VMs, libcpuid returns wrong info (zero number of cores). Fallback to alternative method. /// Also, libcpuid does not support some CPUs like AMD Hygon C86 7151. if (0 != cpuid_get_raw_data(&raw_data) || 0 != cpu_identify(&raw_data, &data) || data.num_logical_cpus == 0) - return std::thread::hardware_concurrency(); + { + // Just fallback + break; + } - unsigned res = data.num_cores * data.total_logical_cpus / data.num_logical_cpus; + cpu_count = data.num_cores * data.total_logical_cpus / data.num_logical_cpus; /// Also, libcpuid gives strange result on Google Compute Engine VMs. /// Example: @@ -29,14 +82,18 @@ unsigned getNumberOfPhysicalCPUCores() /// total_logical_cpus = 1, /// total number of logical cores on all sockets /// num_logical_cpus = 24. /// number of logical cores on current CPU socket /// It means two-way hyper-threading (24 / 12), but contradictory, 'total_logical_cpus' == 1. - - if (res != 0) - return res; -# endif + } while (false); +#endif /// As a fallback (also for non-x86 architectures) assume there are no hyper-threading on the system. /// (Actually, only Aarch64 is supported). - return std::thread::hardware_concurrency(); + if (cpu_count == 0) + cpu_count = std::thread::hardware_concurrency(); + +#if defined(OS_LINUX) + cpu_count = getCGroupLimitedCPUCores(cpu_count); +#endif // OS_LINUX + return cpu_count; }(); return number; } diff --git a/src/Common/memcmpSmall.h b/src/Common/memcmpSmall.h index db8641cb44d..57b9c731897 100644 --- a/src/Common/memcmpSmall.h +++ b/src/Common/memcmpSmall.h @@ -25,8 +25,240 @@ inline int cmp(T a, T b) /// We can process uninitialized memory in the functions below. /// Results don't depend on the values inside uninitialized memory but Memory Sanitizer cannot see it. /// Disable optimized functions if compile with Memory Sanitizer. +#if defined(__AVX512BW__) && defined(__AVX512VL__) && !defined(MEMORY_SANITIZER) +#include -#if defined(__SSE2__) && !defined(MEMORY_SANITIZER) + +/** All functions works under the following assumptions: + * - it's possible to read up to 15 excessive bytes after end of 'a' and 'b' region; + * - memory regions are relatively small and extra loop unrolling is not worth to do. + */ + +/** Variant when memory regions may have different sizes. 
+ */ +template +inline int memcmpSmallAllowOverflow15(const Char * a, size_t a_size, const Char * b, size_t b_size) +{ + size_t min_size = std::min(a_size, b_size); + + for (size_t offset = 0; offset < min_size; offset += 16) + { + uint16_t mask = _mm_cmp_epi8_mask( + _mm_loadu_si128(reinterpret_cast(a + offset)), + _mm_loadu_si128(reinterpret_cast(b + offset)), _MM_CMPINT_NE); + + if (mask) + { + offset += __builtin_ctz(mask); + + if (offset >= min_size) + break; + + return detail::cmp(a[offset], b[offset]); + } + } + + return detail::cmp(a_size, b_size); +} + + +/** Variant when memory regions may have different sizes. + * But compare the regions as the smaller one is padded with zero bytes up to the size of the larger. + * It's needed to hold that: toFixedString('abc', 5) = 'abc' + * for compatibility with SQL standard. + */ +template +inline int memcmpSmallLikeZeroPaddedAllowOverflow15(const Char * a, size_t a_size, const Char * b, size_t b_size) +{ + size_t min_size = std::min(a_size, b_size); + + for (size_t offset = 0; offset < min_size; offset += 16) + { + uint16_t mask = _mm_cmp_epi8_mask( + _mm_loadu_si128(reinterpret_cast(a + offset)), + _mm_loadu_si128(reinterpret_cast(b + offset)), _MM_CMPINT_NE); + + if (mask) + { + offset += __builtin_ctz(mask); + + if (offset >= min_size) + break; + + return detail::cmp(a[offset], b[offset]); + } + } + + /// The strings are equal up to min_size. + /// If the rest of the larger string is zero bytes then the strings are considered equal. + + size_t max_size; + const Char * longest; + int cmp; + + if (a_size == b_size) + { + return 0; + } + else if (a_size > b_size) + { + max_size = a_size; + longest = a; + cmp = 1; + } + else + { + max_size = b_size; + longest = b; + cmp = -1; + } + + const __m128i zero16 = _mm_setzero_si128(); + + for (size_t offset = min_size; offset < max_size; offset += 16) + { + uint16_t mask = _mm_cmpneq_epi8_mask( + _mm_loadu_si128(reinterpret_cast(longest + offset)), + zero16); + + if (mask) + { + offset += __builtin_ctz(mask); + + if (offset >= max_size) + return 0; + return cmp; + } + } + + return 0; +} + + +/** Variant when memory regions have same size. + * TODO Check if the compiler can optimize previous function when the caller pass identical sizes. + */ +template +inline int memcmpSmallAllowOverflow15(const Char * a, const Char * b, size_t size) +{ + for (size_t offset = 0; offset < size; offset += 16) + { + uint16_t mask = _mm_cmp_epi8_mask( + _mm_loadu_si128(reinterpret_cast(a + offset)), + _mm_loadu_si128(reinterpret_cast(b + offset)), _MM_CMPINT_NE); + + if (mask) + { + offset += __builtin_ctz(mask); + + if (offset >= size) + return 0; + + return detail::cmp(a[offset], b[offset]); + } + } + + return 0; +} + + +/** Compare memory regions for equality. + */ +template +inline bool memequalSmallAllowOverflow15(const Char * a, size_t a_size, const Char * b, size_t b_size) +{ + if (a_size != b_size) + return false; + + for (size_t offset = 0; offset < a_size; offset += 16) + { + uint16_t mask = _mm_cmp_epi8_mask( + _mm_loadu_si128(reinterpret_cast(a + offset)), + _mm_loadu_si128(reinterpret_cast(b + offset)), _MM_CMPINT_NE); + + if (mask) + { + offset += __builtin_ctz(mask); + return offset >= a_size; + } + } + + return true; +} + + +/** Variant when the caller know in advance that the size is a multiple of 16. 
+ */ +template +inline int memcmpSmallMultipleOf16(const Char * a, const Char * b, size_t size) +{ + for (size_t offset = 0; offset < size; offset += 16) + { + uint16_t mask = _mm_cmp_epi8_mask( + _mm_loadu_si128(reinterpret_cast(a + offset)), + _mm_loadu_si128(reinterpret_cast(b + offset)), _MM_CMPINT_NE); + + if (mask) + { + offset += __builtin_ctz(mask); + return detail::cmp(a[offset], b[offset]); + } + } + + return 0; +} + + +/** Variant when the size is 16 exactly. + */ +template +inline int memcmp16(const Char * a, const Char * b) +{ + uint16_t mask = _mm_cmp_epi8_mask( + _mm_loadu_si128(reinterpret_cast(a)), + _mm_loadu_si128(reinterpret_cast(b)), _MM_CMPINT_NE); + + if (mask) + { + auto offset = __builtin_ctz(mask); + return detail::cmp(a[offset], b[offset]); + } + + return 0; +} + + +/** Variant when the size is 16 exactly. + */ +inline bool memequal16(const void * a, const void * b) +{ + return 0xFFFF == _mm_cmp_epi8_mask( + _mm_loadu_si128(reinterpret_cast(a)), + _mm_loadu_si128(reinterpret_cast(b)), _MM_CMPINT_EQ); +} + + +/** Compare memory region to zero */ +inline bool memoryIsZeroSmallAllowOverflow15(const void * data, size_t size) +{ + const __m128i zero16 = _mm_setzero_si128(); + + for (size_t offset = 0; offset < size; offset += 16) + { + uint16_t mask = _mm_cmp_epi8_mask(zero16, + _mm_loadu_si128(reinterpret_cast(reinterpret_cast(data) + offset)), _MM_CMPINT_NE); + + if (mask) + { + offset += __builtin_ctz(mask); + return offset >= size; + } + } + + return true; +} + +#elif defined(__SSE2__) && !defined(MEMORY_SANITIZER) #include diff --git a/src/Common/mysqlxx/mysqlxx/Types.h b/src/Common/mysqlxx/mysqlxx/Types.h index 5fd9aa8bbc8..6ad4eb7c355 100644 --- a/src/Common/mysqlxx/mysqlxx/Types.h +++ b/src/Common/mysqlxx/mysqlxx/Types.h @@ -16,7 +16,15 @@ using MYSQL_ROW = char**; struct st_mysql_field; using MYSQL_FIELD = st_mysql_field; -enum struct enum_field_types; +enum struct enum_field_types { MYSQL_TYPE_DECIMAL, MYSQL_TYPE_TINY, + MYSQL_TYPE_SHORT, MYSQL_TYPE_LONG, + MYSQL_TYPE_FLOAT, MYSQL_TYPE_DOUBLE, + MYSQL_TYPE_NULL, MYSQL_TYPE_TIMESTAMP, + MYSQL_TYPE_LONGLONG, MYSQL_TYPE_INT24, + MYSQL_TYPE_DATE, MYSQL_TYPE_TIME, + MYSQL_TYPE_DATETIME, MYSQL_TYPE_YEAR, + MYSQL_TYPE_NEWDATE, MYSQL_TYPE_VARCHAR, + MYSQL_TYPE_BIT }; #endif diff --git a/src/Common/tests/gtest_poolbase.cpp b/src/Common/tests/gtest_poolbase.cpp new file mode 100644 index 00000000000..20c3281c964 --- /dev/null +++ b/src/Common/tests/gtest_poolbase.cpp @@ -0,0 +1,52 @@ +#include +#include +#include +#include +using namespace DB; + +class PoolObject +{ +public: + int x = 0; +}; + +class MyPoolBase : public PoolBase +{ +public: + using Object = PoolBase::Object; + using ObjectPtr = std::shared_ptr; + using Ptr = PoolBase::Ptr; + + int last_destroy_value = 0; + MyPoolBase() : PoolBase(100, &Poco::Logger::get("MyPoolBase")) { } + +protected: + ObjectPtr allocObject() override { return std::make_shared(); } + + void expireObject(ObjectPtr obj) override + { + LOG_TRACE(log, "expire object"); + ASSERT_TRUE(obj->x == 100); + last_destroy_value = obj->x; + } +}; + +TEST(PoolBase, testDestroy1) +{ + MyPoolBase pool; + { + auto obj_entry = pool.get(-1); + ASSERT_TRUE(!obj_entry.isNull()); + obj_entry->x = 100; + obj_entry.expire(); + } + ASSERT_EQ(1, pool.size()); + + { + auto obj_entry = pool.get(-1); + ASSERT_TRUE(!obj_entry.isNull()); + ASSERT_EQ(obj_entry->x, 0); + ASSERT_EQ(1, pool.size()); + } + ASSERT_EQ(100, pool.last_destroy_value); +} diff --git a/src/Coordination/FourLetterCommand.cpp 
b/src/Coordination/FourLetterCommand.cpp index 3d0ebe86bf3..4c76d052f9b 100644 --- a/src/Coordination/FourLetterCommand.cpp +++ b/src/Coordination/FourLetterCommand.cpp @@ -228,6 +228,8 @@ String MonitorCommand::run() print(ret, "watch_count", state_machine.getTotalWatchesCount()); print(ret, "ephemerals_count", state_machine.getTotalEphemeralNodesCount()); print(ret, "approximate_data_size", state_machine.getApproximateDataSize()); + print(ret, "key_arena_size", state_machine.getKeyArenaSize()); + print(ret, "latest_snapshot_size", state_machine.getLatestSnapshotBufSize()); #if defined(__linux__) || defined(__APPLE__) print(ret, "open_file_descriptor_count", getCurrentProcessFDCount()); diff --git a/src/Coordination/KeeperSnapshotManager.cpp b/src/Coordination/KeeperSnapshotManager.cpp index 518d569ca67..8d5df7c35e9 100644 --- a/src/Coordination/KeeperSnapshotManager.cpp +++ b/src/Coordination/KeeperSnapshotManager.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -19,7 +20,6 @@ namespace ErrorCodes { extern const int UNKNOWN_FORMAT_VERSION; extern const int UNKNOWN_SNAPSHOT; - extern const int LOGICAL_ERROR; } namespace @@ -41,20 +41,6 @@ namespace return base; } - std::string getBaseName(const String & path) - { - size_t basename_start = path.rfind('/'); - return std::string{&path[basename_start + 1], path.length() - basename_start - 1}; - } - - String parentPath(const String & path) - { - auto rslash_pos = path.rfind('/'); - if (rslash_pos > 0) - return path.substr(0, rslash_pos); - return "/"; - } - void writeNode(const KeeperStorage::Node & node, SnapshotVersion version, WriteBuffer & out) { writeBinary(node.data, out); @@ -182,8 +168,11 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr { const auto & path = it->key; const auto & node = it->value; + /// Benign race condition possible while taking snapshot: NuRaft decide to create snapshot at some log id + /// and only after some time we lock storage and enable snapshot mode. So snapshot_container_size can be + /// slightly bigger than required. 
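The parentPath()/getBaseName() helpers removed from KeeperSnapshotManager.cpp above (and from KeeperStorage.cpp further down) appear to move into a shared header; judging by the call sites, the shared versions return StringRef views rather than copies. A plain std::string sketch of what they compute:

#include <cassert>
#include <string>

// The real helpers return StringRef views into the stored key; plain
// std::string copies are used here to keep the sketch self-contained.
std::string parentPath(const std::string & path)
{
    auto rslash_pos = path.rfind('/');
    if (rslash_pos != std::string::npos && rslash_pos > 0)
        return path.substr(0, rslash_pos);
    return "/";
}

std::string getBaseName(const std::string & path)
{
    auto basename_start = path.rfind('/');
    return path.substr(basename_start + 1);
}

int main()
{
    assert(parentPath("/clickhouse/tables/t1") == "/clickhouse/tables");
    assert(getBaseName("/clickhouse/tables/t1") == "t1");
    assert(parentPath("/node") == "/");
    assert(getBaseName("/node") == "node");
}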
if (static_cast(node.stat.mzxid) > snapshot.snapshot_meta->get_last_log_idx()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to serialize node with mzxid {}, but last snapshot index {}", node.stat.mzxid, snapshot.snapshot_meta->get_last_log_idx()); + break; writeBinary(path, out); writeNode(node, snapshot.version, out); @@ -292,7 +281,7 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial if (itr.key != "/") { auto parent_path = parentPath(itr.key); - storage.container.updateValue(parent_path, [&path = itr.key] (KeeperStorage::Node & value) { value.children.insert(getBaseName(path)); }); + storage.container.updateValue(parent_path, [path = itr.key] (KeeperStorage::Node & value) { value.children.insert(getBaseName(path)); }); } } @@ -348,8 +337,8 @@ KeeperStorageSnapshot::KeeperStorageSnapshot(KeeperStorage * storage_, uint64_t , session_id(storage->session_id_counter) , cluster_config(cluster_config_) { - storage->enableSnapshotMode(); snapshot_container_size = storage->container.snapshotSize(); + storage->enableSnapshotMode(snapshot_container_size); begin = storage->getSnapshotIteratorBegin(); session_and_timeout = storage->getActiveSessions(); acl_map = storage->acl_map.getMapping(); @@ -362,8 +351,8 @@ KeeperStorageSnapshot::KeeperStorageSnapshot(KeeperStorage * storage_, const Sna , session_id(storage->session_id_counter) , cluster_config(cluster_config_) { - storage->enableSnapshotMode(); snapshot_container_size = storage->container.snapshotSize(); + storage->enableSnapshotMode(snapshot_container_size); begin = storage->getSnapshotIteratorBegin(); session_and_timeout = storage->getActiveSessions(); acl_map = storage->acl_map.getMapping(); diff --git a/src/Coordination/KeeperStateMachine.cpp b/src/Coordination/KeeperStateMachine.cpp index 1ac1a584451..20d3bcbfd30 100644 --- a/src/Coordination/KeeperStateMachine.cpp +++ b/src/Coordination/KeeperStateMachine.cpp @@ -155,7 +155,7 @@ bool KeeperStateMachine::apply_snapshot(nuraft::snapshot & s) { /// deserialize and apply snapshot to storage std::lock_guard lock(storage_and_responses_lock); - auto snapshot_deserialization_result = snapshot_manager.deserializeSnapshotFromBuffer(latest_snapshot_buf); + auto snapshot_deserialization_result = snapshot_manager.deserializeSnapshotFromBuffer(latest_snapshot_ptr); storage = std::move(snapshot_deserialization_result.storage); latest_snapshot_meta = snapshot_deserialization_result.snapshot_meta; cluster_config = snapshot_deserialization_result.cluster_config; @@ -212,14 +212,13 @@ void KeeperStateMachine::create_snapshot( } { - /// Must do it with lock (clearing elements from list) + /// Destroy snapshot with lock std::lock_guard lock(storage_and_responses_lock); + LOG_TRACE(log, "Clearing garbage after snapshot"); /// Turn off "snapshot mode" and clear outdate part of storage state storage->clearGarbageAfterSnapshot(); - /// Destroy snapshot with lock - snapshot.reset(); LOG_TRACE(log, "Cleared garbage after snapshot"); - + snapshot.reset(); } } catch (...) 
@@ -404,6 +403,20 @@ uint64_t KeeperStateMachine::getApproximateDataSize() const return storage->getApproximateDataSize(); } +uint64_t KeeperStateMachine::getKeyArenaSize() const +{ + std::lock_guard lock(storage_and_responses_lock); + return storage->getArenaDataSize(); +} + +uint64_t KeeperStateMachine::getLatestSnapshotBufSize() const +{ + std::lock_guard lock(snapshots_lock); + if (latest_snapshot_buf) + return latest_snapshot_buf->size(); + return 0; +} + ClusterConfigPtr KeeperStateMachine::getClusterConfig() const { std::lock_guard lock(cluster_config_lock); diff --git a/src/Coordination/KeeperStateMachine.h b/src/Coordination/KeeperStateMachine.h index 2803f4b9027..291b58e2498 100644 --- a/src/Coordination/KeeperStateMachine.h +++ b/src/Coordination/KeeperStateMachine.h @@ -97,6 +97,8 @@ public: uint64_t getSessionWithEphemeralNodesCount() const; uint64_t getTotalEphemeralNodesCount() const; uint64_t getApproximateDataSize() const; + uint64_t getKeyArenaSize() const; + uint64_t getLatestSnapshotBufSize() const; private: @@ -120,7 +122,7 @@ private: SnapshotsQueue & snapshots_queue; /// Mutex for snapshots - std::mutex snapshots_lock; + mutable std::mutex snapshots_lock; /// Lock for storage and responses_queue. It's important to process requests /// and push them to the responses queue while holding this lock. Otherwise diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index 4f174e4e803..f6992815a6c 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -23,20 +24,6 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } -static String parentPath(const String & path) -{ - auto rslash_pos = path.rfind('/'); - if (rslash_pos > 0) - return path.substr(0, rslash_pos); - return "/"; -} - -static std::string getBaseName(const String & path) -{ - size_t basename_start = path.rfind('/'); - return std::string{&path[basename_start + 1], path.length() - basename_start - 1}; -} - static String base64Encode(const String & decoded) { std::ostringstream ostr; // STYLE_CHECK_ALLOW_STD_STRING_STREAM @@ -155,12 +142,12 @@ static KeeperStorage::ResponsesForSessions processWatchesImpl(const String & pat Strings paths_to_check_for_list_watches; if (event_type == Coordination::Event::CREATED) { - paths_to_check_for_list_watches.push_back(parent_path); /// Trigger list watches for parent + paths_to_check_for_list_watches.push_back(parent_path.toString()); /// Trigger list watches for parent } else if (event_type == Coordination::Event::DELETED) { paths_to_check_for_list_watches.push_back(path); /// Trigger both list watches for this path - paths_to_check_for_list_watches.push_back(parent_path); /// And for parent path + paths_to_check_for_list_watches.push_back(parent_path.toString()); /// And for parent path } /// CHANGED event never trigger list wathes @@ -244,7 +231,8 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr bool checkAuth(KeeperStorage & storage, int64_t session_id) const override { auto & container = storage.container; - auto parent_path = parentPath(zk_request->getPath()); + auto path = zk_request->getPath(); + auto parent_path = parentPath(path); auto it = container.find(parent_path); if (it == container.end()) @@ -297,8 +285,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr response.error = Coordination::Error::ZNODEEXISTS; return { response_ptr, undo }; } - auto 
child_path = getBaseName(path_created); - if (child_path.empty()) + if (getBaseName(path_created).size == 0) { response.error = Coordination::Error::ZBADARGUMENTS; return { response_ptr, undo }; @@ -330,15 +317,18 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr created_node.data = request.data; created_node.is_sequental = request.is_sequential; + auto [map_key, _] = container.insert(path_created, std::move(created_node)); + /// Take child path from key owned by map. + auto child_path = getBaseName(map_key->getKey()); + int32_t parent_cversion = request.parent_cversion; int64_t prev_parent_zxid; int32_t prev_parent_cversion; container.updateValue(parent_path, [child_path, zxid, &prev_parent_zxid, parent_cversion, &prev_parent_cversion] (KeeperStorage::Node & parent) { - parent.children.insert(child_path); - parent.size_bytes += child_path.size(); + parent.size_bytes += child_path.size; prev_parent_cversion = parent.stat.cversion; prev_parent_zxid = parent.stat.pzxid; @@ -356,14 +346,12 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr }); response.path_created = path_created; - container.insert(path_created, std::move(created_node)); if (request.is_ephemeral) ephemerals[session_id].emplace(path_created); undo = [&storage, prev_parent_zxid, prev_parent_cversion, session_id, path_created, is_ephemeral = request.is_ephemeral, parent_path, child_path, acl_id] { - storage.container.erase(path_created); storage.acl_map.removeUsage(acl_id); if (is_ephemeral) @@ -376,8 +364,10 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr undo_parent.stat.cversion = prev_parent_cversion; undo_parent.stat.pzxid = prev_parent_zxid; undo_parent.children.erase(child_path); - undo_parent.size_bytes -= child_path.size(); + undo_parent.size_bytes -= child_path.size; }); + + storage.container.erase(path_created); }; response.error = Coordination::Error::ZOK; @@ -504,33 +494,34 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr storage.acl_map.removeUsage(prev_node.acl_id); - auto child_basename = getBaseName(it->key); - container.updateValue(parentPath(request.path), [&child_basename] (KeeperStorage::Node & parent) + container.updateValue(parentPath(request.path), [child_basename = getBaseName(it->key)] (KeeperStorage::Node & parent) { --parent.stat.numChildren; ++parent.stat.cversion; parent.children.erase(child_basename); - parent.size_bytes -= child_basename.size(); + parent.size_bytes -= child_basename.size; }); response.error = Coordination::Error::ZOK; - + /// Erase full path from container after child removed from parent container.erase(request.path); - undo = [prev_node, &storage, path = request.path, child_basename] + undo = [prev_node, &storage, path = request.path] { if (prev_node.stat.ephemeralOwner != 0) storage.ephemerals[prev_node.stat.ephemeralOwner].emplace(path); storage.acl_map.addUsage(prev_node.acl_id); - storage.container.insert(path, prev_node); - storage.container.updateValue(parentPath(path), [&child_basename] (KeeperStorage::Node & parent) + /// Dangerous place: we are adding StringRef to child into children unordered_hash set. + /// That's why we are taking getBaseName from inserted key, not from the path from request object. 
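The comment above is the key constraint: children are stored as lightweight references (StringRef), so each reference must point at memory owned by the container itself, never at a temporary path string. A tiny stand-alone sketch of the same lifetime rule, using std::string_view and std::map as stand-ins:

#include <cassert>
#include <map>
#include <set>
#include <string>
#include <string_view>

int main()
{
    std::map<std::string, int> container;         // owns the full paths
    std::set<std::string_view> children;          // stores views into the keys

    std::string path_created = "/parent/child";   // local, short-lived buffer

    // Wrong idea: a view into the local string would dangle once
    // path_created is destroyed or reused.
    //
    // Right idea (what the diff does): insert first, then build the view
    // from the key that the container itself owns.
    auto [it, inserted] = container.emplace(path_created, 42);
    assert(inserted);

    std::string_view stored_path = it->first;                          // owned by the map
    std::string_view base_name = stored_path.substr(stored_path.rfind('/') + 1);
    children.insert(base_name);                                        // stays valid

    assert(children.count("child") == 1);
    return 0;
}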
+ auto [map_key, _] = storage.container.insert(path, prev_node); + storage.container.updateValue(parentPath(path), [child_name = getBaseName(map_key->getKey())] (KeeperStorage::Node & parent) { ++parent.stat.numChildren; --parent.stat.cversion; - parent.children.insert(child_basename); - parent.size_bytes += child_basename.size(); + parent.children.insert(child_name); + parent.size_bytes += child_name.size; }); }; } @@ -672,6 +663,7 @@ struct KeeperStorageListRequestProcessor final : public KeeperStorageRequestProc Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse(); Coordination::ZooKeeperListResponse & response = dynamic_cast(*response_ptr); Coordination::ZooKeeperListRequest & request = dynamic_cast(*zk_request); + auto it = container.find(request.path); if (it == container.end()) { @@ -683,7 +675,10 @@ struct KeeperStorageListRequestProcessor final : public KeeperStorageRequestProc if (path_prefix.empty()) throw DB::Exception("Logical error: path cannot be empty", ErrorCodes::LOGICAL_ERROR); - response.names.insert(response.names.end(), it->value.children.begin(), it->value.children.end()); + response.names.reserve(it->value.children.size()); + + for (const auto child : it->value.children) + response.names.push_back(child.toString()); response.stat = it->value.stat; response.error = Coordination::Error::ZOK; @@ -1092,15 +1087,17 @@ KeeperStorage::ResponsesForSessions KeeperStorage::processRequest(const Coordina { for (const auto & ephemeral_path : it->second) { - container.erase(ephemeral_path); container.updateValue(parentPath(ephemeral_path), [&ephemeral_path] (KeeperStorage::Node & parent) { --parent.stat.numChildren; ++parent.stat.cversion; - parent.children.erase(getBaseName(ephemeral_path)); - parent.size_bytes -= getBaseName(ephemeral_path).size(); + auto base_name = getBaseName(ephemeral_path); + parent.children.erase(base_name); + parent.size_bytes -= base_name.size; }); + container.erase(ephemeral_path); + auto responses = processWatchesImpl(ephemeral_path, watches, list_watches, Coordination::Event::DELETED); results.insert(results.end(), responses.begin(), responses.end()); } diff --git a/src/Coordination/KeeperStorage.h b/src/Coordination/KeeperStorage.h index 11d191b7f50..cbf33be61a0 100644 --- a/src/Coordination/KeeperStorage.h +++ b/src/Coordination/KeeperStorage.h @@ -8,16 +8,17 @@ #include #include #include -#include #include +#include + namespace DB { struct KeeperStorageRequestProcessor; using KeeperStorageRequestProcessorPtr = std::shared_ptr; using ResponseCallback = std::function; -using ChildrenSet = std::unordered_set; +using ChildrenSet = absl::flat_hash_set; using SessionAndTimeout = std::unordered_map; struct KeeperStorageSnapshot; @@ -28,6 +29,7 @@ struct KeeperStorageSnapshot; class KeeperStorage { public: + struct Node { String data; @@ -158,9 +160,9 @@ public: /// Set of methods for creating snapshots /// Turn on snapshot mode, so data inside Container is not deleted, but replaced with new version. - void enableSnapshotMode() + void enableSnapshotMode(size_t up_to_size) { - container.enableSnapshotMode(); + container.enableSnapshotMode(up_to_size); } /// Turn off snapshot mode. 
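The List handler above now copies each StringRef child into an owning std::string for the response, reserving the vector first. A reduced sketch of that copy step with std::string_view stand-ins:

#include <cassert>
#include <string>
#include <string_view>
#include <unordered_set>
#include <vector>

// Children are stored as lightweight references (string_view here, StringRef
// in the real code), so the response needs owning std::string copies.
std::vector<std::string> childrenToNames(const std::unordered_set<std::string_view> & children)
{
    std::vector<std::string> names;
    names.reserve(children.size());                // single allocation up front
    for (const auto child : children)
        names.push_back(std::string(child));       // analogue of child.toString()
    return names;
}

int main()
{
    // Storage that owns the actual characters; the set only references it.
    std::vector<std::string> owned{"a", "b", "c"};
    std::unordered_set<std::string_view> children(owned.begin(), owned.end());

    auto names = childrenToNames(children);
    assert(names.size() == 3);
    return 0;
}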
@@ -203,6 +205,12 @@ public: return container.getApproximateDataSize(); } + uint64_t getArenaDataSize() const + { + return container.keyArenaSize(); + } + + uint64_t getTotalWatchesCount() const; uint64_t getWatchedPathsCount() const diff --git a/src/Coordination/SnapshotableHashTable.h b/src/Coordination/SnapshotableHashTable.h index 7704825f830..b1d72578530 100644 --- a/src/Coordination/SnapshotableHashTable.h +++ b/src/Coordination/SnapshotableHashTable.h @@ -1,8 +1,11 @@ #pragma once #include +#include +#include #include #include #include +#include namespace DB { @@ -10,11 +13,12 @@ namespace DB template struct ListNode { - std::string key; + StringRef key; V value; - bool active_in_map; -}; + bool active_in_map{true}; + bool free_key{false}; +}; template class SnapshotableHashTable @@ -23,11 +27,15 @@ private: using ListElem = ListNode; using List = std::list; - using IndexMap = std::unordered_map; + using Mapped = typename List::iterator; + using IndexMap = HashMap; List list; IndexMap map; bool snapshot_mode{false}; + /// Allows to avoid additional copies in updateValue function + size_t snapshot_up_to_size = 0; + ArenaWithFreeLists arena; uint64_t approximate_data_size{0}; @@ -105,51 +113,68 @@ private: } } + StringRef copyStringInArena(const std::string & value_to_copy) + { + size_t value_to_copy_size = value_to_copy.size(); + char * place_for_key = arena.alloc(value_to_copy_size); + memcpy(reinterpret_cast(place_for_key), reinterpret_cast(value_to_copy.data()), value_to_copy_size); + StringRef updated_value{place_for_key, value_to_copy_size}; + + return updated_value; + } + + public: using iterator = typename List::iterator; using const_iterator = typename List::const_iterator; - using reverse_iterator = typename List::reverse_iterator; - using const_reverse_iterator = typename List::const_reverse_iterator; using ValueUpdater = std::function; - bool insert(const std::string & key, const V & value) + std::pair insert(const std::string & key, const V & value) { - auto it = map.find(key); - if (it == map.end()) + size_t hash_value = map.hash(key); + auto it = map.find(key, hash_value); + + if (!it) { - ListElem elem{key, value, true}; + ListElem elem{copyStringInArena(key), value, true}; auto itr = list.insert(list.end(), elem); - map.emplace(itr->key, itr); + bool inserted; + map.emplace(itr->key, it, inserted, hash_value); + assert(inserted); + + it->getMapped() = itr; updateDataSize(INSERT, key.size(), value.sizeInBytes(), 0); - return true; + return std::make_pair(it, true); } - return false; + return std::make_pair(it, false); } - void insertOrReplace(const std::string & key, const V & value) { - auto it = map.find(key); - uint64_t old_value_size = it == map.end() ? 0 : it->second->value.sizeInBytes(); + size_t hash_value = map.hash(key); + auto it = map.find(key, hash_value); + uint64_t old_value_size = it == map.end() ? 
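A stand-alone sketch of the new copyStringInArena() step: a key is copied once into an arena and referenced as a (pointer, length) pair from then on. TrivialArena and Ref below are hypothetical stand-ins for ArenaWithFreeLists and StringRef:

#include <cassert>
#include <cstddef>
#include <cstring>
#include <deque>
#include <string>
#include <vector>

struct Ref { const char * data; size_t size; };

class TrivialArena
{
public:
    char * alloc(size_t n) { chunks.emplace_back(n); return chunks.back().data(); }
    size_t size() const
    {
        size_t total = 0;
        for (const auto & c : chunks) total += c.size();
        return total;
    }
private:
    std::deque<std::vector<char>> chunks;   // deque keeps earlier chunks stable
};

Ref copyStringInArena(TrivialArena & arena, const std::string & value)
{
    char * place = arena.alloc(value.size());
    std::memcpy(place, value.data(), value.size());
    return Ref{place, value.size()};
}

int main()
{
    TrivialArena arena;
    Ref key = copyStringInArena(arena, "/zk/path");
    assert(std::string(key.data, key.size) == "/zk/path");
    assert(arena.size() == 8);              // rough analogue of keyArenaSize()
    return 0;
}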
0 : it->getMapped()->value.sizeInBytes(); if (it == map.end()) { - ListElem elem{key, value, true}; + ListElem elem{copyStringInArena(key), value, true}; auto itr = list.insert(list.end(), elem); - map.emplace(itr->key, itr); + bool inserted; + map.emplace(itr->key, it, inserted, hash_value); + assert(inserted); + it->getMapped() = itr; } else { - auto list_itr = it->second; + auto list_itr = it->getMapped(); if (snapshot_mode) { - ListElem elem{key, value, true}; + ListElem elem{list_itr->key, value, true}; list_itr->active_in_map = false; auto new_list_itr = list.insert(list.end(), elem); - map.erase(it); - map.emplace(new_list_itr->key, new_list_itr); + it->getMapped() = new_list_itr; } else { @@ -165,16 +190,18 @@ public: if (it == map.end()) return false; - auto list_itr = it->second; + auto list_itr = it->getMapped(); uint64_t old_data_size = list_itr->value.sizeInBytes(); if (snapshot_mode) { list_itr->active_in_map = false; - map.erase(it); + list_itr->free_key = true; + map.erase(it->getKey()); } else { - map.erase(it); + map.erase(it->getKey()); + arena.free(const_cast(list_itr->key.data), list_itr->key.size); list.erase(list_itr); } @@ -187,48 +214,62 @@ public: return map.find(key) != map.end(); } - const_iterator updateValue(const std::string & key, ValueUpdater updater) + const_iterator updateValue(StringRef key, ValueUpdater updater) { - auto it = map.find(key); + size_t hash_value = map.hash(key); + auto it = map.find(key, hash_value); assert(it != map.end()); - auto list_itr = it->second; + auto list_itr = it->getMapped(); uint64_t old_value_size = list_itr->value.sizeInBytes(); const_iterator ret; if (snapshot_mode) { - auto elem_copy = *(list_itr); - list_itr->active_in_map = false; - map.erase(it); - updater(elem_copy.value); - auto itr = list.insert(list.end(), elem_copy); - map.emplace(itr->key, itr); - ret = itr; + /// We in snapshot mode but updating some node which is already more + /// fresh than snapshot distance. So it will not participate in + /// snapshot and we don't need to copy it. 
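The comment above is the heart of the optimisation: only elements inside the snapshot prefix need a preserved copy when updated. A simplified copy-on-write sketch over a plain std::list<int> (not the real node type):

#include <cassert>
#include <cstddef>
#include <iterator>
#include <list>

struct CowList
{
    std::list<int> list;
    bool snapshot_mode = false;
    size_t snapshot_up_to_size = 0;       // list size when the snapshot began

    // Returns an iterator to the element holding the new value.
    std::list<int>::iterator update(std::list<int>::iterator it, int new_value)
    {
        if (snapshot_mode && static_cast<size_t>(std::distance(list.begin(), it)) < snapshot_up_to_size)
        {
            // Element participates in the snapshot: keep the old value alive,
            // append an updated copy at the end.
            return list.insert(list.end(), new_value);
        }
        *it = new_value;                  // too fresh for the snapshot: update in place
        return it;
    }
};

int main()
{
    CowList c;
    c.list = {1, 2, 3};
    c.snapshot_mode = true;
    c.snapshot_up_to_size = c.list.size();

    auto it = c.update(c.list.begin(), 10);   // copied, old value 1 preserved
    assert(c.list.size() == 4 && *it == 10 && c.list.front() == 1);

    auto fresh = c.update(it, 20);            // beyond the snapshot prefix: in place
    assert(c.list.size() == 4 && *fresh == 20);
    return 0;
}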
+ size_t distance = std::distance(list.begin(), list_itr); + if (distance < snapshot_up_to_size) + { + auto elem_copy = *(list_itr); + list_itr->active_in_map = false; + updater(elem_copy.value); + auto itr = list.insert(list.end(), elem_copy); + it->getMapped() = itr; + ret = itr; + } + else + { + updater(list_itr->value); + ret = list_itr; + } } else { updater(list_itr->value); ret = list_itr; } - updateDataSize(UPDATE_VALUE, key.size(), ret->value.sizeInBytes(), old_value_size); + + updateDataSize(UPDATE_VALUE, key.size, ret->value.sizeInBytes(), old_value_size); return ret; } - const_iterator find(const std::string & key) const + const_iterator find(StringRef key) const { auto map_it = map.find(key); if (map_it != map.end()) - return map_it->second; + return map_it->getMapped(); return list.end(); } - const V & getValue(const std::string & key) const + + const V & getValue(StringRef key) const { auto it = map.find(key); - assert(it != map.end()); - return it->second->value; + assert(it); + return it->getMapped()->value; } void clearOutdatedNodes() @@ -239,29 +280,39 @@ public: { if (!itr->active_in_map) { - updateDataSize(CLEAR_OUTDATED_NODES, itr->key.size(), itr->value.sizeInBytes(), 0); + updateDataSize(CLEAR_OUTDATED_NODES, itr->key.size, itr->value.sizeInBytes(), 0); + if (itr->free_key) + arena.free(const_cast(itr->key.data), itr->key.size); itr = list.erase(itr); } else + { + assert(!itr->free_key); itr++; + } } } void clear() { - list.clear(); map.clear(); + for (auto itr = list.begin(); itr != list.end(); ++itr) + arena.free(const_cast(itr->key.data), itr->key.size); + list.clear(); updateDataSize(CLEAR, 0, 0, 0); } - void enableSnapshotMode() + void enableSnapshotMode(size_t up_to_size) { snapshot_mode = true; + snapshot_up_to_size = up_to_size; } void disableSnapshotMode() { + snapshot_mode = false; + snapshot_up_to_size = 0; } size_t size() const @@ -279,15 +330,15 @@ public: return approximate_data_size; } + uint64_t keyArenaSize() const + { + return arena.size(); + } + iterator begin() { return list.begin(); } const_iterator begin() const { return list.cbegin(); } iterator end() { return list.end(); } const_iterator end() const { return list.cend(); } - - reverse_iterator rbegin() { return list.rbegin(); } - const_reverse_iterator rbegin() const { return list.crbegin(); } - reverse_iterator rend() { return list.rend(); } - const_reverse_iterator rend() const { return list.crend(); } }; diff --git a/src/Coordination/ZooKeeperDataReader.cpp b/src/Coordination/ZooKeeperDataReader.cpp index bd13a70252e..5d7b78d6a28 100644 --- a/src/Coordination/ZooKeeperDataReader.cpp +++ b/src/Coordination/ZooKeeperDataReader.cpp @@ -1,10 +1,13 @@ #include + #include #include +#include + #include #include #include -#include +#include namespace DB @@ -16,20 +19,6 @@ namespace ErrorCodes extern const int CORRUPTED_DATA; } -static String parentPath(const String & path) -{ - auto rslash_pos = path.rfind('/'); - if (rslash_pos > 0) - return path.substr(0, rslash_pos); - return "/"; -} - -static std::string getBaseName(const String & path) -{ - size_t basename_start = path.rfind('/'); - return std::string{&path[basename_start + 1], path.length() - basename_start - 1}; -} - int64_t getZxidFromName(const std::string & filename) { std::filesystem::path path(filename); @@ -148,7 +137,7 @@ int64_t deserializeStorageData(KeeperStorage & storage, ReadBuffer & in, Poco::L if (itr.key != "/") { auto parent_path = parentPath(itr.key); - storage.container.updateValue(parent_path, [&path = itr.key] 
(KeeperStorage::Node & value) { value.children.insert(getBaseName(path)); value.stat.numChildren++; }); + storage.container.updateValue(parent_path, [path = itr.key] (KeeperStorage::Node & value) { value.children.insert(getBaseName(path)); value.stat.numChildren++; }); } } diff --git a/src/Coordination/pathUtils.cpp b/src/Coordination/pathUtils.cpp new file mode 100644 index 00000000000..1e1da339d2e --- /dev/null +++ b/src/Coordination/pathUtils.cpp @@ -0,0 +1,38 @@ +#include +#include + +namespace DB +{ + +static size_t findLastSlash(StringRef path) +{ + if (path.size == 0) + return std::string::npos; + + for (size_t i = path.size - 1; i > 0; --i) + { + if (path.data[i] == '/') + return i; + } + + if (path.data[0] == '/') + return 0; + + return std::string::npos; +} + +StringRef parentPath(StringRef path) +{ + auto rslash_pos = findLastSlash(path); + if (rslash_pos > 0) + return StringRef{path.data, rslash_pos}; + return "/"; +} + +StringRef getBaseName(StringRef path) +{ + size_t basename_start = findLastSlash(path); + return StringRef{path.data + basename_start + 1, path.size - basename_start - 1}; +} + +} diff --git a/src/Coordination/pathUtils.h b/src/Coordination/pathUtils.h new file mode 100644 index 00000000000..69ed2d8b177 --- /dev/null +++ b/src/Coordination/pathUtils.h @@ -0,0 +1,13 @@ +#pragma once + +#include +#include + +namespace DB +{ + +StringRef parentPath(StringRef path); + +StringRef getBaseName(StringRef path); + +} diff --git a/src/Coordination/tests/gtest_coordination.cpp b/src/Coordination/tests/gtest_coordination.cpp index d274ee34a88..9c434ebb653 100644 --- a/src/Coordination/tests/gtest_coordination.cpp +++ b/src/Coordination/tests/gtest_coordination.cpp @@ -844,10 +844,10 @@ struct IntNode TEST_P(CoordinationTest, SnapshotableHashMapSimple) { DB::SnapshotableHashTable hello; - EXPECT_TRUE(hello.insert("hello", 5)); + EXPECT_TRUE(hello.insert("hello", 5).second); EXPECT_TRUE(hello.contains("hello")); EXPECT_EQ(hello.getValue("hello"), 5); - EXPECT_FALSE(hello.insert("hello", 145)); + EXPECT_FALSE(hello.insert("hello", 145).second); EXPECT_EQ(hello.getValue("hello"), 5); hello.updateValue("hello", [](IntNode & value) { value = 7; }); EXPECT_EQ(hello.getValue("hello"), 7); @@ -859,10 +859,10 @@ TEST_P(CoordinationTest, SnapshotableHashMapSimple) TEST_P(CoordinationTest, SnapshotableHashMapTrySnapshot) { DB::SnapshotableHashTable map_snp; - EXPECT_TRUE(map_snp.insert("/hello", 7)); - EXPECT_FALSE(map_snp.insert("/hello", 145)); - map_snp.enableSnapshotMode(); - EXPECT_FALSE(map_snp.insert("/hello", 145)); + EXPECT_TRUE(map_snp.insert("/hello", 7).second); + EXPECT_FALSE(map_snp.insert("/hello", 145).second); + map_snp.enableSnapshotMode(100000); + EXPECT_FALSE(map_snp.insert("/hello", 145).second); map_snp.updateValue("/hello", [](IntNode & value) { value = 554; }); EXPECT_EQ(map_snp.getValue("/hello"), 554); EXPECT_EQ(map_snp.snapshotSize(), 2); @@ -880,7 +880,7 @@ TEST_P(CoordinationTest, SnapshotableHashMapTrySnapshot) EXPECT_EQ(itr, map_snp.end()); for (size_t i = 0; i < 5; ++i) { - EXPECT_TRUE(map_snp.insert("/hello" + std::to_string(i), i)); + EXPECT_TRUE(map_snp.insert("/hello" + std::to_string(i), i).second); } EXPECT_EQ(map_snp.getValue("/hello3"), 3); @@ -951,7 +951,7 @@ TEST_P(CoordinationTest, SnapshotableHashMapDataSize) hello.clear(); EXPECT_EQ(hello.getApproximateDataSize(), 0); - hello.enableSnapshotMode(); + hello.enableSnapshotMode(10000); hello.insert("hello", 1); EXPECT_EQ(hello.getApproximateDataSize(), 9); hello.updateValue("hello", [](IntNode 
& value) { value = 2; }); @@ -984,7 +984,7 @@ TEST_P(CoordinationTest, SnapshotableHashMapDataSize) world.erase("world"); EXPECT_EQ(world.getApproximateDataSize(), 0); - world.enableSnapshotMode(); + world.enableSnapshotMode(100000); world.insert("world", n1); EXPECT_EQ(world.getApproximateDataSize(), 98); world.updateValue("world", [&](Node & value) { value = n2; }); diff --git a/src/Core/BackgroundSchedulePool.cpp b/src/Core/BackgroundSchedulePool.cpp index 9a42f752db2..18c43d8c45f 100644 --- a/src/Core/BackgroundSchedulePool.cpp +++ b/src/Core/BackgroundSchedulePool.cpp @@ -5,7 +5,6 @@ #include #include #include -#include namespace DB @@ -246,7 +245,6 @@ void BackgroundSchedulePool::threadFunction() setThreadName(thread_name.c_str()); attachToThreadGroup(); - SCOPE_EXIT({ CurrentThread::detachQueryIfNotDetached(); }); while (!shutdown) { @@ -273,7 +271,6 @@ void BackgroundSchedulePool::delayExecutionThreadFunction() setThreadName((thread_name + "/D").c_str()); attachToThreadGroup(); - SCOPE_EXIT({ CurrentThread::detachQueryIfNotDetached(); }); while (!shutdown) { diff --git a/src/Core/MySQL/MySQLReplication.cpp b/src/Core/MySQL/MySQLReplication.cpp index fb230f412f0..50f6be23f83 100644 --- a/src/Core/MySQL/MySQLReplication.cpp +++ b/src/Core/MySQL/MySQLReplication.cpp @@ -204,6 +204,7 @@ namespace MySQLReplication case MYSQL_TYPE_DATE: case MYSQL_TYPE_DATETIME: case MYSQL_TYPE_NEWDATE: + case MYSQL_TYPE_YEAR: { /// No data here. column_meta.emplace_back(0); @@ -214,7 +215,9 @@ namespace MySQLReplication case MYSQL_TYPE_DOUBLE: case MYSQL_TYPE_TIMESTAMP2: case MYSQL_TYPE_DATETIME2: + case MYSQL_TYPE_TIME2: case MYSQL_TYPE_BLOB: + case MYSQL_TYPE_GEOMETRY: { column_meta.emplace_back(UInt16(meta[pos])); pos += 1; @@ -432,6 +435,98 @@ namespace MySQLReplication row.push_back(Field(date_day_number.toUnderType())); break; } + case MYSQL_TYPE_YEAR: { + Int16 val = 0; + payload.readStrict(reinterpret_cast(&val), 1); + row.push_back(Field{UInt16{static_cast(val + 1900)}}); + break; + } + case MYSQL_TYPE_TIME2: + { + UInt64 uintpart = 0UL; + Int32 frac = 0U; + Int64 ltime; + Int64 intpart; + switch (meta) + { + case 0: + { + readBigEndianStrict(payload, reinterpret_cast(&uintpart), 3); + intpart = uintpart - 0x800000L; + ltime = intpart << 24; + break; + } + case 1: + case 2: + { + readBigEndianStrict(payload, reinterpret_cast(&uintpart), 3); + intpart = uintpart - 0x800000L; + readBigEndianStrict(payload, reinterpret_cast(&frac), 1); + if (intpart < 0 && frac > 0) + { + intpart ++; + frac -= 0x100; + } + frac = frac * 10000; + ltime = intpart << 24; + break; + } + case 3: + case 4: + { + readBigEndianStrict(payload, reinterpret_cast(&uintpart), 3); + intpart = uintpart - 0x800000L; + readBigEndianStrict(payload, reinterpret_cast(&frac), 2); + if (intpart < 0 && frac > 0) + { + intpart ++; + frac -= 0x10000; + } + frac = frac * 100; + ltime = intpart << 24; + break; + } + case 5: + case 6: + { + readBigEndianStrict(payload, reinterpret_cast(&uintpart), 6); + intpart = uintpart - 0x800000000000L; + ltime = intpart; + frac = std::abs(intpart % (1L << 24)); + break; + } + default: + { + readBigEndianStrict(payload, reinterpret_cast(&uintpart), 3); + intpart = uintpart - 0x800000L; + ltime = intpart << 24; + break; + } + } + Int64 hh, mm, ss; + bool negative = false; + if (intpart == 0) + { + hh = 0; + mm = 0; + ss = 0; + } + else + { + if (ltime < 0) negative= true; + UInt64 ultime = std::abs(ltime); + intpart = ultime >> 24; + hh = (intpart >> 12) % (1 << 10); + mm = (intpart >> 6) % (1 << 6); + ss 
= intpart % (1 << 6); + } + + Int64 time_micro = 0; + time_micro = (hh * 3600 + mm * 60 + ss) * 1000000 + std::abs(frac); + if (negative) time_micro = - time_micro; + row.push_back(Field{Int64{time_micro}}); + break; + } case MYSQL_TYPE_DATETIME2: { Int64 val = 0; @@ -585,6 +680,14 @@ namespace MySQLReplication } break; } + case MYSQL_TYPE_SET: + { + UInt32 size = (meta & 0xff); + Bitmap bitmap1; + readBitmap(payload, bitmap1, size); + row.push_back(Field{UInt64{bitmap1.to_ulong()}}); + break; + } case MYSQL_TYPE_BIT: { UInt32 bits = ((meta >> 8) * 8) + (meta & 0xff); @@ -631,6 +734,7 @@ namespace MySQLReplication row.push_back(Field{String{val}}); break; } + case MYSQL_TYPE_GEOMETRY: case MYSQL_TYPE_BLOB: { UInt32 size = 0; diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 21beeecb05b..c59a2bf8361 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -75,7 +75,11 @@ class IColumn; M(UInt64, s3_max_single_read_retries, 4, "The maximum number of retries during single S3 read.", 0) \ M(UInt64, s3_max_redirects, 10, "Max number of S3 redirects hops allowed.", 0) \ M(UInt64, s3_max_connections, 1024, "The maximum number of connections per server.", 0) \ + M(Bool, s3_truncate_on_insert, false, "Enables or disables truncate before insert in s3 engine tables.", 0) \ + M(Bool, s3_create_new_file_on_insert, false, "Enables or disables creating a new file on each insert in s3 engine tables", 0) \ M(UInt64, hdfs_replication, 0, "The actual number of replications can be specified when the hdfs file is created.", 0) \ + M(Bool, hdfs_truncate_on_insert, false, "Enables or disables truncate before insert in s3 engine tables", 0) \ + M(Bool, hdfs_create_new_file_on_insert, false, "Enables or disables creating a new file on each insert in hdfs engine tables", 0) \ M(UInt64, hsts_max_age, 0, "Expired time for hsts. 0 means disable HSTS.", 0) \ M(Bool, extremes, false, "Calculate minimums and maximums of the result columns. They can be output in JSON-formats.", IMPORTANT) \ M(Bool, use_uncompressed_cache, false, "Whether to use the cache of uncompressed blocks.", 0) \ @@ -491,6 +495,7 @@ class IColumn; \ M(Bool, engine_file_empty_if_not_exists, false, "Allows to select data from a file engine table without file", 0) \ M(Bool, engine_file_truncate_on_insert, false, "Enables or disables truncate before insert in file engine tables", 0) \ + M(Bool, engine_file_allow_create_multiple_files, false, "Enables or disables creating a new file on each insert in file engine tables if format has suffix.", 0) \ M(Bool, allow_experimental_database_replicated, false, "Allow to create databases with Replicated engine", 0) \ M(UInt64, database_replicated_initial_query_timeout_sec, 300, "How long initial DDL query should wait for Replicated database to precess previous DDL queue entries", 0) \ M(UInt64, max_distributed_depth, 5, "Maximum distributed query depth", 0) \ @@ -550,7 +555,7 @@ class IColumn; /** Experimental functions */ \ M(Bool, allow_experimental_funnel_functions, false, "Enable experimental functions for funnel analysis.", 0) \ M(Bool, allow_experimental_nlp_functions, false, "Enable experimental functions for natural language processing.", 0) \ - + M(String, insert_deduplication_token, "", "If not empty, used for duplicate detection instead of data digest", 0) \ // End of COMMON_SETTINGS // Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS. 
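For MYSQL_TYPE_YEAR the row image carries a single byte holding the offset from 1900, which the handler above widens to a UInt16 field. A minimal sketch of that decoding step over a raw byte buffer (the payload reader and Field types are left out):

#include <cassert>
#include <cstdint>

// Decode a binlog YEAR value: one byte, offset from 1900.
// (The real code readStricts that byte into an Int16 before widening.)
uint16_t decodeMySQLYear(const unsigned char * payload)
{
    int16_t val = static_cast<int16_t>(payload[0]);
    return static_cast<uint16_t>(val + 1900);
}

int main()
{
    unsigned char byte = 121;                      // 1900 + 121
    assert(decodeMySQLYear(&byte) == 2021);
    return 0;
}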
diff --git a/src/DataTypes/DataTypeString.cpp b/src/DataTypes/DataTypeString.cpp index 7fa3a394be8..b52d2024204 100644 --- a/src/DataTypes/DataTypeString.cpp +++ b/src/DataTypes/DataTypeString.cpp @@ -92,5 +92,7 @@ void registerDataTypeString(DataTypeFactory & factory) factory.registerAlias("BINARY LARGE OBJECT", "String", DataTypeFactory::CaseInsensitive); factory.registerAlias("BINARY VARYING", "String", DataTypeFactory::CaseInsensitive); factory.registerAlias("VARBINARY", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("GEOMETRY", "String", DataTypeFactory::CaseInsensitive); //mysql + } } diff --git a/src/DataTypes/DataTypeTuple.cpp b/src/DataTypes/DataTypeTuple.cpp index ad6d4e2943b..a5e9868cf89 100644 --- a/src/DataTypes/DataTypeTuple.cpp +++ b/src/DataTypes/DataTypeTuple.cpp @@ -32,6 +32,7 @@ namespace ErrorCodes extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH; extern const int ILLEGAL_INDEX; + extern const int LOGICAL_ERROR; } @@ -156,8 +157,19 @@ MutableColumnPtr DataTypeTuple::createColumn() const MutableColumnPtr DataTypeTuple::createColumn(const ISerialization & serialization) const { - const auto & element_serializations = - assert_cast(serialization).getElementsSerializations(); + /// If we read subcolumn of nested Tuple, it may be wrapped to SerializationNamed + /// several times to allow to reconstruct the substream path name. + /// Here we don't need substream path name, so we drop first several wrapper serializations. + + const auto * current_serialization = &serialization; + while (const auto * serialization_named = typeid_cast(current_serialization)) + current_serialization = serialization_named->getNested().get(); + + const auto * serialization_tuple = typeid_cast(current_serialization); + if (!serialization_tuple) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected serialization to create column of type Tuple"); + + const auto & element_serializations = serialization_tuple->getElementsSerializations(); size_t size = elems.size(); assert(element_serializations.size() == size); diff --git a/src/DataTypes/DataTypesNumber.cpp b/src/DataTypes/DataTypesNumber.cpp index 0c9a410077f..d85f52a7e6b 100644 --- a/src/DataTypes/DataTypesNumber.cpp +++ b/src/DataTypes/DataTypesNumber.cpp @@ -86,7 +86,10 @@ void registerDataTypeNumbers(DataTypeFactory & factory) factory.registerAlias("INT UNSIGNED", "UInt32", DataTypeFactory::CaseInsensitive); factory.registerAlias("INTEGER UNSIGNED", "UInt32", DataTypeFactory::CaseInsensitive); factory.registerAlias("BIGINT UNSIGNED", "UInt64", DataTypeFactory::CaseInsensitive); - factory.registerAlias("BIT", "UInt64", DataTypeFactory::CaseInsensitive); + factory.registerAlias("BIT", "UInt64", DataTypeFactory::CaseInsensitive); /// MySQL + factory.registerAlias("SET", "UInt64", DataTypeFactory::CaseInsensitive); /// MySQL + factory.registerAlias("YEAR", "UInt16", DataTypeFactory::CaseInsensitive); + factory.registerAlias("TIME", "Int64", DataTypeFactory::CaseInsensitive); } } diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index 171ed5d1225..b8e0061b67e 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -528,6 +528,7 @@ inline bool isBool(const DataTypePtr & data_type) template constexpr bool IsDataTypeDecimal = false; template constexpr bool IsDataTypeNumber = false; template constexpr bool IsDataTypeDateOrDateTime = false; +template constexpr bool IsDataTypeEnum = false; template constexpr bool IsDataTypeDecimalOrNumber = 
IsDataTypeDecimal || IsDataTypeNumber; @@ -552,4 +553,9 @@ template <> inline constexpr bool IsDataTypeDateOrDateTime = tru template <> inline constexpr bool IsDataTypeDateOrDateTime = true; template <> inline constexpr bool IsDataTypeDateOrDateTime = true; +template +class DataTypeEnum; + +template inline constexpr bool IsDataTypeEnum> = true; + } diff --git a/src/DataTypes/Serializations/SerializationArray.cpp b/src/DataTypes/Serializations/SerializationArray.cpp index e3b535a2a11..30ee5e98b74 100644 --- a/src/DataTypes/Serializations/SerializationArray.cpp +++ b/src/DataTypes/Serializations/SerializationArray.cpp @@ -37,10 +37,11 @@ void SerializationArray::deserializeBinary(Field & field, ReadBuffer & istr) con { size_t size; readVarUInt(size, istr); - field = Array(size); + field = Array(); Array & arr = get(field); + arr.reserve(size); for (size_t i = 0; i < size; ++i) - nested->deserializeBinary(arr[i], istr); + nested->deserializeBinary(arr.emplace_back(), istr); } diff --git a/src/DataTypes/Serializations/SerializationInfo.cpp b/src/DataTypes/Serializations/SerializationInfo.cpp index 42d3d14b672..22df95fc8f7 100644 --- a/src/DataTypes/Serializations/SerializationInfo.cpp +++ b/src/DataTypes/Serializations/SerializationInfo.cpp @@ -158,6 +158,19 @@ void SerializationInfoByName::add(const SerializationInfoByName & other) } } +void SerializationInfoByName::replaceData(const SerializationInfoByName & other) +{ + for (const auto & [name, new_info] : other) + { + auto & old_info = (*this)[name]; + + if (old_info) + old_info->replaceData(*new_info); + else + old_info = new_info->clone(); + } +} + void SerializationInfoByName::writeJSON(WriteBuffer & out) const { Poco::JSON::Object object; diff --git a/src/DataTypes/Serializations/SerializationInfo.h b/src/DataTypes/Serializations/SerializationInfo.h index f7af5d77217..d83fc16f2f6 100644 --- a/src/DataTypes/Serializations/SerializationInfo.h +++ b/src/DataTypes/Serializations/SerializationInfo.h @@ -89,6 +89,11 @@ public: void add(const Block & block); void add(const SerializationInfoByName & other); + /// Takes data from @other, but keeps current serialization kinds. + /// If column exists in @other infos, but not in current infos, + /// it's cloned to current infos. 
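A stand-alone sketch of the replaceData() contract documented above: data is taken from the other map, serialization kinds already present on this side are kept, and entries missing on this side are cloned in. Info here is a hypothetical stand-in for SerializationInfo:

#include <cassert>
#include <cstddef>
#include <map>
#include <memory>
#include <string>

struct Info
{
    std::string kind;    // kept from the current side
    size_t num_rows = 0; // "data" taken from the other side

    void replaceData(const Info & other) { num_rows = other.num_rows; }
    std::shared_ptr<Info> clone() const { return std::make_shared<Info>(*this); }
};

using InfoByName = std::map<std::string, std::shared_ptr<Info>>;

void replaceData(InfoByName & lhs, const InfoByName & rhs)
{
    for (const auto & [name, new_info] : rhs)
    {
        auto & old_info = lhs[name];      // creates an empty slot if missing
        if (old_info)
            old_info->replaceData(*new_info);
        else
            old_info = new_info->clone();
    }
}

int main()
{
    InfoByName current, other;
    current["a"] = std::make_shared<Info>(Info{"Sparse", 10});
    other["a"] = std::make_shared<Info>(Info{"Default", 99});
    other["b"] = std::make_shared<Info>(Info{"Default", 5});

    replaceData(current, other);
    assert(current["a"]->kind == "Sparse" && current["a"]->num_rows == 99);   // kind kept, data replaced
    assert(current["b"]->num_rows == 5);                                      // missing entry cloned
    return 0;
}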
+ void replaceData(const SerializationInfoByName & other); + void writeJSON(WriteBuffer & out) const; void readJSON(ReadBuffer & in); }; diff --git a/src/DataTypes/Serializations/SerializationMap.cpp b/src/DataTypes/Serializations/SerializationMap.cpp index 3f17061a744..24d06d8f3b2 100644 --- a/src/DataTypes/Serializations/SerializationMap.cpp +++ b/src/DataTypes/Serializations/SerializationMap.cpp @@ -53,13 +53,15 @@ void SerializationMap::deserializeBinary(Field & field, ReadBuffer & istr) const { size_t size; readVarUInt(size, istr); - field = Map(size); - for (auto & elem : field.get()) + field = Map(); + Map & map = field.get(); + map.reserve(size); + for (size_t i = 0; i < size; ++i) { Tuple tuple(2); key->deserializeBinary(tuple[0], istr); value->deserializeBinary(tuple[1], istr); - elem = std::move(tuple); + map.push_back(std::move(tuple)); } } diff --git a/src/DataTypes/Serializations/SerializationNamed.h b/src/DataTypes/Serializations/SerializationNamed.h index 91db0cf67f4..343b96c16e3 100644 --- a/src/DataTypes/Serializations/SerializationNamed.h +++ b/src/DataTypes/Serializations/SerializationNamed.h @@ -5,6 +5,11 @@ namespace DB { +/// Serialization wrapper that acts like nested serialization, +/// but adds a passed name to the substream path like the +/// read column was the tuple element with this name. +/// It's used while reading subcolumns of complex types. +/// In particular while reading components of named tuples. class SerializationNamed final : public SerializationWrapper { private: diff --git a/src/DataTypes/Serializations/SerializationNumber.cpp b/src/DataTypes/Serializations/SerializationNumber.cpp index c5e2b31e043..4b6b79151bc 100644 --- a/src/DataTypes/Serializations/SerializationNumber.cpp +++ b/src/DataTypes/Serializations/SerializationNumber.cpp @@ -8,7 +8,6 @@ #include #include #include -#include #include namespace DB diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index cd5a6b65a3c..8dc15fc9841 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -1,4 +1,3 @@ -#include #include #include #include @@ -44,11 +43,11 @@ void SerializationTuple::deserializeBinary(Field & field, ReadBuffer & istr) con { const size_t size = elems.size(); - Tuple tuple(size); - for (const auto i : collections::range(0, size)) - elems[i]->deserializeBinary(tuple[i], istr); - - field = tuple; + field = Tuple(); + Tuple & tuple = get(field); + tuple.reserve(size); + for (size_t i = 0; i < size; ++i) + elems[i]->deserializeBinary(tuple.emplace_back(), istr); } void SerializationTuple::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const @@ -73,7 +72,7 @@ static void addElementSafe(size_t num_elems, IColumn & column, F && impl) // Check that all columns now have the same size. size_t new_size = column.size(); - for (auto i : collections::range(1, num_elems)) + for (size_t i = 1; i < num_elems; ++i) { const auto & element_column = extractElementColumn(column, i); if (element_column.size() != new_size) @@ -87,7 +86,7 @@ static void addElementSafe(size_t num_elems, IColumn & column, F && impl) } catch (...) 
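The Array/Map/Tuple deserializeBinary changes all follow one pattern: instead of pre-sizing the container with default-constructed Fields and filling by index, reserve once and append each element as it is read. A simplified sketch with std::vector<std::string> instead of Field and ReadBuffer:

#include <cassert>
#include <cstdint>
#include <sstream>
#include <string>
#include <vector>

// Read `size` strings from a stream, appending as we go instead of creating
// default-constructed placeholders first.
std::vector<std::string> deserializeArray(std::istream & in)
{
    uint64_t size = 0;
    in >> size;

    std::vector<std::string> result;
    result.reserve(size);                       // single allocation up front
    for (uint64_t i = 0; i < size; ++i)
        in >> result.emplace_back();            // construct in place, then read into it
    return result;
}

int main()
{
    std::istringstream in("3 foo bar baz");
    auto values = deserializeArray(in);
    assert(values.size() == 3 && values[2] == "baz");
    return 0;
}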
{ - for (const auto & i : collections::range(0, num_elems)) + for (size_t i = 0; i < num_elems; ++i) { auto & element_column = extractElementColumn(column, i); if (element_column.size() > old_size) @@ -102,7 +101,7 @@ void SerializationTuple::deserializeBinary(IColumn & column, ReadBuffer & istr) { addElementSafe(elems.size(), column, [&] { - for (const auto & i : collections::range(0, elems.size())) + for (size_t i = 0; i < elems.size(); ++i) elems[i]->deserializeBinary(extractElementColumn(column, i), istr); }); } @@ -110,7 +109,7 @@ void SerializationTuple::deserializeBinary(IColumn & column, ReadBuffer & istr) void SerializationTuple::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('(', ostr); - for (const auto i : collections::range(0, elems.size())) + for (size_t i = 0; i < elems.size(); ++i) { if (i != 0) writeChar(',', ostr); @@ -126,7 +125,7 @@ void SerializationTuple::deserializeText(IColumn & column, ReadBuffer & istr, co addElementSafe(elems.size(), column, [&] { - for (const auto i : collections::range(0, size)) + for (size_t i = 0; i < size; ++i) { skipWhitespaceIfAny(istr); if (i != 0) @@ -158,7 +157,7 @@ void SerializationTuple::serializeTextJSON(const IColumn & column, size_t row_nu && have_explicit_names) { writeChar('{', ostr); - for (const auto i : collections::range(0, elems.size())) + for (size_t i = 0; i < elems.size(); ++i) { if (i != 0) { @@ -173,7 +172,7 @@ void SerializationTuple::serializeTextJSON(const IColumn & column, size_t row_nu else { writeChar('[', ostr); - for (const auto i : collections::range(0, elems.size())) + for (size_t i = 0; i < elems.size(); ++i) { if (i != 0) writeChar(',', ostr); @@ -195,7 +194,7 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr addElementSafe(elems.size(), column, [&] { // Require all elements but in arbitrary order. 
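addElementSafe (above) keeps a tuple's element columns consistent: run the insert, check that all columns grew to the same size, and on any failure shrink every column back to its old size before rethrowing. A stand-alone sketch of that rollback pattern over plain vectors:

#include <cassert>
#include <cstddef>
#include <functional>
#include <stdexcept>
#include <vector>

using Column = std::vector<int>;

// Run `insert` over all element columns; if it throws or leaves the columns
// with different sizes, roll every column back to its previous size.
void addElementSafe(std::vector<Column> & columns, const std::function<void()> & insert)
{
    const size_t old_size = columns.front().size();
    try
    {
        insert();

        const size_t new_size = columns.front().size();
        for (size_t i = 1; i < columns.size(); ++i)
            if (columns[i].size() != new_size)
                throw std::runtime_error("Tuple element columns have different sizes");
    }
    catch (...)
    {
        for (auto & column : columns)
            if (column.size() > old_size)
                column.resize(old_size);       // analogue of popBack()
        throw;
    }
}

int main()
{
    std::vector<Column> columns(2);

    // A "row" that only manages to write the first element before failing.
    try
    {
        addElementSafe(columns, [&] { columns[0].push_back(1); });
    }
    catch (const std::runtime_error &) { /* expected */ }

    assert(columns[0].empty() && columns[1].empty());   // fully rolled back
    return 0;
}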
- for (auto i : collections::range(0, elems.size())) + for (size_t i = 0; i < elems.size(); ++i) { if (i > 0) { @@ -226,7 +225,7 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr addElementSafe(elems.size(), column, [&] { - for (const auto i : collections::range(0, size)) + for (size_t i = 0; i < size; ++i) { skipWhitespaceIfAny(istr); if (i != 0) @@ -246,7 +245,7 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr void SerializationTuple::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeCString("", ostr); - for (const auto i : collections::range(0, elems.size())) + for (size_t i = 0; i < elems.size(); ++i) { writeCString("", ostr); elems[i]->serializeTextXML(extractElementColumn(column, i), row_num, ostr, settings); @@ -257,7 +256,7 @@ void SerializationTuple::serializeTextXML(const IColumn & column, size_t row_num void SerializationTuple::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { - for (const auto i : collections::range(0, elems.size())) + for (size_t i = 0; i < elems.size(); ++i) { if (i != 0) writeChar(settings.csv.tuple_delimiter, ostr); @@ -270,7 +269,7 @@ void SerializationTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr, addElementSafe(elems.size(), column, [&] { const size_t size = elems.size(); - for (const auto i : collections::range(0, size)) + for (size_t i = 0; i < size; ++i) { if (i != 0) { @@ -362,7 +361,7 @@ void SerializationTuple::serializeBinaryBulkWithMultipleStreams( { auto * tuple_state = checkAndGetState(state); - for (const auto i : collections::range(0, elems.size())) + for (size_t i = 0; i < elems.size(); ++i) { const auto & element_col = extractElementColumn(column, i); elems[i]->serializeBinaryBulkWithMultipleStreams(element_col, offset, limit, settings, tuple_state->states[i]); @@ -382,7 +381,7 @@ void SerializationTuple::deserializeBinaryBulkWithMultipleStreams( auto & column_tuple = assert_cast(*mutable_column); settings.avg_value_size_hint = 0; - for (const auto i : collections::range(0, elems.size())) + for (size_t i = 0; i < elems.size(); ++i) elems[i]->deserializeBinaryBulkWithMultipleStreams(column_tuple.getColumnPtr(i), limit, settings, tuple_state->states[i], cache); } diff --git a/src/Databases/MySQL/MaterializedMySQLSyncThread.cpp b/src/Databases/MySQL/MaterializedMySQLSyncThread.cpp index ce5a3e9a947..9dbe611537b 100644 --- a/src/Databases/MySQL/MaterializedMySQLSyncThread.cpp +++ b/src/Databases/MySQL/MaterializedMySQLSyncThread.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -315,6 +316,47 @@ getTableOutput(const String & database_name, const String & table_name, ContextM return std::move(res.pipeline); } +static inline String reWriteMysqlQueryColumn(mysqlxx::Pool::Entry & connection, const String & database_name, const String & table_name, const Settings & global_settings) +{ + Block tables_columns_sample_block + { + { std::make_shared(), "column_name" }, + { std::make_shared(), "column_type" } + }; + + const String & query = "SELECT COLUMN_NAME AS column_name, COLUMN_TYPE AS column_type FROM INFORMATION_SCHEMA.COLUMNS" + " WHERE TABLE_SCHEMA = '" + backQuoteIfNeed(database_name) + + "' AND TABLE_NAME = '" + backQuoteIfNeed(table_name) + "' ORDER BY ORDINAL_POSITION"; + + StreamSettings mysql_input_stream_settings(global_settings, false, true); + auto mysql_source = 
std::make_unique(connection, query, tables_columns_sample_block, mysql_input_stream_settings); + + Block block; + WriteBufferFromOwnString query_columns; + QueryPipeline pipeline(std::move(mysql_source)); + PullingPipelineExecutor executor(pipeline); + while (executor.pull(block)) + { + const auto & column_name_col = *block.getByPosition(0).column; + const auto & column_type_col = *block.getByPosition(1).column; + size_t rows = block.rows(); + for (size_t i = 0; i < rows; ++i) + { + String column_name = column_name_col[i].safeGet(); + String column_type = column_type_col[i].safeGet(); + //we can do something special conversion to guarantee select results is the same as the binlog parse results + if (column_type.starts_with("set")) + { + query_columns << (backQuote(column_name) + " + 0"); + } else + query_columns << backQuote(column_name); + query_columns << ","; + } + } + String query_columns_str = query_columns.str(); + return query_columns_str.substr(0, query_columns_str.length() - 1); +} + static inline void dumpDataForTables( mysqlxx::Pool::Entry & connection, const std::unordered_map & need_dumping_tables, const String & query_prefix, const String & database_name, const String & mysql_database_name, @@ -334,9 +376,10 @@ static inline void dumpDataForTables( auto pipeline = getTableOutput(database_name, table_name, query_context); StreamSettings mysql_input_stream_settings(context->getSettingsRef()); - auto input = std::make_unique( - connection, "SELECT * FROM " + backQuoteIfNeed(mysql_database_name) + "." + backQuoteIfNeed(table_name), - pipeline.getHeader(), mysql_input_stream_settings); + String mysql_select_all_query = "SELECT " + reWriteMysqlQueryColumn(connection, mysql_database_name, table_name, context->getSettings()) + " FROM " + + backQuoteIfNeed(mysql_database_name) + "." 
+ backQuoteIfNeed(table_name); + LOG_INFO(&Poco::Logger::get("MaterializedMySQLSyncThread(" + database_name + ")"), "mysql_select_all_query is {}", mysql_select_all_query); + auto input = std::make_unique(connection, mysql_select_all_query, pipeline.getHeader(), mysql_input_stream_settings); auto counting = std::make_shared(pipeline.getHeader()); Pipe pipe(std::move(input)); pipe.addTransform(counting); diff --git a/src/Dictionaries/CMakeLists.txt b/src/Dictionaries/CMakeLists.txt index 31b1ac67304..19e82c45cc2 100644 --- a/src/Dictionaries/CMakeLists.txt +++ b/src/Dictionaries/CMakeLists.txt @@ -33,9 +33,7 @@ target_link_libraries(clickhouse_dictionaries string_utils ) -target_link_libraries(clickhouse_dictionaries - PUBLIC - abseil_swiss_tables) +target_link_libraries(clickhouse_dictionaries PUBLIC ch_contrib::abseil_swiss_tables) if (TARGET ch_contrib::cassandra) target_link_libraries(clickhouse_dictionaries PRIVATE ch_contrib::cassandra) diff --git a/src/Dictionaries/CacheDictionary.cpp b/src/Dictionaries/CacheDictionary.cpp index 4b242ee7fd9..cad3e3b8799 100644 --- a/src/Dictionaries/CacheDictionary.cpp +++ b/src/Dictionaries/CacheDictionary.cpp @@ -271,7 +271,6 @@ ColumnUInt8::Ptr CacheDictionary::hasKeys(const Columns & k if (dictionary_key_type == DictionaryKeyType::Complex) dict_struct.validateKeyTypes(key_types); - DictionaryKeysArenaHolder arena_holder; DictionaryKeysExtractor extractor(key_columns, arena_holder.getComplexKeyArena()); const auto keys = extractor.extractAllKeys(); diff --git a/src/Dictionaries/CacheDictionaryStorage.h b/src/Dictionaries/CacheDictionaryStorage.h index 5fd1bd420c6..d6d04075a3d 100644 --- a/src/Dictionaries/CacheDictionaryStorage.h +++ b/src/Dictionaries/CacheDictionaryStorage.h @@ -553,6 +553,7 @@ private: ContainerType, ContainerType, ContainerType, + ContainerType, ContainerType, ContainerType, ContainerType, diff --git a/src/Dictionaries/DictionarySource.cpp b/src/Dictionaries/DictionarySource.cpp index d3058db87f4..526d9fc85cd 100644 --- a/src/Dictionaries/DictionarySource.cpp +++ b/src/Dictionaries/DictionarySource.cpp @@ -60,8 +60,8 @@ private: const auto & attributes_types_to_read = coordinator->getAttributesTypesToRead(); const auto & attributes_default_values_columns = coordinator->getAttributesDefaultValuesColumns(); - const auto & dictionary = coordinator->getDictionary(); - auto attributes_columns = dictionary->getColumns( + const auto & read_columns_func = coordinator->getReadColumnsFunc(); + auto attributes_columns = read_columns_func( attributes_names_to_read, attributes_types_to_read, key_columns, diff --git a/src/Dictionaries/DictionarySource.h b/src/Dictionaries/DictionarySource.h index 0237e1338df..7809c958419 100644 --- a/src/Dictionaries/DictionarySource.h +++ b/src/Dictionaries/DictionarySource.h @@ -19,6 +19,8 @@ class DictionarySourceCoordinator final : public shared_ptr_helper; + Pipe read(size_t num_streams); private: @@ -31,6 +33,15 @@ private: : dictionary(std::move(dictionary_)) , key_columns_with_type(std::move(key_columns_with_type_)) , max_block_size(max_block_size_) + , read_columns_func([this]( + const Strings & attribute_names, + const DataTypes & result_types, + const Columns & key_columns, + const DataTypes & key_types, + const Columns & default_values_columns) + { + return dictionary->getColumns(attribute_names, result_types, key_columns, key_types, default_values_columns); + }) { initialize(column_names); } @@ -45,6 +56,31 @@ private: , key_columns_with_type(std::move(key_columns_with_type_)) , 
data_columns_with_type(std::move(data_columns_with_type_)) , max_block_size(max_block_size_) + , read_columns_func([this]( + const Strings & attribute_names, + const DataTypes & result_types, + const Columns & key_columns, + const DataTypes & key_types, + const Columns & default_values_columns) + { + return dictionary->getColumns(attribute_names, result_types, key_columns, key_types, default_values_columns); + }) + { + initialize(column_names); + } + + explicit DictionarySourceCoordinator( + std::shared_ptr dictionary_, + const Names & column_names, + ColumnsWithTypeAndName && key_columns_with_type_, + ColumnsWithTypeAndName && data_columns_with_type_, + size_t max_block_size_, + ReadColumnsFunc read_columns_func_) + : dictionary(std::move(dictionary_)) + , key_columns_with_type(std::move(key_columns_with_type_)) + , data_columns_with_type(std::move(data_columns_with_type_)) + , max_block_size(max_block_size_) + , read_columns_func(std::move(read_columns_func_)) { initialize(column_names); } @@ -61,6 +97,8 @@ private: const std::vector & getAttributesDefaultValuesColumns() const { return attributes_default_values_columns; } + const ReadColumnsFunc & getReadColumnsFunc() const { return read_columns_func; } + const std::shared_ptr & getDictionary() const { return dictionary; } void initialize(const Names & column_names); @@ -79,6 +117,8 @@ private: std::vector attributes_default_values_columns; const size_t max_block_size; + ReadColumnsFunc read_columns_func; + std::atomic parallel_read_block_index = 0; }; diff --git a/src/Dictionaries/DictionaryStructure.cpp b/src/Dictionaries/DictionaryStructure.cpp index c16ed139023..3e29f3efe76 100644 --- a/src/Dictionaries/DictionaryStructure.cpp +++ b/src/Dictionaries/DictionaryStructure.cpp @@ -50,7 +50,6 @@ std::optional tryGetAttributeUnderlyingType(TypeIndex i case TypeIndex::Date: return AttributeUnderlyingType::UInt16; case TypeIndex::Date32: return AttributeUnderlyingType::Int32; case TypeIndex::DateTime: return AttributeUnderlyingType::UInt32; - case TypeIndex::DateTime64: return AttributeUnderlyingType::Int64; default: break; } @@ -383,7 +382,8 @@ std::vector DictionaryStructure::getAttributes( void DictionaryStructure::parseRangeConfiguration(const Poco::Util::AbstractConfiguration & config, const std::string & structure_prefix) { - const char * range_default_type = "Date"; + static constexpr auto range_default_type = "Date"; + if (config.has(structure_prefix + ".range_min")) range_min.emplace(makeDictionaryTypedSpecialAttribute(config, structure_prefix + ".range_min", range_default_type)); @@ -396,7 +396,10 @@ void DictionaryStructure::parseRangeConfiguration(const Poco::Util::AbstractConf "Dictionary structure should have both 'range_min' and 'range_max' either specified or not."); } - if (range_min && range_max && !range_min->type->equals(*range_max->type)) + if (!range_min) + return; + + if (!range_min->type->equals(*range_max->type)) { throw Exception(ErrorCodes::BAD_ARGUMENTS, "Dictionary structure 'range_min' and 'range_max' should have same type, " @@ -406,15 +409,20 @@ void DictionaryStructure::parseRangeConfiguration(const Poco::Util::AbstractConf range_max->type->getName()); } - if (range_min && !range_min->type->isValueRepresentedByInteger()) + WhichDataType range_type(range_min->type); + + bool valid_range = range_type.isInt() || range_type.isUInt() || range_type.isDecimal() || range_type.isFloat() || range_type.isEnum() + || range_type.isDate() || range_type.isDate32() || range_type.isDateTime() || range_type.isDateTime64(); + + 
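The coordinator change replaces a direct dictionary->getColumns() call with a stored callback: the default still forwards to the dictionary, while the new constructor overload lets callers inject a different read path. A reduced sketch of that pattern (the names below are illustrative, not the real coordinator API):

#include <cassert>
#include <functional>
#include <memory>
#include <string>
#include <vector>

struct Dictionary
{
    std::vector<std::string> getColumns(const std::vector<std::string> & names) const
    {
        return names;   // trivially echo the requested attribute names
    }
};

class Coordinator
{
public:
    using ReadColumnsFunc = std::function<std::vector<std::string>(const std::vector<std::string> &)>;

    // Default behaviour: read through the dictionary, as before.
    explicit Coordinator(std::shared_ptr<Dictionary> dictionary_)
        : dictionary(std::move(dictionary_))
        , read_columns_func([this](const std::vector<std::string> & names)
          {
              return dictionary->getColumns(names);
          })
    {
    }

    // New overload: the caller supplies its own read path.
    Coordinator(std::shared_ptr<Dictionary> dictionary_, ReadColumnsFunc custom)
        : dictionary(std::move(dictionary_))
        , read_columns_func(std::move(custom))
    {
    }

    // The default callback captures `this`, so the coordinator must stay put.
    Coordinator(const Coordinator &) = delete;
    Coordinator & operator=(const Coordinator &) = delete;

    std::vector<std::string> read(const std::vector<std::string> & names) const
    {
        return read_columns_func(names);
    }

private:
    std::shared_ptr<Dictionary> dictionary;
    ReadColumnsFunc read_columns_func;
};

int main()
{
    auto dict = std::make_shared<Dictionary>();

    Coordinator plain(dict);
    assert(plain.read({"value"}).at(0) == "value");

    Coordinator custom(dict, [](const std::vector<std::string> &)
    {
        return std::vector<std::string>{"overridden"};
    });
    assert(custom.read({"value"}).at(0) == "overridden");
    return 0;
}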
if (!valid_range) { throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Dictionary structure type of 'range_min' and 'range_max' should be an integer, Date, DateTime, or Enum." + "Dictionary structure type of 'range_min' and 'range_max' should be an Integer, Float, Decimal, Date, Date32, DateTime DateTime64, or Enum." " Actual 'range_min' and 'range_max' type is {}", range_min->type->getName()); } - if ((range_min && !range_min->expression.empty()) || (range_max && !range_max->expression.empty())) + if (!range_min->expression.empty() || !range_max->expression.empty()) has_expressions = true; } diff --git a/src/Dictionaries/DictionaryStructure.h b/src/Dictionaries/DictionaryStructure.h index 3a51fc688ec..9014b09b072 100644 --- a/src/Dictionaries/DictionaryStructure.h +++ b/src/Dictionaries/DictionaryStructure.h @@ -37,6 +37,7 @@ enum class AttributeUnderlyingType : TypeIndexUnderlying map_item(UInt8), map_item(UInt16), map_item(UInt32), map_item(UInt64), map_item(UInt128), map_item(UInt256), map_item(Float32), map_item(Float64), map_item(Decimal32), map_item(Decimal64), map_item(Decimal128), map_item(Decimal256), + map_item(DateTime64), map_item(UUID), map_item(String), map_item(Array) }; diff --git a/src/Dictionaries/FlatDictionary.cpp b/src/Dictionaries/FlatDictionary.cpp index 9bf6bf97c8d..40cc735557c 100644 --- a/src/Dictionaries/FlatDictionary.cpp +++ b/src/Dictionaries/FlatDictionary.cpp @@ -291,30 +291,52 @@ void FlatDictionary::blockToAttributes(const Block & block) DictionaryKeysArenaHolder arena_holder; DictionaryKeysExtractor keys_extractor({ keys_column }, arena_holder.getComplexKeyArena()); - auto keys = keys_extractor.extractAllKeys(); + size_t keys_size = keys_extractor.getKeysSize(); - HashSet already_processed_keys; + static constexpr size_t key_offset = 1; - size_t key_offset = 1; - for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index) + size_t attributes_size = attributes.size(); + + if (unlikely(attributes_size == 0)) + { + for (size_t i = 0; i < keys_size; ++i) + { + auto key = keys_extractor.extractCurrentKey(); + + if (unlikely(key >= configuration.max_array_size)) + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, + "{}: identifier should be less than {}", + getFullName(), + toString(configuration.max_array_size)); + + if (key >= loaded_keys.size()) + { + const size_t elements_count = key + 1; + loaded_keys.resize(elements_count, false); + } + + loaded_keys[key] = true; + + keys_extractor.rollbackCurrentKey(); + } + + return; + } + + for (size_t attribute_index = 0; attribute_index < attributes_size; ++attribute_index) { const IColumn & attribute_column = *block.safeGetByPosition(attribute_index + key_offset).column; Attribute & attribute = attributes[attribute_index]; - for (size_t i = 0; i < keys.size(); ++i) + for (size_t i = 0; i < keys_size; ++i) { - auto key = keys[i]; - - if (already_processed_keys.find(key) != nullptr) - continue; - - already_processed_keys.insert(key); + auto key = keys_extractor.extractCurrentKey(); setAttributeValue(attribute, key, attribute_column[i]); - ++element_count; + keys_extractor.rollbackCurrentKey(); } - already_processed_keys.clear(); + keys_extractor.reset(); } } @@ -369,6 +391,12 @@ void FlatDictionary::loadData() else updateData(); + element_count = 0; + + size_t loaded_keys_size = loaded_keys.size(); + for (size_t i = 0; i < loaded_keys_size; ++i) + element_count += loaded_keys[i]; + if (configuration.require_nonempty && 0 == element_count) throw Exception(ErrorCodes::DICTIONARY_IS_EMPTY, "{}: 
dictionary source is empty and 'require_nonempty' property is set.", getFullName()); } @@ -495,21 +523,6 @@ void FlatDictionary::resize(Attribute & attribute, UInt64 key) } } -template -void FlatDictionary::setAttributeValueImpl(Attribute & attribute, UInt64 key, const T & value) -{ - auto & array = std::get>(attribute.container); - array[key] = value; - loaded_keys[key] = true; -} - -template <> -void FlatDictionary::setAttributeValueImpl(Attribute & attribute, UInt64 key, const String & value) -{ - auto arena_value = copyStringInArena(string_arena, value); - setAttributeValueImpl(attribute, key, arena_value); -} - void FlatDictionary::setAttributeValue(Attribute & attribute, const UInt64 key, const Field & value) { auto type_call = [&](const auto & dictionary_attribute_type) @@ -520,17 +533,27 @@ void FlatDictionary::setAttributeValue(Attribute & attribute, const UInt64 key, resize(attribute, key); - if (attribute.is_nullable_set) + if (attribute.is_nullable_set && value.isNull()) { - if (value.isNull()) - { - attribute.is_nullable_set->insert(key); - loaded_keys[key] = true; - return; - } + attribute.is_nullable_set->insert(key); + loaded_keys[key] = true; + return; } - setAttributeValueImpl(attribute, key, value.get()); + auto & attribute_value = value.get(); + + auto & container = std::get>(attribute.container); + loaded_keys[key] = true; + + if constexpr (std::is_same_v) + { + auto arena_value = copyStringInArena(string_arena, attribute_value); + container[key] = arena_value; + } + else + { + container[key] = attribute_value; + } }; callOnDictionaryAttributeType(attribute.type, type_call); diff --git a/src/Dictionaries/FlatDictionary.h b/src/Dictionaries/FlatDictionary.h index e8f40ea1d66..2578fef3ecb 100644 --- a/src/Dictionaries/FlatDictionary.h +++ b/src/Dictionaries/FlatDictionary.h @@ -127,6 +127,7 @@ private: ContainerType, ContainerType, ContainerType, + ContainerType, ContainerType, ContainerType, ContainerType, @@ -154,9 +155,6 @@ private: template void resize(Attribute & attribute, UInt64 key); - template - void setAttributeValueImpl(Attribute & attribute, UInt64 key, const T & value); - void setAttributeValue(Attribute & attribute, UInt64 key, const Field & value); const DictionaryStructure dict_struct; diff --git a/src/Dictionaries/HashedArrayDictionary.cpp b/src/Dictionaries/HashedArrayDictionary.cpp index 55a3adc32ae..e35340c7618 100644 --- a/src/Dictionaries/HashedArrayDictionary.cpp +++ b/src/Dictionaries/HashedArrayDictionary.cpp @@ -158,12 +158,6 @@ ColumnUInt8::Ptr HashedArrayDictionary::hasKeys(const Colum auto result = ColumnUInt8::create(keys_size, false); auto & out = result->getData(); - if (attributes.empty()) - { - query_count.fetch_add(keys_size, std::memory_order_relaxed); - return result; - } - size_t keys_found = 0; for (size_t requested_key_index = 0; requested_key_index < keys_size; ++requested_key_index) diff --git a/src/Dictionaries/HashedArrayDictionary.h b/src/Dictionaries/HashedArrayDictionary.h index 80436a3d044..a649fddcc39 100644 --- a/src/Dictionaries/HashedArrayDictionary.h +++ b/src/Dictionaries/HashedArrayDictionary.h @@ -147,6 +147,7 @@ private: AttributeContainerType, AttributeContainerType, AttributeContainerType, + AttributeContainerType, AttributeContainerType, AttributeContainerType, AttributeContainerType, diff --git a/src/Dictionaries/HashedDictionary.cpp b/src/Dictionaries/HashedDictionary.cpp index 8417455087e..c83735a6330 100644 --- a/src/Dictionaries/HashedDictionary.cpp +++ b/src/Dictionaries/HashedDictionary.cpp @@ -177,15 
+177,25 @@ ColumnUInt8::Ptr HashedDictionary::hasKeys(const Co auto result = ColumnUInt8::create(keys_size, false); auto & out = result->getData(); - if (attributes.empty()) + size_t keys_found = 0; + + if (unlikely(attributes.empty())) { + for (size_t requested_key_index = 0; requested_key_index < keys_size; ++requested_key_index) + { + auto requested_key = extractor.extractCurrentKey(); + out[requested_key_index] = no_attributes_container.find(requested_key) != no_attributes_container.end(); + keys_found += out[requested_key_index]; + extractor.rollbackCurrentKey(); + } + query_count.fetch_add(keys_size, std::memory_order_relaxed); + found_count.fetch_add(keys_found, std::memory_order_relaxed); return result; } const auto & attribute = attributes.front(); bool is_attribute_nullable = attribute.is_nullable_set.has_value(); - size_t keys_found = 0; getAttributeContainer(0, [&](const auto & container) { @@ -423,7 +433,25 @@ void HashedDictionary::blockToAttributes(const Bloc Field column_value_to_insert; - for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index) + size_t attributes_size = attributes.size(); + + if (unlikely(attributes_size == 0)) + { + for (size_t key_index = 0; key_index < keys_size; ++key_index) + { + auto key = keys_extractor.extractCurrentKey(); + + if constexpr (std::is_same_v) + key = copyStringInArena(string_arena, key); + + no_attributes_container.insert(key); + keys_extractor.rollbackCurrentKey(); + } + + return; + } + + for (size_t attribute_index = 0; attribute_index < attributes_size; ++attribute_index) { const IColumn & attribute_column = *block.safeGetByPosition(skip_keys_size_offset + attribute_index).column; auto & attribute = attributes[attribute_index]; @@ -487,7 +515,21 @@ void HashedDictionary::resize(size_t added_rows) if (unlikely(!added_rows)) return; - for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index) + size_t attributes_size = attributes.size(); + + if (unlikely(attributes_size == 0)) + { + size_t reserve_size = added_rows + no_attributes_container.size(); + + if constexpr (sparse) + no_attributes_container.resize(reserve_size); + else + no_attributes_container.reserve(reserve_size); + + return; + } + + for (size_t attribute_index = 0; attribute_index < attributes_size; ++attribute_index) { getAttributeContainer(attribute_index, [added_rows](auto & attribute_map) { @@ -570,7 +612,9 @@ void HashedDictionary::loadData() } } else + { resize(block.rows()); + } blockToAttributes(block); } @@ -589,9 +633,10 @@ void HashedDictionary::loadData() template void HashedDictionary::calculateBytesAllocated() { - bytes_allocated += attributes.size() * sizeof(attributes.front()); + size_t attributes_size = attributes.size(); + bytes_allocated += attributes_size * sizeof(attributes.front()); - for (size_t i = 0; i < attributes.size(); ++i) + for (size_t i = 0; i < attributes_size; ++i) { getAttributeContainer(i, [&](const auto & container) { @@ -622,6 +667,22 @@ void HashedDictionary::calculateBytesAllocated() bytes_allocated = attributes[i].is_nullable_set->getBufferSizeInBytes(); } + if (unlikely(attributes_size == 0)) + { + bytes_allocated += sizeof(no_attributes_container); + + if constexpr (sparse) + { + bytes_allocated += no_attributes_container.size() * (sizeof(KeyType)); + bucket_count = no_attributes_container.bucket_count(); + } + else + { + bytes_allocated += no_attributes_container.getBufferSizeInBytes(); + bucket_count = no_attributes_container.getBufferSizeInCells(); + } + } + 
bytes_allocated += string_arena.size(); if (update_field_loaded_block) @@ -657,6 +718,18 @@ Pipe HashedDictionary::read(const Names & column_na } }); } + else + { + keys.reserve(no_attributes_container.size()); + + for (const auto & key : no_attributes_container) + { + if constexpr (sparse) + keys.emplace_back(key); + else + keys.emplace_back(key.getKey()); + } + } ColumnsWithTypeAndName key_columns; diff --git a/src/Dictionaries/HashedDictionary.h b/src/Dictionaries/HashedDictionary.h index c1761944b14..1ef1c58b67c 100644 --- a/src/Dictionaries/HashedDictionary.h +++ b/src/Dictionaries/HashedDictionary.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -120,9 +121,14 @@ private: template using CollectionTypeNonSparse = std::conditional_t< dictionary_key_type == DictionaryKeyType::Simple, - HashMap, + HashMap>, HashMapWithSavedHash>>; + using NoAttributesCollectionTypeNonSparse = std::conditional_t< + dictionary_key_type == DictionaryKeyType::Simple, + HashSet>, + HashSetWithSavedHash>>; + /// Here we use sparse_hash_map with DefaultHash<> for the following reasons: /// /// - DefaultHash<> is used for HashMap @@ -140,9 +146,13 @@ private: google::sparse_hash_map>, google::sparse_hash_map>>; + using NoAttributesCollectionTypeSparse = google::sparse_hash_set>; + template using CollectionType = std::conditional_t, CollectionTypeNonSparse>; + using NoAttributesCollectionType = std::conditional_t; + using NullableSet = HashSet>; struct Attribute final @@ -167,6 +177,7 @@ private: CollectionType, CollectionType, CollectionType, + CollectionType, CollectionType, CollectionType, CollectionType, @@ -214,6 +225,7 @@ private: BlockPtr update_field_loaded_block; Arena string_arena; + NoAttributesCollectionType no_attributes_container; }; extern template class HashedDictionary; diff --git a/src/Dictionaries/IPAddressDictionary.h b/src/Dictionaries/IPAddressDictionary.h index 33a9989a9e5..8dddc988caa 100644 --- a/src/Dictionaries/IPAddressDictionary.h +++ b/src/Dictionaries/IPAddressDictionary.h @@ -114,6 +114,7 @@ private: Decimal64, Decimal128, Decimal256, + DateTime64, Float32, Float64, UUID, @@ -137,6 +138,7 @@ private: ContainerType, ContainerType, ContainerType, + ContainerType, ContainerType, ContainerType, ContainerType, diff --git a/src/Dictionaries/RangeHashedDictionary.cpp b/src/Dictionaries/RangeHashedDictionary.cpp index e5c08b52881..14c8fc7c749 100644 --- a/src/Dictionaries/RangeHashedDictionary.cpp +++ b/src/Dictionaries/RangeHashedDictionary.cpp @@ -1,57 +1,22 @@ #include -#include -#include -#include +#include #include +#include #include #include #include +#include + +#include + +#include +#include + #include #include -namespace -{ - -using RangeStorageType = DB::RangeStorageType; - -// Null values mean that specified boundary, either min or max is not set on range. -// To simplify comparison, null value of min bound should be bigger than any other value, -// and null value of maxbound - less than any value. -const RangeStorageType RANGE_MIN_NULL_VALUE = std::numeric_limits::max(); -const RangeStorageType RANGE_MAX_NULL_VALUE = std::numeric_limits::lowest(); - -bool isCorrectDate(const RangeStorageType & date) -{ - return 0 < date && date <= DATE_LUT_MAX_DAY_NUM; -} - -// Handle both kinds of null values: explicit nulls of NullableColumn and 'implicit' nulls of Date type. 
-RangeStorageType getColumnIntValueOrDefault(const DB::IColumn & column, size_t index, bool isDate, const RangeStorageType & default_value) -{ - if (column.isNullAt(index)) - return default_value; - - const RangeStorageType result = static_cast(column.getInt(index)); - if (isDate && !isCorrectDate(result)) - return default_value; - - return result; -} - -const DB::IColumn & unwrapNullableColumn(const DB::IColumn & column) -{ - if (const auto * m = DB::checkAndGetColumn(&column)) - { - return m->getNestedColumn(); - } - - return column; -} - -} - namespace DB { namespace ErrorCodes @@ -60,22 +25,53 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; extern const int DICTIONARY_IS_EMPTY; extern const int UNSUPPORTED_METHOD; + extern const int TYPE_MISMATCH; } +namespace +{ + template + void callOnRangeType(const DataTypePtr & range_type, F && func) + { + auto call = [&](const auto & types) + { + using Types = std::decay_t; + using DataType = typename Types::LeftType; + + if constexpr (IsDataTypeDecimalOrNumber || IsDataTypeDateOrDateTime || IsDataTypeEnum) + { + using ColumnType = typename DataType::ColumnType; + func(TypePair()); + return true; + } + + return false; + }; + + auto type_index = range_type->getTypeId(); + if (!callOnIndexAndDataType(type_index, call)) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Dictionary structure type of 'range_min' and 'range_max' should be an Integer, Float, Decimal, Date, Date32, DateTime DateTime64, or Enum." + " Actual 'range_min' and 'range_max' type is {}", + range_type->getName()); + } + } +} template RangeHashedDictionary::RangeHashedDictionary( const StorageID & dict_id_, const DictionaryStructure & dict_struct_, DictionarySourcePtr source_ptr_, - const DictionaryLifetime dict_lifetime_, - bool require_nonempty_, + DictionaryLifetime dict_lifetime_, + RangeHashedDictionaryConfiguration configuration_, BlockPtr update_field_loaded_block_) : IDictionary(dict_id_) , dict_struct(dict_struct_) - , source_ptr{std::move(source_ptr_)} + , source_ptr(std::move(source_ptr_)) , dict_lifetime(dict_lifetime_) - , require_nonempty(require_nonempty_) + , configuration(configuration_) , update_field_loaded_block(std::move(update_field_loaded_block_)) { createAttributes(); @@ -104,15 +100,14 @@ ColumnPtr RangeHashedDictionary::getColumn( const size_t attribute_index = dict_struct.attribute_name_to_index.find(attribute_name)->second; const auto & attribute = attributes[attribute_index]; - /// Cast second column to storage type + /// Cast range column to storage type Columns modified_key_columns = key_columns; auto range_storage_column = key_columns.back(); ColumnWithTypeAndName column_to_cast = {range_storage_column->convertToFullColumnIfConst(), key_types.back(), ""}; - auto range_column_storage_type = std::make_shared(); - modified_key_columns.back() = castColumnAccurate(column_to_cast, range_column_storage_type); + modified_key_columns.back() = castColumnAccurate(column_to_cast, dict_struct.range_min->type); size_t keys_size = key_columns.front()->size(); - bool is_attribute_nullable = attribute.is_nullable; + bool is_attribute_nullable = attribute.is_value_nullable.has_value(); ColumnUInt8::MutablePtr col_null_map_to; ColumnUInt8::Container * vec_null_map_to = nullptr; @@ -122,7 +117,7 @@ ColumnPtr RangeHashedDictionary::getColumn( vec_null_map_to = &col_null_map_to->getData(); } - auto type_call = [&](const auto &dictionary_attribute_type) + auto type_call = [&](const auto & dictionary_attribute_type) { using Type = std::decay_t; using AttributeType 
= typename Type::AttributeType; @@ -206,6 +201,106 @@ ColumnPtr RangeHashedDictionary::getColumn( return result; } +template +ColumnPtr RangeHashedDictionary::getColumnInternal( + const std::string & attribute_name, + const DataTypePtr & result_type, + const PaddedPODArray & key_to_index) const +{ + ColumnPtr result; + + const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type); + const size_t attribute_index = dict_struct.attribute_name_to_index.find(attribute_name)->second; + const auto & attribute = attributes[attribute_index]; + + size_t keys_size = key_to_index.size(); + bool is_attribute_nullable = attribute.is_value_nullable.has_value(); + + ColumnUInt8::MutablePtr col_null_map_to; + ColumnUInt8::Container * vec_null_map_to = nullptr; + if (is_attribute_nullable) + { + col_null_map_to = ColumnUInt8::create(keys_size, false); + vec_null_map_to = &col_null_map_to->getData(); + } + + auto type_call = [&](const auto & dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; + using ColumnProvider = DictionaryAttributeColumnProvider; + + auto column = ColumnProvider::getColumn(dictionary_attribute, keys_size); + + if constexpr (std::is_same_v) + { + auto * out = column.get(); + + getItemsInternalImpl( + attribute, + key_to_index, + [&](size_t, const Array & value, bool) + { + out->insert(value); + }); + } + else if constexpr (std::is_same_v) + { + auto * out = column.get(); + + if (is_attribute_nullable) + getItemsInternalImpl( + attribute, + key_to_index, + [&](size_t row, const StringRef value, bool is_null) + { + (*vec_null_map_to)[row] = is_null; + out->insertData(value.data, value.size); + }); + else + getItemsInternalImpl( + attribute, + key_to_index, + [&](size_t, const StringRef value, bool) + { + out->insertData(value.data, value.size); + }); + } + else + { + auto & out = column->getData(); + + if (is_attribute_nullable) + getItemsInternalImpl( + attribute, + key_to_index, + [&](size_t row, const auto value, bool is_null) + { + (*vec_null_map_to)[row] = is_null; + out[row] = value; + }); + else + getItemsInternalImpl( + attribute, + key_to_index, + [&](size_t row, const auto value, bool) + { + out[row] = value; + }); + } + + result = std::move(column); + }; + + callOnDictionaryAttributeType(attribute.type, type_call); + + if (is_attribute_nullable) + result = ColumnNullable::create(std::move(result), std::move(col_null_map_to)); + + return result; +} + template ColumnUInt8::Ptr RangeHashedDictionary::hasKeys(const Columns & key_columns, const DataTypes & key_types) const { @@ -216,41 +311,45 @@ ColumnUInt8::Ptr RangeHashedDictionary::hasKeys(const Colum dict_struct.validateKeyTypes(key_types_copy); } - auto range_column_storage_type = std::make_shared(); + /// Cast range column to storage type auto range_storage_column = key_columns.back(); ColumnWithTypeAndName column_to_cast = {range_storage_column->convertToFullColumnIfConst(), key_types.back(), ""}; - auto range_column_updated = castColumnAccurate(column_to_cast, range_column_storage_type); - PaddedPODArray range_backup_storage; - const PaddedPODArray & dates = getColumnVectorData(this, range_column_updated, range_backup_storage); - + auto range_column_updated = castColumnAccurate(column_to_cast, dict_struct.range_min->type); auto key_columns_copy = key_columns; key_columns_copy.pop_back(); + DictionaryKeysArenaHolder arena_holder; DictionaryKeysExtractor keys_extractor(key_columns_copy, 
arena_holder.getComplexKeyArena()); const size_t keys_size = keys_extractor.getKeysSize(); - const auto & attribute = attributes.front(); - auto result = ColumnUInt8::create(keys_size); auto & out = result->getData(); size_t keys_found = 0; - auto type_call = [&](const auto & dictionary_attribute_type) + callOnRangeType(dict_struct.range_min->type, [&](const auto & types) { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - using ValueType = DictionaryValueType; + using Types = std::decay_t; + using RangeColumnType = typename Types::LeftType; + using RangeStorageType = typename RangeColumnType::ValueType; - const auto & collection = std::get>(attribute.maps); + const auto * range_column_typed = typeid_cast(range_column_updated.get()); + if (!range_column_typed) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Dictionary {} range column type should be equal to {}", + getFullName(), + dict_struct.range_min->type->getName()); + const auto & range_column_data = range_column_typed->getData(); + + const auto & key_attribute_container = std::get>(key_attribute.container); for (size_t key_index = 0; key_index < keys_size; ++key_index) { const auto key = keys_extractor.extractCurrentKey(); - const auto it = collection.find(key); + const auto it = key_attribute_container.find(key); if (it) { - const auto date = dates[key_index]; + const auto date = range_column_data[key_index]; const auto & interval_tree = it->getMapped(); out[key_index] = interval_tree.has(date); keys_found += out[key_index]; @@ -262,9 +361,7 @@ ColumnUInt8::Ptr RangeHashedDictionary::hasKeys(const Colum keys_extractor.rollbackCurrentKey(); } - }; - - callOnDictionaryAttributeType(attribute.type, type_call); + }); query_count.fetch_add(keys_size, std::memory_order_relaxed); found_count.fetch_add(keys_found, std::memory_order_relaxed); @@ -286,6 +383,16 @@ void RangeHashedDictionary::createAttributes() throw Exception(ErrorCodes::BAD_ARGUMENTS, "Hierarchical attributes not supported by {} dictionary.", getDictionaryID().getNameForLogs()); } + + callOnRangeType(dict_struct.range_min->type, [&](const auto & types) + { + using Types = std::decay_t; + using RangeColumnType = typename Types::LeftType; + using RangeStorageType = typename RangeColumnType::ValueType; + + key_attribute.container = KeyAttributeContainerType(); + key_attribute.invalid_intervals_container = InvalidIntervalsContainerType(); + }); } template @@ -294,9 +401,9 @@ void RangeHashedDictionary::loadData() if (!source_ptr->hasUpdateField()) { QueryPipeline pipeline(source_ptr->loadAll()); - PullingPipelineExecutor executor(pipeline); Block block; + while (executor.pull(block)) { blockToAttributes(block); @@ -307,9 +414,19 @@ void RangeHashedDictionary::loadData() updateData(); } - buildAttributeIntervalTrees(); + callOnRangeType(dict_struct.range_min->type, [&](const auto & types) + { + using Types = std::decay_t; + using RangeColumnType = typename Types::LeftType; + using RangeStorageType = typename RangeColumnType::ValueType; - if (require_nonempty && 0 == element_count) + auto & key_attribute_container = std::get>(key_attribute.container); + + for (auto & [_, intervals] : key_attribute_container) + intervals.build(); + }); + + if (configuration.require_nonempty && 0 == element_count) throw Exception(ErrorCodes::DICTIONARY_IS_EMPTY, "{}: dictionary source is empty and 'require_nonempty' property is set."); } @@ -317,8 +434,22 @@ void RangeHashedDictionary::loadData() template void RangeHashedDictionary::calculateBytesAllocated() { - 
bytes_allocated += attributes.size() * sizeof(attributes.front()); + callOnRangeType(dict_struct.range_min->type, [&](const auto & types) + { + using Types = std::decay_t; + using RangeColumnType = typename Types::LeftType; + using RangeStorageType = typename RangeColumnType::ValueType; + auto & key_attribute_container = std::get>(key_attribute.container); + + bucket_count = key_attribute_container.getBufferSizeInCells(); + bytes_allocated += key_attribute_container.getBufferSizeInBytes(); + + for (auto & [_, intervals] : key_attribute_container) + bytes_allocated += intervals.getSizeInBytes(); + }); + + bytes_allocated += attributes.size() * sizeof(attributes.front()); for (const auto & attribute : attributes) { auto type_call = [&](const auto & dictionary_attribute_type) @@ -327,17 +458,17 @@ void RangeHashedDictionary::calculateBytesAllocated() using AttributeType = typename Type::AttributeType; using ValueType = DictionaryValueType; - const auto & collection = std::get>(attribute.maps); - bytes_allocated += sizeof(CollectionType) + collection.getBufferSizeInBytes(); - bucket_count = collection.getBufferSizeInCells(); + const auto & container = std::get>(attribute.container); + + bytes_allocated += container.size() * sizeof(ValueType); + + if (attribute.is_value_nullable) + bytes_allocated += (*attribute.is_value_nullable).size() * sizeof(bool); }; callOnDictionaryAttributeType(attribute.type, type_call); } - if constexpr (dictionary_key_type == DictionaryKeyType::Complex) - bytes_allocated += complex_key_arena.size(); - if (update_field_loaded_block) bytes_allocated += update_field_loaded_block->allocatedBytes(); @@ -347,15 +478,20 @@ void RangeHashedDictionary::calculateBytesAllocated() template typename RangeHashedDictionary::Attribute RangeHashedDictionary::createAttribute(const DictionaryAttribute & dictionary_attribute) { - Attribute attribute{dictionary_attribute.underlying_type, dictionary_attribute.is_nullable, {}}; + std::optional> is_value_nullable; - auto type_call = [&](const auto &dictionary_attribute_type) + if (dictionary_attribute.is_nullable) + is_value_nullable.emplace(std::vector()); + + Attribute attribute{dictionary_attribute.underlying_type, {}, std::move(is_value_nullable)}; + + auto type_call = [&](const auto & dictionary_attribute_type) { using Type = std::decay_t; using AttributeType = typename Type::AttributeType; using ValueType = DictionaryValueType; - attribute.maps = CollectionType(); + attribute.container = AttributeContainerType(); }; callOnDictionaryAttributeType(dictionary_attribute.underlying_type, type_call); @@ -371,80 +507,158 @@ void RangeHashedDictionary::getItemsImpl( ValueSetter && set_value, DefaultValueExtractor & default_value_extractor) const { - const auto & collection = std::get>(attribute.maps); + const auto & attribute_container = std::get>(attribute.container); size_t keys_found = 0; - PaddedPODArray range_backup_storage; - const auto & dates = getColumnVectorData(this, key_columns.back(), range_backup_storage); - + auto range_column = key_columns.back(); auto key_columns_copy = key_columns; key_columns_copy.pop_back(); + DictionaryKeysArenaHolder arena_holder; DictionaryKeysExtractor keys_extractor(key_columns_copy, arena_holder.getComplexKeyArena()); const size_t keys_size = keys_extractor.getKeysSize(); - for (size_t key_index = 0; key_index < keys_size; ++key_index) + callOnRangeType(dict_struct.range_min->type, [&](const auto & types) { - auto key = keys_extractor.extractCurrentKey(); - const auto it = collection.find(key); + 
using Types = std::decay_t; + using RangeColumnType = typename Types::LeftType; + using RangeStorageType = typename RangeColumnType::ValueType; + using RangeInterval = Interval; - if (it) + const auto * range_column_typed = typeid_cast(range_column.get()); + if (!range_column_typed) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Dictionary {} range column type should be equal to {}", + getFullName(), + dict_struct.range_min->type->getName()); + + const auto & range_column_data = range_column_typed->getData(); + + const auto & key_attribute_container = std::get>(key_attribute.container); + + for (size_t key_index = 0; key_index < keys_size; ++key_index) { - const auto date = dates[key_index]; - const auto & interval_tree = it->getMapped(); + auto key = keys_extractor.extractCurrentKey(); + const auto it = key_attribute_container.find(key); - std::optional min_value; - std::optional min_range; - bool has_interval = false; - - interval_tree.find(date, [&](auto & interval, auto & value) + if (it) { - has_interval = true; + const auto date = range_column_data[key_index]; + const auto & interval_tree = it->getMapped(); - if (min_range && interval < *min_range) - min_range = interval; - else - min_range = interval; + size_t value_index = 0; + std::optional range; - min_value = value; - - return true; - }); - - if (has_interval) - { - ++keys_found; - - if constexpr (is_nullable) + interval_tree.find(date, [&](auto & interval, auto & interval_value_index) { - if (min_value.has_value()) - set_value(key_index, *min_value, false); + if (range) + { + if (likely(configuration.lookup_strategy == RangeHashedDictionaryLookupStrategy::min) && interval < *range) + { + range = interval; + value_index = interval_value_index; + } + else if (configuration.lookup_strategy == RangeHashedDictionaryLookupStrategy::max && interval > * range) + { + range = interval; + value_index = interval_value_index; + } + } else - set_value(key_index, default_value_extractor[key_index], true); - } - else + { + range = interval; + value_index = interval_value_index; + } + + return true; + }); + + if (range.has_value()) { - set_value(key_index, *min_value, false); + ++keys_found; + + AttributeType value = attribute_container[value_index]; + + if constexpr (is_nullable) + { + bool is_null = (*attribute.is_value_nullable)[value_index]; + + if (!is_null) + set_value(key_index, value, false); + else + set_value(key_index, default_value_extractor[key_index], true); + } + else + { + set_value(key_index, value, false); + } + + keys_extractor.rollbackCurrentKey(); + continue; } - - keys_extractor.rollbackCurrentKey(); - continue; } + + if constexpr (is_nullable) + set_value(key_index, default_value_extractor[key_index], default_value_extractor.isNullAt(key_index)); + else + set_value(key_index, default_value_extractor[key_index], false); + + keys_extractor.rollbackCurrentKey(); } - - if constexpr (is_nullable) - set_value(key_index, default_value_extractor[key_index], default_value_extractor.isNullAt(key_index)); - else - set_value(key_index, default_value_extractor[key_index], false); - - keys_extractor.rollbackCurrentKey(); - } + }); query_count.fetch_add(keys_size, std::memory_order_relaxed); found_count.fetch_add(keys_found, std::memory_order_relaxed); } +template +template +void RangeHashedDictionary::getItemsInternalImpl( + const Attribute & attribute, + const PaddedPODArray & key_to_index, + ValueSetter && set_value) const +{ + size_t keys_size = key_to_index.size(); + + const auto & container = std::get>(attribute.container); + 
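The getItemsImpl rewrite above selects, among all intervals that match the requested point, either the smallest or the largest interval according to the configured lookup strategy. The sketch below is a simplified, self-contained version of that selection rule: it scans a plain std::vector instead of ClickHouse's interval tree, and orders intervals by lexicographic (left, right) comparison, which may differ in detail from the real Interval comparison; all names are illustrative.

#include <cstdint>
#include <iostream>
#include <optional>
#include <tuple>
#include <vector>

enum class LookupStrategy { min, max };

struct Interval
{
    int64_t left;
    int64_t right;

    bool contains(int64_t point) const { return left <= point && point <= right; }
    bool operator<(const Interval & other) const
    {
        return std::tie(left, right) < std::tie(other.left, other.right);
    }
    bool operator>(const Interval & other) const { return other < *this; }
};

// Returns the index of the chosen matching interval, or nothing if no interval contains the point.
std::optional<size_t> lookup(const std::vector<Interval> & intervals, int64_t point, LookupStrategy strategy)
{
    std::optional<size_t> result;
    for (size_t i = 0; i < intervals.size(); ++i)
    {
        if (!intervals[i].contains(point))
            continue;
        if (!result
            || (strategy == LookupStrategy::min && intervals[i] < intervals[*result])
            || (strategy == LookupStrategy::max && intervals[i] > intervals[*result]))
            result = i;
    }
    return result;
}

int main()
{
    std::vector<Interval> intervals = {{1, 100}, {5, 50}, {10, 20}};
    std::cout << *lookup(intervals, 15, LookupStrategy::min) << '\n'; // 0 -> [1, 100], smallest by (left, right)
    std::cout << *lookup(intervals, 15, LookupStrategy::max) << '\n'; // 2 -> [10, 20], largest by (left, right)
}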
size_t container_size = container.size(); + + for (size_t key_index = 0; key_index < keys_size; ++key_index) + { + UInt64 container_index = key_to_index[key_index]; + + if (unlikely(container_index >= container_size)) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Dictionary {} expected attribute container index {} must be less than attribute container size {}", + getFullName(), + container_index, + container_size + ); + } + + AttributeType value = container[container_index]; + + if constexpr (is_nullable) + { + bool is_null = (*attribute.is_value_nullable)[container_index]; + + if (!is_null) + set_value(key_index, value, false); + else + set_value(key_index, value, true); + } + else + { + set_value(key_index, value, false); + } + } + + query_count.fetch_add(keys_size, std::memory_order_relaxed); + found_count.fetch_add(keys_size, std::memory_order_relaxed); +} + template void RangeHashedDictionary::updateData() { @@ -486,281 +700,378 @@ void RangeHashedDictionary::updateData() } template -void RangeHashedDictionary::blockToAttributes(const Block & block [[maybe_unused]]) +void RangeHashedDictionary::blockToAttributes(const Block & block) { - size_t skip_keys_size_offset = dict_struct.getKeysSize(); + size_t attributes_size = attributes.size(); + size_t dictionary_keys_size = dict_struct.getKeysSize(); + + static constexpr size_t ranges_size = 2; + + size_t block_columns = block.columns(); + size_t range_dictionary_attributes_size = attributes_size + dictionary_keys_size + ranges_size; + + if (range_dictionary_attributes_size != block.columns()) + { + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Block size mismatch. Actual {}. Expected {}", + block_columns, + range_dictionary_attributes_size); + } Columns key_columns; - key_columns.reserve(skip_keys_size_offset); + key_columns.reserve(dictionary_keys_size); /// Split into keys columns and attribute columns - for (size_t i = 0; i < skip_keys_size_offset; ++i) - key_columns.emplace_back(block.safeGetByPosition(i).column); + for (size_t i = 0; i < dictionary_keys_size; ++i) + key_columns.emplace_back(block.getByPosition(i).column); DictionaryKeysArenaHolder arena_holder; DictionaryKeysExtractor keys_extractor(key_columns, arena_holder.getComplexKeyArena()); const size_t keys_size = keys_extractor.getKeysSize(); - element_count += keys_size; + size_t block_attributes_skip_offset = dictionary_keys_size; - // Support old behaviour, where invalid date means 'open range'. 
- const bool is_date = isDate(dict_struct.range_min->type); + const auto * min_range_column = block.getByPosition(block_attributes_skip_offset).column.get(); + const auto * max_range_column = block.getByPosition(block_attributes_skip_offset + 1).column.get(); - const auto & min_range_column = unwrapNullableColumn(*block.safeGetByPosition(skip_keys_size_offset).column); - const auto & max_range_column = unwrapNullableColumn(*block.safeGetByPosition(skip_keys_size_offset + 1).column); + const NullMap * min_range_null_map = nullptr; + const NullMap * max_range_null_map = nullptr; - skip_keys_size_offset += 2; - - for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index) + if (const auto * min_range_column_nullable = checkAndGetColumn(min_range_column)) { - const auto & attribute_column = *block.safeGetByPosition(attribute_index + skip_keys_size_offset).column; - auto & attribute = attributes[attribute_index]; + min_range_column = &min_range_column_nullable->getNestedColumn(); + min_range_null_map = &min_range_column_nullable->getNullMapColumn().getData(); + } + + if (const auto * max_range_column_nullable = checkAndGetColumn(max_range_column)) + { + max_range_column = &max_range_column_nullable->getNestedColumn(); + max_range_null_map = &max_range_column_nullable->getNullMapColumn().getData(); + } + + callOnRangeType(dict_struct.range_min->type, [&](const auto & types) + { + using Types = std::decay_t; + using RangeColumnType = typename Types::LeftType; + using RangeStorageType = typename RangeColumnType::ValueType; + + const auto * min_range_column_typed = typeid_cast(min_range_column); + if (!min_range_column_typed) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Dictionary {} range min column type should be equal to {}", + getFullName(), + dict_struct.range_min->type->getName()); + + const auto * max_range_column_typed = typeid_cast(max_range_column); + if (!max_range_column_typed) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Dictionary {} range max column type should be equal to {}", + getFullName(), + dict_struct.range_max->type->getName()); + + const auto & min_range_column_data = min_range_column_typed->getData(); + const auto & max_range_column_data = max_range_column_typed->getData(); + + auto & key_attribute_container = std::get>(key_attribute.container); + auto & invalid_intervals_container = std::get>(key_attribute.invalid_intervals_container); + + block_attributes_skip_offset += 2; + + Field column_value; for (size_t key_index = 0; key_index < keys_size; ++key_index) { auto key = keys_extractor.extractCurrentKey(); - RangeStorageType lower_bound; - RangeStorageType upper_bound; + RangeStorageType lower_bound = min_range_column_data[key_index]; + RangeStorageType upper_bound = max_range_column_data[key_index]; - if (is_date) + bool invalid_range = false; + + if (unlikely(min_range_null_map && (*min_range_null_map)[key_index])) { - lower_bound = getColumnIntValueOrDefault(min_range_column, key_index, is_date, 0); - upper_bound = getColumnIntValueOrDefault(max_range_column, key_index, is_date, DATE_LUT_MAX_DAY_NUM + 1); + lower_bound = std::numeric_limits::min(); + invalid_range = true; } - else + + if (unlikely(max_range_null_map && (*max_range_null_map)[key_index])) { - lower_bound = getColumnIntValueOrDefault(min_range_column, key_index, is_date, RANGE_MIN_NULL_VALUE); - upper_bound = getColumnIntValueOrDefault(max_range_column, key_index, is_date, RANGE_MAX_NULL_VALUE); + upper_bound = std::numeric_limits::max(); + invalid_range = true; + } + + 
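The null-bound handling introduced here converts a NULL range_min or range_max into an open bound, and, as the lines that follow show, skips the row entirely when convert_null_range_bound_to_open is disabled. A hedged standalone sketch of that decision is given below; std::optional stands in for a nullable column value, and the helper name is invented for illustration.

#include <cstdint>
#include <iostream>
#include <limits>
#include <optional>

struct Range { int64_t lower; int64_t upper; };

// Returns the effective range for a row, or nothing if the row must be skipped.
std::optional<Range> makeRange(std::optional<int64_t> min_bound, std::optional<int64_t> max_bound,
                               bool convert_null_range_bound_to_open)
{
    bool has_null_bound = !min_bound || !max_bound;
    if (has_null_bound && !convert_null_range_bound_to_open)
        return std::nullopt; // row is ignored, mirroring the rollbackCurrentKey()/continue path

    Range range;
    range.lower = min_bound.value_or(std::numeric_limits<int64_t>::min()); // open lower bound
    range.upper = max_bound.value_or(std::numeric_limits<int64_t>::max()); // open upper bound
    return range;
}

int main()
{
    auto open = makeRange(std::nullopt, 10, /*convert_null_range_bound_to_open=*/true);
    std::cout << open->lower << ' ' << open->upper << '\n';                  // open lower bound .. 10
    std::cout << makeRange(std::nullopt, 10, false).has_value() << '\n';     // 0: row skipped
}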
if (unlikely(!configuration.convert_null_range_bound_to_open && invalid_range)) + { + keys_extractor.rollbackCurrentKey(); + continue; } if constexpr (std::is_same_v) key = copyStringInArena(string_arena, key); - if (likely(lower_bound <= upper_bound)) - setAttributeValue(attribute, key, RangeInterval{lower_bound, upper_bound}, attribute_column[key_index]); + for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index) + { + const auto & attribute_column = *block.getByPosition(attribute_index + block_attributes_skip_offset).column; + auto & attribute = attributes[attribute_index]; + attribute_column.get(key_index, column_value); + setAttributeValue(attribute, column_value); + } + + auto interval = Interval(lower_bound, upper_bound); + auto it = key_attribute_container.find(key); + + bool emplaced_in_interval_tree = false; + + if (it) + { + auto & intervals = it->getMapped(); + emplaced_in_interval_tree = intervals.emplace(interval, element_count); + } + else + { + IntervalMap intervals; + emplaced_in_interval_tree = intervals.emplace(interval, element_count); + key_attribute_container.insert({key, std::move(intervals)}); + } + + if (unlikely(!emplaced_in_interval_tree)) + { + InvalidIntervalWithKey invalid_interval{key, interval, element_count}; + invalid_intervals_container.emplace_back(invalid_interval); + } + + ++element_count; keys_extractor.rollbackCurrentKey(); } - - keys_extractor.reset(); - } + }); } template -void RangeHashedDictionary::buildAttributeIntervalTrees() +void RangeHashedDictionary::setAttributeValue(Attribute & attribute, const Field & value) { - for (auto & attribute : attributes) - { - auto type_call = [&](const auto & dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - using ValueType = DictionaryValueType; - - auto & collection = std::get>(attribute.maps); - for (auto & [_, ranges] : collection) - ranges.build(); - }; - - callOnDictionaryAttributeType(attribute.type, type_call); - } -} - -template -template -void RangeHashedDictionary::setAttributeValueImpl(Attribute & attribute, KeyType key, const RangeInterval & interval, const Field & value) -{ - using ValueType = std::conditional_t, StringRef, T>; - auto & collection = std::get>(attribute.maps); - - std::optional value_to_insert; - - if (attribute.is_nullable && value.isNull()) - { - value_to_insert = std::nullopt; - } - else - { - if constexpr (std::is_same_v) - { - const auto & string = value.get(); - StringRef string_ref = copyStringInArena(string_arena, string); - value_to_insert = { string_ref }; - } - else - { - value_to_insert = { value.get() }; - } - } - - const auto it = collection.find(key); - - if (it) - { - auto & values = it->getMapped(); - values.emplace(interval, std::move(value_to_insert)); - } - else - { - Values values; - values.emplace(interval, value_to_insert); - collection.insert({key, std::move(values)}); - } -} - -template -void RangeHashedDictionary::setAttributeValue(Attribute & attribute, KeyType key, const RangeInterval & interval, const Field & value) -{ - auto type_call = [&](const auto &dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - - setAttributeValueImpl(attribute, key, interval, value); - }; - - callOnDictionaryAttributeType(attribute.type, type_call); -} - -template -template -void RangeHashedDictionary::getKeysAndDates( - PaddedPODArray & keys, - PaddedPODArray & start_dates, - PaddedPODArray & end_dates) const -{ - 
const auto & attribute = attributes.front(); - auto type_call = [&](const auto & dictionary_attribute_type) { using Type = std::decay_t; using AttributeType = typename Type::AttributeType; using ValueType = DictionaryValueType; - getKeysAndDates(attribute, keys, start_dates, end_dates); + auto & container = std::get>(attribute.container); + container.emplace_back(); + + if (unlikely(attribute.is_value_nullable.has_value())) + { + bool value_is_null = value.isNull(); + attribute.is_value_nullable->emplace_back(value_is_null); + + if (unlikely(value_is_null)) + return; + } + + ValueType value_to_insert; + + if constexpr (std::is_same_v) + { + const auto & string = value.get(); + StringRef string_ref = copyStringInArena(string_arena, string); + value_to_insert = string_ref; + } + else + { + value_to_insert = value.get(); + } + + container.back() = value_to_insert; }; callOnDictionaryAttributeType(attribute.type, type_call); } -template -template -void RangeHashedDictionary::getKeysAndDates( - const Attribute & attribute, - PaddedPODArray & keys, - PaddedPODArray & start_dates, - PaddedPODArray & end_dates) const -{ - const auto & collection = std::get>(attribute.maps); - - keys.reserve(collection.size()); - start_dates.reserve(collection.size()); - end_dates.reserve(collection.size()); - - const bool is_date = isDate(dict_struct.range_min->type); - (void)(is_date); - - for (const auto & key : collection) - { - for (const auto & [interval, _] : key.getMapped()) - { - keys.push_back(key.getKey()); - start_dates.push_back(interval.left); - end_dates.push_back(interval.right); - - if constexpr (std::numeric_limits::max() > DATE_LUT_MAX_DAY_NUM) /// Avoid warning about tautological comparison in next line. - if (is_date && static_cast(end_dates.back()) > DATE_LUT_MAX_DAY_NUM) - end_dates.back() = 0; - } - } -} - -template -template -PaddedPODArray RangeHashedDictionary::makeDateKeys( - const PaddedPODArray & block_start_dates, - const PaddedPODArray & block_end_dates) const -{ - PaddedPODArray keys(block_start_dates.size()); - - for (size_t i = 0; i < keys.size(); ++i) - { - if (isCorrectDate(block_start_dates[i])) - keys[i] = block_start_dates[i]; // NOLINT - else - keys[i] = block_end_dates[i]; // NOLINT - } - - return keys; -} - template Pipe RangeHashedDictionary::read(const Names & column_names, size_t max_block_size, size_t num_streams) const { - auto type = dict_struct.range_min->type; + auto key_to_index_column = ColumnUInt64::create(); + auto range_min_column = dict_struct.range_min->type->createColumn(); + auto range_max_column = dict_struct.range_max->type->createColumn(); - ColumnsWithTypeAndName key_columns; - ColumnWithTypeAndName range_min_column; - ColumnWithTypeAndName range_max_column; + PaddedPODArray keys; - auto type_call = [&](const auto & types) mutable -> bool + callOnRangeType(dict_struct.range_min->type, [&](const auto & types) { using Types = std::decay_t; - using LeftDataType = typename Types::LeftType; + using RangeColumnType = typename Types::LeftType; + using RangeStorageType = typename RangeColumnType::ValueType; - if constexpr (IsDataTypeNumber || - std::is_same_v || - std::is_same_v || - std::is_same_v) + auto * range_min_column_typed = typeid_cast(range_min_column.get()); + if (!range_min_column_typed) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Dictionary {} range min column type should be equal to {}", + getFullName(), + dict_struct.range_min->type->getName()); + + auto * range_max_column_typed = typeid_cast(range_max_column.get()); + if 
(!range_max_column_typed) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Dictionary {} range max column type should be equal to {}", + getFullName(), + dict_struct.range_max->type->getName()); + + auto & key_to_index_column_data = key_to_index_column->getData(); + auto & range_min_column_data = range_min_column_typed->getData(); + auto & range_max_column_data = range_max_column_typed->getData(); + + const auto & container = std::get>(key_attribute.container); + const auto & invalid_intervals_container = std::get>(key_attribute.invalid_intervals_container); + + keys.reserve(element_count); + key_to_index_column_data.reserve(element_count); + range_min_column_data.reserve(element_count); + range_max_column_data.reserve(element_count); + + for (const auto & key : container) { - using RangeType = typename LeftDataType::FieldType; - - PaddedPODArray keys; - PaddedPODArray range_start; - PaddedPODArray range_end; - getKeysAndDates(keys, range_start, range_end); - - auto date_column = getColumnFromPODArray(makeDateKeys(range_start, range_end)); - - auto range_start_column = getColumnFromPODArray(std::move(range_start)); - range_min_column = ColumnWithTypeAndName{std::move(range_start_column), dict_struct.range_min->type, dict_struct.range_min->name}; - - auto range_end_column = getColumnFromPODArray(std::move(range_end)); - range_max_column = ColumnWithTypeAndName{std::move(range_end_column), dict_struct.range_max->type, dict_struct.range_max->name}; - - if constexpr (dictionary_key_type == DictionaryKeyType::Simple) + for (const auto & [interval, index] : key.getMapped()) { - auto keys_column = getColumnFromPODArray(std::move(keys)); - key_columns = {ColumnWithTypeAndName(std::move(keys_column), std::make_shared(), dict_struct.id->name)}; + keys.emplace_back(key.getKey()); + key_to_index_column_data.emplace_back(index); + range_min_column_data.push_back(interval.left); + range_max_column_data.push_back(interval.right); } - else - { - key_columns = deserializeColumnsWithTypeAndNameFromKeys(dict_struct, keys, 0, keys.size()); - } - - key_columns.emplace_back(ColumnWithTypeAndName{std::move(date_column), std::make_shared(), ""}); - - return true; } - else + + for (const auto & invalid_interval_with_key : invalid_intervals_container) { - return false; + keys.emplace_back(invalid_interval_with_key.key); + key_to_index_column_data.emplace_back(invalid_interval_with_key.attribute_value_index); + range_min_column_data.push_back(invalid_interval_with_key.interval.left); + range_max_column_data.push_back(invalid_interval_with_key.interval.right); } - }; + }); - if (!callOnIndexAndDataType(type->getTypeId(), type_call)) - throw Exception(ErrorCodes::LOGICAL_ERROR, "RangeHashedDictionary min max range type should be numeric"); + auto range_min_column_with_type = ColumnWithTypeAndName{std::move(range_min_column), dict_struct.range_min->type, dict_struct.range_min->name}; + auto range_max_column_with_type = ColumnWithTypeAndName{std::move(range_max_column), dict_struct.range_max->type, dict_struct.range_max->name}; - ColumnsWithTypeAndName data_columns = {std::move(range_min_column), std::move(range_max_column)}; + ColumnsWithTypeAndName key_columns; + if constexpr (dictionary_key_type == DictionaryKeyType::Simple) + { + auto keys_column = getColumnFromPODArray(std::move(keys)); + key_columns = {ColumnWithTypeAndName(std::move(keys_column), std::make_shared(), dict_struct.id->name)}; + } + else + { + key_columns = deserializeColumnsWithTypeAndNameFromKeys(dict_struct, keys, 0, keys.size()); + } + + 
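The rewritten storage above keeps attribute values in flat per-attribute containers while each key maps its intervals to value indexes, so read() can reconstruct (key, range_min, range_max, value index) rows directly. Below is a condensed sketch of that layout using standard containers in place of ClickHouse's HashMap and interval tree; the structure and names are illustrative only.

#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

struct Interval { int64_t left; int64_t right; };

struct ToyRangeDictionary
{
    // One attribute, stored as a flat column; intervals refer to it by index.
    std::vector<std::string> attribute_values;
    std::unordered_map<uint64_t, std::vector<std::pair<Interval, size_t>>> key_to_intervals;

    void insert(uint64_t key, Interval interval, std::string value)
    {
        size_t value_index = attribute_values.size();
        attribute_values.push_back(std::move(value));
        key_to_intervals[key].push_back({interval, value_index});
    }
};

int main()
{
    ToyRangeDictionary dict;
    dict.insert(1, {0, 10}, "first");
    dict.insert(1, {11, 20}, "second");

    // Analogue of read(): dump (key, range, value) rows via the stored indexes.
    for (const auto & [key, intervals] : dict.key_to_intervals)
        for (const auto & [interval, index] : intervals)
            std::cout << key << " [" << interval.left << ", " << interval.right << "] "
                      << dict.attribute_values[index] << '\n';
}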
key_columns.emplace_back(ColumnWithTypeAndName{std::move(key_to_index_column), std::make_shared(), ""}); + + ColumnsWithTypeAndName data_columns = {std::move(range_min_column_with_type), std::move(range_max_column_with_type)}; std::shared_ptr dictionary = shared_from_this(); - auto coordinator = DictionarySourceCoordinator::create(dictionary, column_names, std::move(key_columns), std::move(data_columns), max_block_size); + + DictionarySourceCoordinator::ReadColumnsFunc read_keys_func = [dictionary_copy = dictionary]( + const Strings & attribute_names, + const DataTypes & result_types, + const Columns & key_columns, + const DataTypes, + const Columns &) + { + auto range_dictionary_ptr = std::static_pointer_cast>(dictionary_copy); + + size_t attribute_names_size = attribute_names.size(); + + Columns result; + result.reserve(attribute_names_size); + + auto key_column = key_columns.back(); + + const auto * key_to_index_column = typeid_cast(key_column.get()); + if (!key_to_index_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Dictionary {} read expect indexes column with type UInt64", + range_dictionary_ptr->getFullName()); + + const auto & data = key_to_index_column->getData(); + + for (size_t i = 0; i < attribute_names_size; ++i) + { + const auto & attribute_name = attribute_names[i]; + const auto & result_type = result_types[i]; + + result.emplace_back(range_dictionary_ptr->getColumnInternal(attribute_name, result_type, data)); + } + + return result; + }; + + auto coordinator = DictionarySourceCoordinator::create( + dictionary, + column_names, + std::move(key_columns), + std::move(data_columns), + max_block_size, + std::move(read_keys_func)); auto result = coordinator->read(num_streams); return result; } +template +static DictionaryPtr createRangeHashedDictionary(const std::string & full_name, + const DictionaryStructure & dict_struct, + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + DictionarySourcePtr source_ptr) +{ + static constexpr auto layout_name = dictionary_key_type == DictionaryKeyType::Simple ? "range_hashed" : "complex_key_range_hashed"; + + if constexpr (dictionary_key_type == DictionaryKeyType::Simple) + { + if (dict_struct.key) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "'key' is not supported for dictionary of layout 'range_hashed'"); + } + else + { + if (dict_struct.id) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "'id' is not supported for dictionary of layout 'complex_key_range_hashed'"); + } + + if (!dict_struct.range_min || !dict_struct.range_max) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "{}: dictionary of layout '{}' requires .structure.range_min and .structure.range_max", + full_name, + layout_name); + + const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix); + const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"}; + const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false); + + String dictionary_layout_prefix = config_prefix + ".layout." 
+ layout_name; + const bool convert_null_range_bound_to_open = config.getBool(dictionary_layout_prefix + ".convert_null_range_bound_to_open", true); + String range_lookup_strategy = config.getString(dictionary_layout_prefix + ".range_lookup_strategy", "min"); + RangeHashedDictionaryLookupStrategy lookup_strategy = RangeHashedDictionaryLookupStrategy::min; + + if (range_lookup_strategy == "min") + lookup_strategy = RangeHashedDictionaryLookupStrategy::min; + else if (range_lookup_strategy == "max") + lookup_strategy = RangeHashedDictionaryLookupStrategy::max; + + RangeHashedDictionaryConfiguration configuration + { + .convert_null_range_bound_to_open = convert_null_range_bound_to_open, + .lookup_strategy = lookup_strategy, + .require_nonempty = require_nonempty + }; + + DictionaryPtr result = std::make_unique>( + dict_id, + dict_struct, + std::move(source_ptr), + dict_lifetime, + configuration); + + return result; +} void registerDictionaryRangeHashed(DictionaryFactory & factory) { @@ -772,19 +1083,9 @@ void registerDictionaryRangeHashed(DictionaryFactory & factory) ContextPtr /* global_context */, bool /*created_from_ddl*/) -> DictionaryPtr { - if (dict_struct.key) - throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "'key' is not supported for dictionary of layout 'range_hashed'"); - - if (!dict_struct.range_min || !dict_struct.range_max) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "{}: dictionary of layout 'range_hashed' requires .structure.range_min and .structure.range_max", - full_name); - - const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix); - const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"}; - const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false); - return std::make_unique>(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty); + return createRangeHashedDictionary(full_name, dict_struct, config, config_prefix, std::move(source_ptr)); }; + factory.registerLayout("range_hashed", create_layout_simple, false); auto create_layout_complex = [=](const std::string & full_name, @@ -795,19 +1096,9 @@ void registerDictionaryRangeHashed(DictionaryFactory & factory) ContextPtr /* context */, bool /*created_from_ddl*/) -> DictionaryPtr { - if (dict_struct.id) - throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "'id' is not supported for dictionary of layout 'complex_key_range_hashed'"); - - if (!dict_struct.range_min || !dict_struct.range_max) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "{}: dictionary of layout 'complex_key_range_hashed' requires .structure.range_min and .structure.range_max", - full_name); - - const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix); - const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"}; - const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false); - return std::make_unique>(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty); + return createRangeHashedDictionary(full_name, dict_struct, config, config_prefix, std::move(source_ptr)); }; + factory.registerLayout("complex_key_range_hashed", create_layout_complex, true); } diff --git a/src/Dictionaries/RangeHashedDictionary.h b/src/Dictionaries/RangeHashedDictionary.h index f31d6415dc8..78d62e9d7de 100644 --- a/src/Dictionaries/RangeHashedDictionary.h +++ b/src/Dictionaries/RangeHashedDictionary.h @@ -19,7 +19,18 @@ namespace DB { -using RangeStorageType = Int64; +enum class 
RangeHashedDictionaryLookupStrategy : uint8_t +{ + min, + max +}; + +struct RangeHashedDictionaryConfiguration +{ + bool convert_null_range_bound_to_open; + RangeHashedDictionaryLookupStrategy lookup_strategy; + bool require_nonempty; +}; template class RangeHashedDictionary final : public IDictionary @@ -31,11 +42,17 @@ public: const StorageID & dict_id_, const DictionaryStructure & dict_struct_, DictionarySourcePtr source_ptr_, - const DictionaryLifetime dict_lifetime_, - bool require_nonempty_, + DictionaryLifetime dict_lifetime_, + RangeHashedDictionaryConfiguration configuration_, BlockPtr update_field_loaded_block_ = nullptr); - std::string getTypeName() const override { return "RangeHashed"; } + std::string getTypeName() const override + { + if constexpr (dictionary_key_type == DictionaryKeyType::Simple) + return "RangeHashed"; + else + return "ComplexKeyRangeHashed"; + } size_t getBytesAllocated() const override { return bytes_allocated; } @@ -57,7 +74,15 @@ public: std::shared_ptr clone() const override { - return std::make_shared(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, update_field_loaded_block); + auto result = std::make_shared( + getDictionaryID(), + dict_struct, + source_ptr->clone(), + dict_lifetime, + configuration, + update_field_loaded_block); + + return result; } DictionarySourcePtr getSource() const override { return source_ptr; } @@ -76,7 +101,7 @@ public: DictionarySpecialKeyType getSpecialKeyType() const override { return DictionarySpecialKeyType::Range;} ColumnPtr getColumn( - const std::string& attribute_name, + const std::string & attribute_name, const DataTypePtr & result_type, const Columns & key_columns, const DataTypes & key_types, @@ -88,46 +113,90 @@ public: private: - using RangeInterval = Interval; + template + using IntervalMap = IntervalMap, size_t>; - template - using Values = IntervalMap>; + template + using KeyAttributeContainerType = std::conditional_t< + dictionary_key_type == DictionaryKeyType::Simple, + HashMap, DefaultHash>, + HashMapWithSavedHash, DefaultHash>>; template - using CollectionType = std::conditional_t< - dictionary_key_type == DictionaryKeyType::Simple, - HashMap>, - HashMapWithSavedHash, DefaultHash>>; + using AttributeContainerType = std::conditional_t, std::vector, PaddedPODArray>; struct Attribute final { - public: AttributeUnderlyingType type; - bool is_nullable; std::variant< - CollectionType, - CollectionType, - CollectionType, - CollectionType, - CollectionType, - CollectionType, - CollectionType, - CollectionType, - CollectionType, - CollectionType, - CollectionType, - CollectionType, - CollectionType, - CollectionType, - CollectionType, - CollectionType, - CollectionType, - CollectionType, - CollectionType, - CollectionType, - CollectionType> - maps; + AttributeContainerType, + AttributeContainerType, + AttributeContainerType, + AttributeContainerType, + AttributeContainerType, + AttributeContainerType, + AttributeContainerType, + AttributeContainerType, + AttributeContainerType, + AttributeContainerType, + AttributeContainerType, + AttributeContainerType, + AttributeContainerType, + AttributeContainerType, + AttributeContainerType, + AttributeContainerType, + AttributeContainerType, + AttributeContainerType, + AttributeContainerType, + AttributeContainerType, + AttributeContainerType, + AttributeContainerType> + container; + + std::optional> is_value_nullable; + }; + + template + struct InvalidIntervalWithKey + { + KeyType key; + Interval interval; + size_t 
attribute_value_index; + }; + + template + using InvalidIntervalsContainerType = PaddedPODArray>; + + template typename ContainerType> + using RangeStorageTypeContainer = std::variant< + ContainerType, + ContainerType, + ContainerType, + ContainerType, + ContainerType, + ContainerType, + ContainerType, + ContainerType, + ContainerType, + ContainerType, + ContainerType, + ContainerType, + ContainerType, + ContainerType, + ContainerType, + ContainerType, + ContainerType, + ContainerType, + ContainerType, + ContainerType>; + + struct KeyAttribute final + { + RangeStorageTypeContainer container; + + RangeStorageTypeContainer invalid_intervals_container; + }; void createAttributes(); @@ -145,43 +214,31 @@ private: ValueSetter && set_value, DefaultValueExtractor & default_value_extractor) const; + ColumnPtr getColumnInternal( + const std::string & attribute_name, + const DataTypePtr & result_type, + const PaddedPODArray & key_to_index) const; + + template + void getItemsInternalImpl( + const Attribute & attribute, + const PaddedPODArray & key_to_index, + ValueSetter && set_value) const; + void updateData(); void blockToAttributes(const Block & block); - void buildAttributeIntervalTrees(); - - template - void setAttributeValueImpl(Attribute & attribute, KeyType key, const RangeInterval & interval, const Field & value); - - void setAttributeValue(Attribute & attribute, KeyType key, const RangeInterval & interval, const Field & value); - - template - void getKeysAndDates( - PaddedPODArray & keys, - PaddedPODArray & start_dates, - PaddedPODArray & end_dates) const; - - template - void getKeysAndDates( - const Attribute & attribute, - PaddedPODArray & keys, - PaddedPODArray & start_dates, - PaddedPODArray & end_dates) const; - - template - PaddedPODArray makeDateKeys( - const PaddedPODArray & block_start_dates, - const PaddedPODArray & block_end_dates) const; + void setAttributeValue(Attribute & attribute, const Field & value); const DictionaryStructure dict_struct; const DictionarySourcePtr source_ptr; const DictionaryLifetime dict_lifetime; - const bool require_nonempty; + const RangeHashedDictionaryConfiguration configuration; BlockPtr update_field_loaded_block; std::vector attributes; - Arena complex_key_arena; + KeyAttribute key_attribute; size_t bytes_allocated = 0; size_t element_count = 0; diff --git a/src/Disks/DiskCacheWrapper.cpp b/src/Disks/DiskCacheWrapper.cpp index f741b8242f5..46ea46f85ef 100644 --- a/src/Disks/DiskCacheWrapper.cpp +++ b/src/Disks/DiskCacheWrapper.cpp @@ -65,8 +65,9 @@ std::shared_ptr DiskCacheWrapper::acquireDownloadMetadata( std::unique_lock lock{mutex}; auto it = file_downloads.find(path); - if (it != file_downloads.end() && !it->second.expired()) - return it->second.lock(); + if (it != file_downloads.end()) + if (auto x = it->second.lock()) + return x; std::shared_ptr metadata( new FileDownloadMetadata, diff --git a/src/Disks/DiskLocal.cpp b/src/Disks/DiskLocal.cpp index 3428a9aef54..caa8d44025d 100644 --- a/src/Disks/DiskLocal.cpp +++ b/src/Disks/DiskLocal.cpp @@ -29,6 +29,7 @@ namespace ErrorCodes extern const int CANNOT_TRUNCATE_FILE; extern const int CANNOT_UNLINK; extern const int CANNOT_RMDIR; + extern const int BAD_ARGUMENTS; } std::mutex DiskLocal::reservation_mutex; @@ -458,10 +459,16 @@ void registerDiskLocal(DiskFactory & factory) const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context, - const DisksMap & /*map*/) -> DiskPtr { + const DisksMap & map) -> DiskPtr { String path; UInt64 keep_free_space_bytes; 
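The DiskCacheWrapper change above replaces the expired()-then-lock() pattern with a single lock(): checking expired() first and locking afterwards is racy, because the referenced object can be destroyed between the two calls. A minimal single-threaded sketch of the safer pattern follows (generic std::weak_ptr code, not the ClickHouse types; the real method additionally holds a lock around the map access).

#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>

std::unordered_map<std::string, std::weak_ptr<int>> cache;

std::shared_ptr<int> acquire(const std::string & key)
{
    auto it = cache.find(key);
    if (it != cache.end())
        if (auto existing = it->second.lock()) // lock once; a non-null result means the object is still alive
            return existing;

    auto created = std::make_shared<int>(42);
    cache[key] = created;
    return created;
}

int main()
{
    auto a = acquire("x");
    auto b = acquire("x");
    std::cout << (a == b) << '\n'; // 1: the second call reuses the live object
}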
loadDiskLocalConfig(name, config, config_prefix, context, path, keep_free_space_bytes); + + for (const auto & [disk_name, disk_ptr] : map) + { + if (path == disk_ptr->getPath()) + throw Exception("Disk " + name + " and Disk " + disk_name + " cannot have the same path" + " (" + path + ")", ErrorCodes::BAD_ARGUMENTS); + } return std::make_shared(name, path, keep_free_space_bytes); }; factory.registerDiskType("local", creator); diff --git a/src/Disks/S3/ProxyResolverConfiguration.cpp b/src/Disks/S3/ProxyResolverConfiguration.cpp index 0fc7a9a1fa8..eeac54163b1 100644 --- a/src/Disks/S3/ProxyResolverConfiguration.cpp +++ b/src/Disks/S3/ProxyResolverConfiguration.cpp @@ -8,6 +8,7 @@ #include #include #include +#include namespace DB::ErrorCodes { @@ -44,13 +45,36 @@ Aws::Client::ClientConfigurationPerRequest ProxyResolverConfiguration::getConfig Poco::Timespan(1000000), /// Send timeout. Poco::Timespan(1000000) /// Receive timeout. ); - auto session = makeHTTPSession(endpoint, timeouts); try { /// It should be just empty GET request. Poco::Net::HTTPRequest request(Poco::Net::HTTPRequest::HTTP_GET, endpoint.getPath(), Poco::Net::HTTPRequest::HTTP_1_1); - session->sendRequest(request); + + const auto & host = endpoint.getHost(); + auto resolved_hosts = DNSResolver::instance().resolveHostAll(host); + + if (resolved_hosts.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Proxy resolver cannot resolve host {}", host); + + HTTPSessionPtr session; + + for (size_t i = 0; i < resolved_hosts.size(); ++i) + { + auto resolved_endpoint = endpoint; + resolved_endpoint.setHost(resolved_hosts[i].toString()); + session = makeHTTPSession(endpoint, timeouts, false); + + try + { + session->sendRequest(request); + } + catch (...) + { + if (i + 1 == resolved_hosts.size()) + throw; + } + } Poco::Net::HTTPResponse response; auto & response_body_stream = session->receiveResponse(response); diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index e00a473f584..f19d03dc8d0 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include @@ -394,6 +396,27 @@ void FormatFactory::registerNonTrivialPrefixAndSuffixChecker(const String & name target = std::move(non_trivial_prefix_and_suffix_checker); } +void FormatFactory::registerAppendSupportChecker(const String & name, AppendSupportChecker append_support_checker) +{ + auto & target = dict[name].append_support_checker; + if (target) + throw Exception("FormatFactory: Suffix checker " + name + " is already registered", ErrorCodes::LOGICAL_ERROR); + target = std::move(append_support_checker); +} + +void FormatFactory::markFormatHasNoAppendSupport(const String & name) +{ + registerAppendSupportChecker(name, [](const FormatSettings &){ return false; }); +} + +bool FormatFactory::checkIfFormatSupportAppend(const String & name, ContextPtr context, const std::optional & format_settings_) +{ + auto format_settings = format_settings_ ? 
*format_settings_ : getFormatSettings(context); + auto & append_support_checker = dict[name].append_support_checker; + /// By default we consider that format supports append + return !append_support_checker || append_support_checker(format_settings); +} + void FormatFactory::registerOutputFormat(const String & name, OutputCreator output_creator) { auto & target = dict[name].output_creator; @@ -410,6 +433,9 @@ void FormatFactory::registerFileExtension(const String & extension, const String String FormatFactory::getFormatFromFileName(String file_name, bool throw_if_not_found) { + if (file_name == "stdin") + return getFormatFromFileDescriptor(STDIN_FILENO); + CompressionMethod compression_method = chooseCompressionMethod(file_name, ""); if (CompressionMethod::None != compression_method) { @@ -438,6 +464,25 @@ String FormatFactory::getFormatFromFileName(String file_name, bool throw_if_not_ return it->second; } +String FormatFactory::getFormatFromFileDescriptor(int fd) +{ +#ifdef OS_LINUX + char buf[32] = {'\0'}; + snprintf(buf, sizeof(buf), "/proc/self/fd/%d", fd); + char file_path[PATH_MAX] = {'\0'}; + if (readlink(buf, file_path, sizeof(file_path) - 1) != -1) + return getFormatFromFileName(file_path, false); + return ""; +#elif defined(__APPLE__) + char file_path[PATH_MAX] = {'\0'}; + if (fcntl(fd, F_GETPATH, file_path) != -1) + return getFormatFromFileName(file_path, false); + return ""; +#else + return ""; +#endif +} + void FormatFactory::registerFileSegmentationEngine(const String & name, FileSegmentationEngine file_segmentation_engine) { auto & target = dict[name].file_segmentation_engine; diff --git a/src/Formats/FormatFactory.h b/src/Formats/FormatFactory.h index a5eaa43a29f..344dabd3f4d 100644 --- a/src/Formats/FormatFactory.h +++ b/src/Formats/FormatFactory.h @@ -93,6 +93,10 @@ private: /// The checker should return true if parallel parsing should be disabled. using NonTrivialPrefixAndSuffixChecker = std::function; + /// Some formats can support append depending on settings. + /// The checker should return true if format support append. + using AppendSupportChecker = std::function; + using SchemaReaderCreator = std::function; using ExternalSchemaReaderCreator = std::function; @@ -106,6 +110,7 @@ private: bool supports_parallel_formatting{false}; bool is_column_oriented{false}; NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker; + AppendSupportChecker append_support_checker; }; using FormatsDictionary = std::unordered_map; @@ -167,6 +172,14 @@ public: void registerNonTrivialPrefixAndSuffixChecker(const String & name, NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker); + void registerAppendSupportChecker(const String & name, AppendSupportChecker append_support_checker); + + /// If format always doesn't support append, you can use this method instead of + /// registerAppendSupportChecker with append_support_checker that always returns true. + void markFormatHasNoAppendSupport(const String & name); + + bool checkIfFormatSupportAppend(const String & name, ContextPtr context, const std::optional & format_settings_ = std::nullopt); + /// Register format by its name. 
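The FormatFactory changes above route the special file name "stdin" through getFormatFromFileDescriptor, which resolves the descriptor back to a path (readlink on /proc/self/fd/<fd> on Linux, F_GETPATH on macOS) and then reuses the extension-based lookup. The Linux-only sketch below shows just the descriptor-to-path step as a standalone helper; it is not the ClickHouse function.

#include <climits>
#include <cstdio>
#include <iostream>
#include <string>
#include <unistd.h>

// Returns the path a file descriptor points to, or an empty string if it cannot be resolved.
std::string pathOfFileDescriptor(int fd)
{
    char link[64];
    std::snprintf(link, sizeof(link), "/proc/self/fd/%d", fd);

    char file_path[PATH_MAX] = {'\0'};
    ssize_t bytes = ::readlink(link, file_path, sizeof(file_path) - 1); // readlink does not null-terminate
    if (bytes == -1)
        return {};
    return std::string(file_path, static_cast<size_t>(bytes));
}

int main()
{
    // If stdin is redirected from a file (./a.out < data.csv), this prints that file's path,
    // and an extension-based format lookup could then be applied to it.
    std::cout << pathOfFileDescriptor(STDIN_FILENO) << '\n';
}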
void registerInputFormat(const String & name, InputCreator input_creator); void registerOutputFormat(const String & name, OutputCreator output_creator); @@ -174,6 +187,7 @@ public: /// Register file extension for format void registerFileExtension(const String & extension, const String & format_name); String getFormatFromFileName(String file_name, bool throw_if_not_found = false); + String getFormatFromFileDescriptor(int fd); /// Register schema readers for format its name. void registerSchemaReader(const String & name, SchemaReaderCreator schema_reader_creator); diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp index 37067eae64f..559fac4cfaa 100644 --- a/src/Formats/ReadSchemaUtils.cpp +++ b/src/Formats/ReadSchemaUtils.cpp @@ -17,7 +17,12 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } -ColumnsDescription readSchemaFromFormat(const String & format_name, const std::optional & format_settings, ReadBufferCreator read_buffer_creator, ContextPtr context) +ColumnsDescription readSchemaFromFormat( + const String & format_name, + const std::optional & format_settings, + ReadBufferCreator read_buffer_creator, + ContextPtr context, + std::unique_ptr & buf_out) { NamesAndTypesList names_and_types; if (FormatFactory::instance().checkIfFormatHasExternalSchemaReader(format_name)) @@ -34,11 +39,11 @@ ColumnsDescription readSchemaFromFormat(const String & format_name, const std::o } else if (FormatFactory::instance().checkIfFormatHasSchemaReader(format_name)) { - auto read_buf = read_buffer_creator(); - if (read_buf->eof()) + buf_out = read_buffer_creator(); + if (buf_out->eof()) throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file, file is empty", format_name); - auto schema_reader = FormatFactory::instance().getSchemaReader(format_name, *read_buf, context, format_settings); + auto schema_reader = FormatFactory::instance().getSchemaReader(format_name, *buf_out, context, format_settings); try { names_and_types = schema_reader->readSchema(); @@ -54,6 +59,12 @@ ColumnsDescription readSchemaFromFormat(const String & format_name, const std::o return ColumnsDescription(names_and_types); } +ColumnsDescription readSchemaFromFormat(const String & format_name, const std::optional & format_settings, ReadBufferCreator read_buffer_creator, ContextPtr context) +{ + std::unique_ptr buf_out; + return readSchemaFromFormat(format_name, format_settings, read_buffer_creator, context, buf_out); +} + DataTypePtr generalizeDataType(DataTypePtr type) { WhichDataType which(type); diff --git a/src/Formats/ReadSchemaUtils.h b/src/Formats/ReadSchemaUtils.h index fb43acc3cd6..4446393a581 100644 --- a/src/Formats/ReadSchemaUtils.h +++ b/src/Formats/ReadSchemaUtils.h @@ -15,7 +15,19 @@ namespace DB /// If format doesn't have any schema reader or a schema reader /// couldn't determine the schema, an exception will be thrown. using ReadBufferCreator = std::function()>; -ColumnsDescription readSchemaFromFormat(const String & format_name, const std::optional & format_settings, ReadBufferCreator read_buffer_creator, ContextPtr context); +ColumnsDescription readSchemaFromFormat( + const String & format_name, + const std::optional & format_settings, + ReadBufferCreator read_buffer_creator, + ContextPtr context); + +/// If ReadBuffer is created, it will be written to buf_out. 
+ColumnsDescription readSchemaFromFormat( + const String & format_name, + const std::optional & format_settings, + ReadBufferCreator read_buffer_creator, + ContextPtr context, + std::unique_ptr & buf_out); /// Convert type to the most general type: /// - IntN, UIntN, FloatN, Decimal -> Float64 diff --git a/src/Functions/CMakeLists.txt b/src/Functions/CMakeLists.txt index a5746275b87..b7020ea128e 100644 --- a/src/Functions/CMakeLists.txt +++ b/src/Functions/CMakeLists.txt @@ -18,10 +18,10 @@ target_link_libraries(clickhouse_functions clickhouse_dictionaries clickhouse_dictionaries_embedded clickhouse_parsers - consistent-hashing + ch_contrib::consistent_hashing dbms - metrohash - murmurhash + ch_contrib::metrohash + ch_contrib::murmurhash PRIVATE ch_contrib::zlib @@ -76,6 +76,10 @@ endif() target_link_libraries(clickhouse_functions PRIVATE ch_contrib::lz4) +if (ENABLE_NLP) + target_link_libraries(clickhouse_functions PRIVATE ch_contrib::cld2) +endif() + if (TARGET ch_contrib::h3) target_link_libraries (clickhouse_functions PRIVATE ch_contrib::h3) endif() diff --git a/src/Functions/FunctionMathUnary.h b/src/Functions/FunctionMathUnary.h index d9ca162ba16..fa10c004e87 100644 --- a/src/Functions/FunctionMathUnary.h +++ b/src/Functions/FunctionMathUnary.h @@ -43,16 +43,19 @@ private: DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { - const auto & arg = arguments.front(); - if (!isNumber(arg)) - throw Exception{"Illegal type " + arg->getName() + " of argument of function " + getName(), - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; + const auto & argument = arguments.front(); + + if (!isNumber(argument)) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of argument of function {}", + argument->getName(), + getName()); /// Integers are converted to Float64. 
- if (Impl::always_returns_float64 || !isFloat(arg)) + if (Impl::always_returns_float64 || !isFloat(argument)) return std::make_shared(); else - return arg; + return argument; } template @@ -122,7 +125,7 @@ private: { const auto & src_data = col->getData(); const size_t size = src_data.size(); - UInt32 scale = src_data.getScale(); + UInt32 scale = col->getScale(); auto dst = ColumnVector::create(); auto & dst_data = dst->getData(); @@ -155,8 +158,10 @@ private: }; if (!callOnBasicType(col.type->getTypeId(), call)) - throw Exception{"Illegal column " + col.column->getName() + " of argument of function " + getName(), - ErrorCodes::ILLEGAL_COLUMN}; + throw Exception(ErrorCodes::ILLEGAL_COLUMN, + "Illegal column {} of argument of function {}", + col.column->getName(), + getName()); return res; } @@ -164,19 +169,17 @@ private: template -struct UnaryFunctionPlain +struct UnaryFunctionVectorized { static constexpr auto name = Name::name; static constexpr auto rows_per_iteration = 1; static constexpr bool always_returns_float64 = true; template - static void execute(const T * src, Float64 * dst) + static void execute(const T * __restrict src, Float64 * __restrict dst) { - dst[0] = static_cast(Function(static_cast(src[0]))); + *dst = Function(static_cast(*src)); } }; -#define UnaryFunctionVectorized UnaryFunctionPlain - } diff --git a/src/Functions/FunctionUnixTimestamp64.h b/src/Functions/FunctionUnixTimestamp64.h index 5248f524a2b..8c248d79c4b 100644 --- a/src/Functions/FunctionUnixTimestamp64.h +++ b/src/Functions/FunctionUnixTimestamp64.h @@ -18,6 +18,7 @@ namespace ErrorCodes extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int DECIMAL_OVERFLOW; + extern const int ILLEGAL_COLUMN; } /// Cast DateTime64 to Int64 representation narrowed down (or scaled up) to any scale value defined in Impl. 
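Concretely, moving a DateTime64 tick count to the scale defined in Impl is a power-of-ten rescaling. A minimal sketch of that arithmetic (ignoring the overflow handling the real class performs):

#include <cstdint>

int64_t rescaleTicks(int64_t ticks, uint32_t from_scale, uint32_t to_scale)
{
    auto pow10 = [](uint32_t n) { int64_t p = 1; while (n--) p *= 10; return p; };

    if (to_scale >= from_scale)
        return ticks * pow10(to_scale - from_scale); /// Scaling up appends zeros: 1500 at scale 3 becomes 1500000 at scale 6.
    return ticks / pow10(from_scale - to_scale);     /// Narrowing down truncates the extra fractional digits.
}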
@@ -108,8 +109,8 @@ public: if (arguments.size() < 1 || arguments.size() > 2) throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} takes one or two arguments", name); - if (!typeid_cast(arguments[0].type.get())) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "The first argument for function {} must be Int64", name); + if (!isInteger(arguments[0].type)) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "The first argument for function {} must be integer", name); std::string timezone; if (arguments.size() == 2) @@ -118,21 +119,48 @@ public: return std::make_shared(target_scale, timezone); } - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + template + bool executeType(auto & result_column, const ColumnsWithTypeAndName & arguments, size_t input_rows_count) const { const auto & src = arguments[0]; const auto & col = *src.column; - auto res_column = ColumnDecimal::create(input_rows_count, target_scale); - auto & result_data = res_column->getData(); + if (!checkAndGetColumn>(col)) + return 0; - const auto & source_data = typeid_cast(col).getData(); + auto & result_data = result_column->getData(); + + const auto & source_data = typeid_cast &>(col).getData(); for (size_t i = 0; i < input_rows_count; ++i) result_data[i] = source_data[i]; - return res_column; + return 1; } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + auto result_column = ColumnDecimal::create(input_rows_count, target_scale); + + if (!((executeType(result_column, arguments, input_rows_count)) + || (executeType(result_column, arguments, input_rows_count)) + || (executeType(result_column, arguments, input_rows_count)) + || (executeType(result_column, arguments, input_rows_count)) + || (executeType(result_column, arguments, input_rows_count)) + || (executeType(result_column, arguments, input_rows_count)) + || (executeType(result_column, arguments, input_rows_count)) + || (executeType(result_column, arguments, input_rows_count)) + || (executeType(result_column, arguments, input_rows_count)))) + { + throw Exception(ErrorCodes::ILLEGAL_COLUMN, + "Illegal column {} of first argument of function {}", + arguments[0].column->getName(), + getName()); + } + + return result_column; + } + }; } diff --git a/src/Functions/FunctionsBinaryRepr.cpp b/src/Functions/FunctionsBinaryRepr.cpp index 20b2acac88a..b8733cfc644 100644 --- a/src/Functions/FunctionsBinaryRepr.cpp +++ b/src/Functions/FunctionsBinaryRepr.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -42,7 +43,7 @@ struct HexImpl static constexpr size_t word_size = 2; template - static void executeOneUInt(T x, char *& out) + static void executeOneUInt(T x, char *& out, bool skip_leading_zero = true, bool auto_close = true) { bool was_nonzero = false; for (int offset = (sizeof(T) - 1) * 8; offset >= 0; offset -= 8) @@ -50,15 +51,18 @@ struct HexImpl UInt8 byte = x >> offset; /// Skip leading zeros - if (byte == 0 && !was_nonzero && offset) //-V560 + if (byte == 0 && !was_nonzero && offset && skip_leading_zero) //-V560 continue; was_nonzero = true; writeHexByteUppercase(byte, out); out += word_size; } - *out = '\0'; - ++out; + if (auto_close) + { + *out = '\0'; + ++out; + } } static void executeOneString(const UInt8 * pos, const UInt8 * end, char *& out) @@ -130,7 +134,7 @@ struct BinImpl static constexpr size_t word_size = 8; template - static void 
executeOneUInt(T x, char *& out) + static void executeOneUInt(T x, char *& out, bool skip_leading_zero = true, bool auto_close = true) { bool was_nonzero = false; for (int offset = (sizeof(T) - 1) * 8; offset >= 0; offset -= 8) @@ -138,15 +142,18 @@ struct BinImpl UInt8 byte = x >> offset; /// Skip leading zeros - if (byte == 0 && !was_nonzero && offset) //-V560 + if (byte == 0 && !was_nonzero && offset && skip_leading_zero) //-V560 continue; was_nonzero = true; writeBinByte(byte, out); out += word_size; } - *out = '\0'; - ++out; + if (auto_close) + { + *out = '\0'; + ++out; + } } template @@ -275,6 +282,7 @@ public: !which.isUInt() && !which.isFloat() && !which.isDecimal() && + !which.isUUID() && !which.isAggregateFunction()) throw Exception("Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); @@ -306,7 +314,8 @@ public: tryExecuteFloat(column, res_column) || tryExecuteDecimal(column, res_column) || tryExecuteDecimal(column, res_column) || - tryExecuteDecimal(column, res_column)) + tryExecuteDecimal(column, res_column) || + tryExecuteUUID(column, res_column)) return res_column; throw Exception("Illegal column " + arguments[0].column->getName() @@ -480,6 +489,54 @@ public: return false; } } + + bool tryExecuteUUID(const IColumn * col, ColumnPtr & col_res) const + { + const ColumnUUID * col_vec = checkAndGetColumn(col); + + static constexpr size_t MAX_LENGTH = sizeof(UUID) * word_size + 1; /// Including trailing zero byte. + + if (col_vec) + { + auto col_str = ColumnString::create(); + ColumnString::Chars & out_vec = col_str->getChars(); + ColumnString::Offsets & out_offsets = col_str->getOffsets(); + + const typename ColumnUUID::Container & in_vec = col_vec->getData(); + const UUID * uuid = in_vec.data(); + + size_t size = in_vec.size(); + out_offsets.resize(size); + out_vec.resize(size * (word_size + 1) + MAX_LENGTH); /// word_size + 1 is the length of one byte in hex/bin plus the zero byte. + + size_t pos = 0; + for (size_t i = 0; i < size; ++i) + { + /// Manual exponential growth, so as not to rely on the linear amortized work time of `resize` (no one guarantees it). + if (pos + MAX_LENGTH > out_vec.size()) + out_vec.resize(out_vec.size() * word_size + MAX_LENGTH); + + char * begin = reinterpret_cast(&out_vec[pos]); + char * end = begin; + + // Use executeOneUInt instead of executeOneString + // because the latter outputs the string in memory order + Impl::executeOneUInt(uuid[i].toUnderType().items[0], end, false, false); + Impl::executeOneUInt(uuid[i].toUnderType().items[1], end, false, true); + + pos += end - begin; + out_offsets[i] = pos; + } + out_vec.resize(pos); + + col_res = std::move(col_str); + return true; + } + else + { + return false; + } + } }; /// Decode number or string from string with binary or hexadecimal representation diff --git a/src/Functions/FunctionsCharsetClassification.cpp b/src/Functions/FunctionsCharsetClassification.cpp new file mode 100644 index 00000000000..d29dc14fa9f --- /dev/null +++ b/src/Functions/FunctionsCharsetClassification.cpp @@ -0,0 +1,142 @@ +#include +#include +#include + +#include +#include + +namespace DB +{ + +/* Determine language and charset of text data. For each text, we build the distribution of bigram bytes. + * Then we use marked-up dictionaries with distributions of bigram bytes of various languages and charsets. + * Using a naive Bayesian classifier, find the most likely charset and language and return it. + */
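A condensed, self-contained sketch of the scoring described above, with the dictionary structures simplified to std::map (the real implementation additionally prunes candidates whose partial score already falls below the current best):

#include <cmath>
#include <cstdint>
#include <limits>
#include <map>
#include <string>
#include <vector>

struct CharsetModel { std::string name; std::map<uint16_t, double> bigram_probability; };

std::string classifyCharset(const std::vector<CharsetModel> & models, const std::map<uint16_t, uint32_t> & text_bigrams)
{
    constexpr double zero_frequency = 1e-06; /// Fallback for bigrams absent from the dictionary.
    double best_score = -std::numeric_limits<double>::infinity();
    std::string best;

    for (const auto & model : models)
    {
        double score = 0;
        for (const auto & [bigram, count] : text_bigrams)
        {
            auto it = model.bigram_probability.find(bigram);
            score += count * std::log(it != model.bigram_probability.end() ? it->second : zero_frequency);
        }
        if (score > best_score)
        {
            best_score = score;
            best = model.name;
        }
    }
    return best;
}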
+ +template +struct CharsetClassificationImpl +{ + /* We need to solve the zero-frequency problem for the naive Bayes classifier: + * if a bigram from the text is not found in the dictionary, we assume that its probability is 1e-06, + * which is the minimal value in our marked-up dictionary. + */ + static constexpr Float64 zero_frequency = 1e-06; + + /// If the data size is bigger than this, behaviour is unspecified for this function. + static constexpr size_t max_string_size = 1u << 15; + + static ALWAYS_INLINE inline Float64 naiveBayes( + const FrequencyHolder::EncodingMap & standard, + const HashMap & model, + Float64 max_result) + { + Float64 res = 0; + for (const auto & el : model) + { + /// Try to find the bigram in the dictionary. + const auto * it = standard.find(el.getKey()); + if (it != standard.end()) + { + res += el.getMapped() * log(it->getMapped()); + } + else + { + res += el.getMapped() * log(zero_frequency); + } + /// If at some step the result has become less than the current maximum, then it makes no sense to count it fully. + if (res < max_result) + { + return res; + } + } + return res; + } + + /// Count how many times each bigram occurs in the text. + static ALWAYS_INLINE inline void calculateStats( + const UInt8 * data, + const size_t size, + HashMap & model) + { + UInt16 hash = 0; + for (size_t i = 0; i < size; ++i) + { + hash <<= 8; + hash += *(data + i); + ++model[hash]; + } + } + + static void vector( + const ColumnString::Chars & data, + const ColumnString::Offsets & offsets, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets) + { + const auto & encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency(); + + if (detect_language) + /// 2 chars for ISO code + 1 zero byte + res_data.reserve(offsets.size() * 3); + else + /// Mean charset length is 8 + res_data.reserve(offsets.size() * 8); + + res_offsets.resize(offsets.size()); + + size_t res_offset = 0; + + for (size_t i = 0; i < offsets.size(); ++i) + { + const UInt8 * str = data.data() + offsets[i - 1]; + const size_t str_len = offsets[i] - offsets[i - 1] - 1; + + std::string_view res; + + HashMap model; + calculateStats(str, str_len, model); + + /// Go through the dictionary and find the charset with the highest weight + Float64 max_result = log(zero_frequency) * (max_string_size); + for (const auto & item : encodings_freq) + { + Float64 score = naiveBayes(item.map, model, max_result); + if (max_result < score) + { + max_result = score; + res = detect_language ?
item.lang : item.name; + } + } + + res_data.resize(res_offset + res.size() + 1); + memcpy(&res_data[res_offset], res.data(), res.size()); + + res_data[res_offset + res.size()] = 0; + res_offset += res.size() + 1; + + res_offsets[i] = res_offset; + } + } +}; + + +struct NameDetectCharset +{ + static constexpr auto name = "detectCharset"; +}; + +struct NameDetectLanguageUnknown +{ + static constexpr auto name = "detectLanguageUnknown"; +}; + + +using FunctionDetectCharset = FunctionTextClassificationString, NameDetectCharset>; +using FunctionDetectLanguageUnknown = FunctionTextClassificationString, NameDetectLanguageUnknown>; + +void registerFunctionDetectCharset(FunctionFactory & factory) +{ + factory.registerFunction(); + factory.registerFunction(); +} + +} diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index a8dda5e5eba..51f590d273a 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -156,9 +156,11 @@ struct ConvertImpl if (const ColVecFrom * col_from = checkAndGetColumn(named_from.column.get())) { typename ColVecTo::MutablePtr col_to = nullptr; + if constexpr (IsDataTypeDecimal) { UInt32 scale; + if constexpr (std::is_same_v || std::is_same_v) { @@ -212,11 +214,11 @@ struct ConvertImpl bool convert_result = false; if constexpr (IsDataTypeDecimal && IsDataTypeDecimal) - convert_result = tryConvertDecimals(vec_from[i], vec_from.getScale(), vec_to.getScale(), result); + convert_result = tryConvertDecimals(vec_from[i], col_from->getScale(), col_to->getScale(), result); else if constexpr (IsDataTypeDecimal && IsDataTypeNumber) - convert_result = tryConvertFromDecimal(vec_from[i], vec_from.getScale(), result); + convert_result = tryConvertFromDecimal(vec_from[i], col_from->getScale(), result); else if constexpr (IsDataTypeNumber && IsDataTypeDecimal) - convert_result = tryConvertToDecimal(vec_from[i], vec_to.getScale(), result); + convert_result = tryConvertToDecimal(vec_from[i], col_to->getScale(), result); if (convert_result) vec_to[i] = result; @@ -229,11 +231,11 @@ struct ConvertImpl else { if constexpr (IsDataTypeDecimal && IsDataTypeDecimal) - vec_to[i] = convertDecimals(vec_from[i], vec_from.getScale(), vec_to.getScale()); + vec_to[i] = convertDecimals(vec_from[i], col_from->getScale(), col_to->getScale()); else if constexpr (IsDataTypeDecimal && IsDataTypeNumber) - vec_to[i] = convertFromDecimal(vec_from[i], vec_from.getScale()); + vec_to[i] = convertFromDecimal(vec_from[i], col_from->getScale()); else if constexpr (IsDataTypeNumber && IsDataTypeDecimal) - vec_to[i] = convertToDecimal(vec_from[i], vec_to.getScale()); + vec_to[i] = convertToDecimal(vec_from[i], col_to->getScale()); else throw Exception("Unsupported data type in conversion function", ErrorCodes::CANNOT_CONVERT_TYPE); } @@ -824,7 +826,7 @@ struct ConvertImpl) data_to.resize(size * (strlen("YYYY-MM-DD hh:mm:ss") + 1)); else if constexpr (std::is_same_v) - data_to.resize(size * (strlen("YYYY-MM-DD hh:mm:ss.") + vec_from.getScale() + 1)); + data_to.resize(size * (strlen("YYYY-MM-DD hh:mm:ss.") + col_from->getScale() + 1)); else data_to.resize(size * 3); /// Arbitrary @@ -1173,7 +1175,7 @@ struct ConvertThroughParsing if constexpr (to_datetime64) { DateTime64 res = 0; - parseDateTime64BestEffort(res, vec_to.getScale(), read_buffer, *local_time_zone, *utc_time_zone); + parseDateTime64BestEffort(res, col_to->getScale(), read_buffer, *local_time_zone, *utc_time_zone); vec_to[i] = res; } else @@ -1188,7 +1190,7 @@ struct ConvertThroughParsing if 
constexpr (to_datetime64) { DateTime64 res = 0; - parseDateTime64BestEffortUS(res, vec_to.getScale(), read_buffer, *local_time_zone, *utc_time_zone); + parseDateTime64BestEffortUS(res, col_to->getScale(), read_buffer, *local_time_zone, *utc_time_zone); vec_to[i] = res; } else @@ -1203,12 +1205,12 @@ struct ConvertThroughParsing if constexpr (to_datetime64) { DateTime64 value = 0; - readDateTime64Text(value, vec_to.getScale(), read_buffer, *local_time_zone); + readDateTime64Text(value, col_to->getScale(), read_buffer, *local_time_zone); vec_to[i] = value; } else if constexpr (IsDataTypeDecimal) SerializationDecimal::readText( - vec_to[i], read_buffer, ToDataType::maxPrecision(), vec_to.getScale()); + vec_to[i], read_buffer, ToDataType::maxPrecision(), col_to->getScale()); else { parseImpl(vec_to[i], read_buffer, local_time_zone); @@ -1227,7 +1229,7 @@ struct ConvertThroughParsing if constexpr (to_datetime64) { DateTime64 res = 0; - parsed = tryParseDateTime64BestEffort(res, vec_to.getScale(), read_buffer, *local_time_zone, *utc_time_zone); + parsed = tryParseDateTime64BestEffort(res, col_to->getScale(), read_buffer, *local_time_zone, *utc_time_zone); vec_to[i] = res; } else @@ -1248,12 +1250,12 @@ struct ConvertThroughParsing if constexpr (to_datetime64) { DateTime64 value = 0; - parsed = tryReadDateTime64Text(value, vec_to.getScale(), read_buffer, *local_time_zone); + parsed = tryReadDateTime64Text(value, col_to->getScale(), read_buffer, *local_time_zone); vec_to[i] = value; } else if constexpr (IsDataTypeDecimal) parsed = SerializationDecimal::tryReadText( - vec_to[i], read_buffer, ToDataType::maxPrecision(), vec_to.getScale()); + vec_to[i], read_buffer, ToDataType::maxPrecision(), col_to->getScale()); else parsed = tryParseImpl(vec_to[i], read_buffer, local_time_zone); } @@ -1776,6 +1778,12 @@ private: } } + if constexpr (std::is_same_v) + { + if (from_type->getCustomSerialization()) + return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count); + } + bool done; if constexpr (to_string_or_fixed_string) { @@ -2809,10 +2817,16 @@ private: } const auto * from_type = checkAndGetDataType(from_type_untyped.get()); + const auto * from_type_map = checkAndGetDataType(from_type_untyped.get()); + + /// Convert from Map + if (from_type_map) + from_type = checkAndGetDataType(from_type_map->getNestedType().get()); + if (!from_type) { throw Exception(ErrorCodes::TYPE_MISMATCH, - "CAST AS Array can only be performed between same-dimensional Array or String types"); + "CAST AS Array can only be performed between same-dimensional Array, Map or String types"); } DataTypePtr from_nested_type = from_type->getNestedType(); @@ -2832,9 +2846,16 @@ private: return [nested_function, from_nested_type, to_nested_type]( ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t /*input_rows_count*/) -> ColumnPtr { - const auto & array_arg = arguments.front(); + const auto & argument_column = arguments.front(); - if (const ColumnArray * col_array = checkAndGetColumn(array_arg.column.get())) + const ColumnArray * col_array = nullptr; + + if (const ColumnMap * col_map = checkAndGetColumn(argument_column.column.get())) + col_array = &col_map->getNestedColumn(); + else + col_array = checkAndGetColumn(argument_column.column.get()); + + if (col_array) { /// create columns for converting nested column containing original and result columns ColumnsWithTypeAndName nested_columns{{ col_array->getDataPtr(), from_nested_type, "" }}; @@ -2846,7 +2867,11 @@ 
private: return ColumnArray::create(result_column, col_array->getOffsetsPtr()); } else - throw Exception{"Illegal column " + array_arg.column->getName() + " for function CAST AS Array", ErrorCodes::LOGICAL_ERROR}; + { + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Illegal column {} for function CAST AS Array", + argument_column.column->getName()); + } }; } @@ -3504,7 +3529,7 @@ private: return false; }; - auto make_custom_serialization_wrapper = [&](const auto & types) -> bool + auto make_custom_serialization_wrapper = [&](const auto & types) -> bool { using Types = std::decay_t; using ToDataType = typename Types::RightType; diff --git a/src/Functions/FunctionsLanguageClassification.cpp b/src/Functions/FunctionsLanguageClassification.cpp new file mode 100644 index 00000000000..521a4b0301e --- /dev/null +++ b/src/Functions/FunctionsLanguageClassification.cpp @@ -0,0 +1,231 @@ +#include "config_functions.h" + +#if USE_NLP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace DB +{ +/* Determine language of Unicode UTF-8 text. + * Uses the cld2 library https://github.com/CLD2Owners/cld2 + */ + +namespace ErrorCodes +{ +extern const int ILLEGAL_TYPE_OF_ARGUMENT; +extern const int ILLEGAL_COLUMN; +extern const int SUPPORT_IS_DISABLED; +} + +struct FunctionDetectLanguageImpl +{ + static ALWAYS_INLINE inline std::string_view codeISO(std::string_view code_string) + { + if (code_string.ends_with("-Latn")) + code_string.remove_suffix(code_string.size() - 5); + + if (code_string.ends_with("-Hant")) + code_string.remove_suffix(code_string.size() - 5); + + // Old deprecated codes + if (code_string == "iw") + return "he"; + + if (code_string == "jw") + return "jv"; + + if (code_string == "in") + return "id"; + + if (code_string == "mo") + return "ro"; + + // Some languages do not have 2 letter codes, for example code for Cebuano is ceb + if (code_string.size() != 2) + return "other"; + + return code_string; + } + + static void vector( + const ColumnString::Chars & data, + const ColumnString::Offsets & offsets, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets) + { + /// Constant 3 is based on the fact that in general we need 2 characters for ISO code + 1 zero byte + res_data.reserve(offsets.size() * 3); + res_offsets.resize(offsets.size()); + + bool is_reliable; + size_t res_offset = 0; + + for (size_t i = 0; i < offsets.size(); ++i) + { + const UInt8 * str = data.data() + offsets[i - 1]; + const size_t str_len = offsets[i] - offsets[i - 1] - 1; + + std::string_view res; + + if (UTF8::isValidUTF8(str, str_len)) + { + auto lang = CLD2::DetectLanguage(reinterpret_cast(str), str_len, true, &is_reliable); + res = codeISO(LanguageCode(lang)); + } + else + { + res = "un"; + } + + res_data.resize(res_offset + res.size() + 1); + memcpy(&res_data[res_offset], res.data(), res.size()); + + res_data[res_offset + res.size()] = 0; + res_offset += res.size() + 1; + + res_offsets[i] = res_offset; + } + } +}; + +class FunctionDetectLanguageMixed : public IFunction +{ +public: + static constexpr auto name = "detectLanguageMixed"; + + /// Number of top results + static constexpr auto top_N = 3; + + static FunctionPtr create(ContextPtr context) + { + if (!context->getSettingsRef().allow_experimental_nlp_functions) + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, + "Natural language processing function '{}' is experimental. 
Set `allow_experimental_nlp_functions` setting to enable it", name); + + return std::make_shared(); + } + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 1; } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + bool useDefaultImplementationForConstants() const override { return true; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (!isString(arguments[0])) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of argument of function {}. Must be String.", + arguments[0]->getName(), getName()); + + return std::make_shared(std::make_shared(), std::make_shared()); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override + { + const auto & column = arguments[0].column; + const ColumnString * col = checkAndGetColumn(column.get()); + + if (!col) + throw Exception( + "Illegal columns " + arguments[0].column->getName() + " of arguments of function " + getName(), + ErrorCodes::ILLEGAL_COLUMN); + + const auto & input_data = col->getChars(); + const auto & input_offsets = col->getOffsets(); + + /// Create and fill the result map. + + const auto & result_type_map = static_cast(*result_type); + const DataTypePtr & key_type = result_type_map.getKeyType(); + const DataTypePtr & value_type = result_type_map.getValueType(); + + MutableColumnPtr keys_data = key_type->createColumn(); + MutableColumnPtr values_data = value_type->createColumn(); + MutableColumnPtr offsets = DataTypeNumber().createColumn(); + + size_t total_elements = input_rows_count * top_N; + keys_data->reserve(total_elements); + values_data->reserve(total_elements); + offsets->reserve(input_rows_count); + + bool is_reliable; + CLD2::Language result_lang_top3[top_N]; + int32_t pc[top_N]; + int bytes[top_N]; + + IColumn::Offset current_offset = 0; + for (size_t i = 0; i < input_rows_count; ++i) + { + const UInt8 * str = input_data.data() + input_offsets[i - 1]; + const size_t str_len = input_offsets[i] - input_offsets[i - 1] - 1; + + if (UTF8::isValidUTF8(str, str_len)) + { + CLD2::DetectLanguageSummary(reinterpret_cast(str), str_len, true, result_lang_top3, pc, bytes, &is_reliable); + + for (size_t j = 0; j < top_N; ++j) + { + if (pc[j] == 0) + break; + + auto res_str = FunctionDetectLanguageImpl::codeISO(LanguageCode(result_lang_top3[j])); + Float32 res_float = static_cast(pc[j]) / 100; + + keys_data->insertData(res_str.data(), res_str.size()); + values_data->insertData(reinterpret_cast(&res_float), sizeof(res_float)); + ++current_offset; + } + } + else + { + std::string_view res_str = "un"; + Float32 res_float = 0; + + keys_data->insertData(res_str.data(), res_str.size()); + values_data->insertData(reinterpret_cast(&res_float), sizeof(res_float)); + ++current_offset; + } + offsets->insert(current_offset); + } + + auto nested_column = ColumnArray::create( + ColumnTuple::create(Columns{std::move(keys_data), std::move(values_data)}), + std::move(offsets)); + + return ColumnMap::create(nested_column); + } +}; + +struct NameDetectLanguage +{ + static constexpr auto name = "detectLanguage"; +}; + + +using FunctionDetectLanguage = FunctionTextClassificationString; + +void registerFunctionsDetectLanguage(FunctionFactory & factory) +{ + factory.registerFunction(); + factory.registerFunction(); +} + +} +#endif diff --git 
a/src/Functions/FunctionsProgrammingClassification.cpp b/src/Functions/FunctionsProgrammingClassification.cpp new file mode 100644 index 00000000000..2c95e22f239 --- /dev/null +++ b/src/Functions/FunctionsProgrammingClassification.cpp @@ -0,0 +1,120 @@ +#include +#include +#include +#include + +#include +#include + +namespace DB +{ + +/** + * Determine the programming language from the source code. + * We calculate all the unigrams and bigrams of commands in the source code. + * Then, using a marked-up dictionary with weights of unigrams and bigrams of commands for various programming languages, + * find the language with the biggest weight and return it. + */ +struct FunctionDetectProgrammingLanguageImpl +{ + /// Calculate total weight + static ALWAYS_INLINE inline Float64 stateMachine( + const FrequencyHolder::Map & standard, + const std::unordered_map & model) + { + Float64 res = 0; + for (const auto & el : model) + { + /// Try to find each n-gram in the dictionary + const auto * it = standard.find(el.first); + if (it != standard.end()) + res += el.second * it->getMapped(); + } + return res; + } + + static void vector( + const ColumnString::Chars & data, + const ColumnString::Offsets & offsets, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets) + { + const auto & programming_freq = FrequencyHolder::getInstance().getProgrammingFrequency(); + + /// Constant 5 is arbitrary + res_data.reserve(offsets.size() * 5); + res_offsets.resize(offsets.size()); + + size_t res_offset = 0; + + for (size_t i = 0; i < offsets.size(); ++i) + { + const UInt8 * str = data.data() + offsets[i - 1]; + const size_t str_len = offsets[i] - offsets[i - 1] - 1; + + std::unordered_map data_freq; + StringRef prev_command; + StringRef command; + + /// Select all commands from the string + for (size_t ind = 0; ind < str_len; ++ind) + { + /// Assume that all commands are split by spaces + if (isWhitespaceASCII(str[ind])) + continue; + + size_t prev_ind = ind; + while (ind < str_len && !isWhitespaceASCII(str[ind])) + ++ind; + + command = {str + prev_ind, ind - prev_ind}; + + /// We add both unigrams and bigrams to later search for them in the dictionary + if (prev_command.data) + data_freq[prev_command.toString() + command.toString()] += 1; + + data_freq[command.toString()] += 1; + prev_command = command; + } + + std::string_view res; + Float64 max_result = 0; + /// Iterate over all programming languages and find the language with the highest weight + for (const auto & item : programming_freq) + { + Float64 result = stateMachine(item.map, data_freq); + if (result > max_result) + { + max_result = result; + res = item.name; + } + } + /// If all weights are zero, then we assume that the language is undefined + if (res.empty()) + res = "Undefined"; + + res_data.resize(res_offset + res.size() + 1); + memcpy(&res_data[res_offset], res.data(), res.size()); + + res_data[res_offset + res.size()] = 0; + res_offset += res.size() + 1; + + res_offsets[i] = res_offset; + } + } +}; + +struct NameDetectProgrammingLanguage +{ + static constexpr auto name = "detectProgrammingLanguage"; +}; + + +using FunctionDetectProgrammingLanguage = FunctionTextClassificationString; + +void registerFunctionDetectProgrammingLanguage(FunctionFactory & factory) +{ + factory.registerFunction(); +} + +} diff --git a/src/Functions/FunctionsRound.h b/src/Functions/FunctionsRound.h index 6a4691482ea..1d885b8c42c 100644 --- a/src/Functions/FunctionsRound.h +++ b/src/Functions/FunctionsRound.h @@ -422,9 +422,9 @@ private: using
Container = typename ColumnDecimal::Container; public: - static NO_INLINE void apply(const Container & in, Container & out, Scale scale_arg) + static NO_INLINE void apply(const Container & in, UInt32 in_scale, Container & out, Scale scale_arg) { - scale_arg = in.getScale() - scale_arg; + scale_arg = in_scale - scale_arg; if (scale_arg > 0) { size_t scale = intExp10(scale_arg); @@ -498,11 +498,11 @@ public: const auto * const col = checkAndGetColumn>(col_general); const typename ColumnDecimal::Container & vec_src = col->getData(); - auto col_res = ColumnDecimal::create(vec_src.size(), vec_src.getScale()); + auto col_res = ColumnDecimal::create(vec_src.size(), col->getScale()); auto & vec_res = col_res->getData(); if (!vec_res.empty()) - DecimalRoundingImpl::apply(col->getData(), vec_res, scale_arg); + DecimalRoundingImpl::apply(col->getData(), col->getScale(), vec_res, scale_arg); return col_res; } diff --git a/src/Functions/FunctionsTextClassification.h b/src/Functions/FunctionsTextClassification.h new file mode 100644 index 00000000000..5fc26678cd3 --- /dev/null +++ b/src/Functions/FunctionsTextClassification.h @@ -0,0 +1,122 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ +/// Functions for text classification with different result types + +namespace ErrorCodes +{ +extern const int ILLEGAL_TYPE_OF_ARGUMENT; +extern const int ILLEGAL_COLUMN; +extern const int SUPPORT_IS_DISABLED; +} + +template +class FunctionTextClassificationString : public IFunction +{ +public: + static constexpr auto name = Name::name; + + static FunctionPtr create(ContextPtr context) + { + if (!context->getSettingsRef().allow_experimental_nlp_functions) + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, + "Natural language processing function '{}' is experimental. Set `allow_experimental_nlp_functions` setting to enable it", name); + + return std::make_shared(); + } + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 1; } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + bool useDefaultImplementationForConstants() const override { return true; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (!isString(arguments[0])) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of argument of function {}. Must be String.", + arguments[0]->getName(), getName()); + + return arguments[0]; + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const override + { + const ColumnPtr & column = arguments[0].column; + const ColumnString * col = checkAndGetColumn(column.get()); + + if (!col) + throw Exception( + "Illegal column " + arguments[0].column->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN); + + auto col_res = ColumnString::create(); + Impl::vector(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets()); + return col_res; + } +}; + +template +class FunctionTextClassificationFloat : public IFunction +{ +public: + static constexpr auto name = Name::name; + + static FunctionPtr create(ContextPtr context) + { + if (!context->getSettingsRef().allow_experimental_nlp_functions) + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, + "Natural language processing function '{}' is experimental. 
Set `allow_experimental_nlp_functions` setting to enable it", name); + + return std::make_shared(); + } + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 1; } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + bool useDefaultImplementationForConstants() const override { return true; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (!isString(arguments[0])) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of argument of function {}. Must be String.", + arguments[0]->getName(), getName()); + + return std::make_shared(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const override + { + const ColumnPtr & column = arguments[0].column; + const ColumnString * col = checkAndGetColumn(column.get()); + + if (!col) + throw Exception( + "Illegal column " + arguments[0].column->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN); + + auto col_res = ColumnVector::create(); + ColumnVector::Container & vec_res = col_res->getData(); + vec_res.resize(col->size()); + + Impl::vector(col->getChars(), col->getOffsets(), vec_res); + return col_res; + } +}; + +} diff --git a/src/Functions/FunctionsTonalityClassification.cpp b/src/Functions/FunctionsTonalityClassification.cpp new file mode 100644 index 00000000000..5dbd6d0356d --- /dev/null +++ b/src/Functions/FunctionsTonalityClassification.cpp @@ -0,0 +1,89 @@ +#include +#include +#include +#include + +#include + +namespace DB +{ + +/** + * Determines the sentiment of text data. + * Uses a marked-up sentiment dictionary, each word has a tonality ranging from -12 to 6. + * For each text, calculate the average sentiment value of its words and return it in range [-1,1] + */ +struct FunctionDetectTonalityImpl +{ + static ALWAYS_INLINE inline Float32 detectTonality( + const UInt8 * str, + const size_t str_len, + const FrequencyHolder::Map & emotional_dict) + { + Float64 weight = 0; + UInt64 count_words = 0; + + String word; + /// Select all Russian words from the string + for (size_t ind = 0; ind < str_len; ++ind) + { + /// Split words by whitespaces and punctuation signs + if (isWhitespaceASCII(str[ind]) || isPunctuationASCII(str[ind])) + continue; + + while (ind < str_len && !(isWhitespaceASCII(str[ind]) || isPunctuationASCII(str[ind]))) + { + word.push_back(str[ind]); + ++ind; + } + /// Try to find a russian word in the tonality dictionary + const auto * it = emotional_dict.find(word); + if (it != emotional_dict.end()) + { + count_words += 1; + weight += it->getMapped(); + } + word.clear(); + } + + if (!count_words) + return 0; + + /// Calculate average value of tonality. 
+ /// Convert values -12..6 to -1..1 + if (weight > 0) + return weight / count_words / 6; + else + return weight / count_words / 12; + } + + static void vector( + const ColumnString::Chars & data, + const ColumnString::Offsets & offsets, + PaddedPODArray & res) + { + const auto & emotional_dict = FrequencyHolder::getInstance().getEmotionalDict(); + + size_t size = offsets.size(); + size_t prev_offset = 0; + for (size_t i = 0; i < size; ++i) + { + res[i] = detectTonality(data.data() + prev_offset, offsets[i] - 1 - prev_offset, emotional_dict); + prev_offset = offsets[i]; + } + } +}; + +struct NameDetectTonality +{ + static constexpr auto name = "detectTonality"; +}; + +using FunctionDetectTonality = FunctionTextClassificationFloat; + +void registerFunctionDetectTonality(FunctionFactory & factory) +{ + factory.registerFunction(); +} + +} diff --git a/src/Functions/array/arrayAggregation.cpp b/src/Functions/array/arrayAggregation.cpp index da2304e1bb6..ee08c4f7f37 100644 --- a/src/Functions/array/arrayAggregation.cpp +++ b/src/Functions/array/arrayAggregation.cpp @@ -157,11 +157,11 @@ struct ArrayAggregateImpl return false; const AggregationType x = column_const->template getValue(); // NOLINT - const auto & data = checkAndGetColumn(&column_const->getDataColumn())->getData(); + const ColVecType * column_typed = checkAndGetColumn(&column_const->getDataColumn()); typename ColVecResultType::MutablePtr res_column; if constexpr (is_decimal) - res_column = ColVecResultType::create(offsets.size(), data.getScale()); + res_column = ColVecResultType::create(offsets.size(), column_typed->getScale()); else res_column = ColVecResultType::create(offsets.size()); @@ -185,7 +185,7 @@ struct ArrayAggregateImpl { if constexpr (is_decimal) { - res[i] = DecimalUtils::convertTo(x, data.getScale()); + res[i] = DecimalUtils::convertTo(x, column_typed->getScale()); } else { @@ -210,11 +210,11 @@ struct ArrayAggregateImpl throw Exception(ErrorCodes::DECIMAL_OVERFLOW, "Decimal math overflow"); } - auto result_scale = data.getScale() * array_size; + auto result_scale = column_typed->getScale() * array_size; if (unlikely(result_scale > DecimalUtils::max_precision)) throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Scale {} is out of bounds", result_scale); - res[i] = DecimalUtils::convertTo(product, data.getScale() * array_size); + res[i] = DecimalUtils::convertTo(product, result_scale); } else { @@ -236,7 +236,7 @@ struct ArrayAggregateImpl typename ColVecResultType::MutablePtr res_column; if constexpr (is_decimal) - res_column = ColVecResultType::create(offsets.size(), data.getScale()); + res_column = ColVecResultType::create(offsets.size(), column->getScale()); else res_column = ColVecResultType::create(offsets.size()); @@ -309,7 +309,7 @@ struct ArrayAggregateImpl if constexpr (is_decimal) { aggregate_value = aggregate_value / AggregationType(count); - res[i] = DecimalUtils::convertTo(aggregate_value, data.getScale()); + res[i] = DecimalUtils::convertTo(aggregate_value, column->getScale()); } else { @@ -318,7 +318,7 @@ struct ArrayAggregateImpl } else if constexpr (aggregate_operation == AggregateOperation::product && is_decimal) { - auto result_scale = data.getScale() * count; + auto result_scale = column->getScale() * count; if (unlikely(result_scale > DecimalUtils::max_precision)) throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Scale {} is out of bounds", result_scale); diff --git a/src/Functions/array/arrayCompact.cpp b/src/Functions/array/arrayCompact.cpp index 7914b9a154e..c2908e37e12 100644 --- 
a/src/Functions/array/arrayCompact.cpp +++ b/src/Functions/array/arrayCompact.cpp @@ -40,7 +40,7 @@ struct ArrayCompactImpl typename ColVecType::MutablePtr res_values_column; if constexpr (is_decimal) - res_values_column = ColVecType::create(src_values.size(), src_values.getScale()); + res_values_column = ColVecType::create(src_values.size(), src_values_column->getScale()); else res_values_column = ColVecType::create(src_values.size()); diff --git a/src/Functions/array/arrayCumSum.cpp b/src/Functions/array/arrayCumSum.cpp index da8ef3d7852..467d9ad3951 100644 --- a/src/Functions/array/arrayCumSum.cpp +++ b/src/Functions/array/arrayCumSum.cpp @@ -101,9 +101,8 @@ struct ArrayCumSumImpl typename ColVecResult::MutablePtr res_nested; if constexpr (is_decimal) { - const typename ColVecType::Container & data = - checkAndGetColumn(&column_const->getDataColumn())->getData(); - res_nested = ColVecResult::create(0, data.getScale()); + const ColVecType * column_typed = checkAndGetColumn(&column_const->getDataColumn()); + res_nested = ColVecResult::create(0, column_typed->getScale()); } else res_nested = ColVecResult::create(); @@ -120,7 +119,7 @@ struct ArrayCumSumImpl typename ColVecResult::MutablePtr res_nested; if constexpr (is_decimal) - res_nested = ColVecResult::create(0, data.getScale()); + res_nested = ColVecResult::create(0, column->getScale()); else res_nested = ColVecResult::create(); diff --git a/src/Functions/array/arrayCumSumNonNegative.cpp b/src/Functions/array/arrayCumSumNonNegative.cpp index c40df27c1cc..476bbd08163 100644 --- a/src/Functions/array/arrayCumSumNonNegative.cpp +++ b/src/Functions/array/arrayCumSumNonNegative.cpp @@ -83,7 +83,7 @@ struct ArrayCumSumNonNegativeImpl typename ColVecResult::MutablePtr res_nested; if constexpr (is_decimal) - res_nested = ColVecResult::create(0, data.getScale()); + res_nested = ColVecResult::create(0, column->getScale()); else res_nested = ColVecResult::create(); diff --git a/src/Functions/array/arrayDifference.cpp b/src/Functions/array/arrayDifference.cpp index 97243f2cf74..c5fdf27100b 100644 --- a/src/Functions/array/arrayDifference.cpp +++ b/src/Functions/array/arrayDifference.cpp @@ -105,7 +105,7 @@ struct ArrayDifferenceImpl typename ColVecResult::MutablePtr res_nested; if constexpr (is_decimal) - res_nested = ColVecResult::create(0, data.getScale()); + res_nested = ColVecResult::create(0, column->getScale()); else res_nested = ColVecResult::create(); diff --git a/src/Functions/config_functions.h.in b/src/Functions/config_functions.h.in index 89274bcbfa3..a693611f975 100644 --- a/src/Functions/config_functions.h.in +++ b/src/Functions/config_functions.h.in @@ -8,4 +8,5 @@ #cmakedefine01 USE_H3 #cmakedefine01 USE_S2_GEOMETRY #cmakedefine01 USE_FASTOPS +#cmakedefine01 USE_NLP #cmakedefine01 USE_HYPERSCAN diff --git a/src/Functions/dateName.cpp b/src/Functions/dateName.cpp index c89a7f80dfd..eef9bc3955b 100644 --- a/src/Functions/dateName.cpp +++ b/src/Functions/dateName.cpp @@ -148,7 +148,7 @@ public: UInt32 scale [[maybe_unused]] = 0; if constexpr (std::is_same_v) { - scale = times_data.getScale(); + scale = times->getScale(); } auto result_column = ColumnString::create(); diff --git a/src/Functions/degrees.cpp b/src/Functions/degrees.cpp index 481be7c7beb..543b7ac7315 100644 --- a/src/Functions/degrees.cpp +++ b/src/Functions/degrees.cpp @@ -12,8 +12,7 @@ namespace static constexpr auto name = "degrees"; }; - template - Float64 degrees(T r) + Float64 degrees(Float64 r) { Float64 degrees = r * (180 / M_PI); return degrees; diff --git 
a/src/Functions/divide/CMakeLists.txt b/src/Functions/divide/CMakeLists.txt index e5a10f0817c..03c2faaf3cf 100644 --- a/src/Functions/divide/CMakeLists.txt +++ b/src/Functions/divide/CMakeLists.txt @@ -3,17 +3,17 @@ if (ARCH_AMD64) add_library(divide_impl_sse2 divideImpl.cpp) target_compile_options(divide_impl_sse2 PRIVATE -msse2 -DNAMESPACE=SSE2) - target_link_libraries(divide_impl_sse2 libdivide) + target_link_libraries(divide_impl_sse2 ch_contrib::libdivide) add_library(divide_impl_avx2 divideImpl.cpp) target_compile_options(divide_impl_avx2 PRIVATE -mavx2 -DNAMESPACE=AVX2) - target_link_libraries(divide_impl_avx2 libdivide) + target_link_libraries(divide_impl_avx2 ch_contrib::libdivide) set(IMPLEMENTATIONS divide_impl_sse2 divide_impl_avx2) else () add_library(divide_impl_generic divideImpl.cpp) target_compile_options(divide_impl_generic PRIVATE -DNAMESPACE=Generic) - target_link_libraries(divide_impl_generic libdivide) + target_link_libraries(divide_impl_generic ch_contrib::libdivide) set(IMPLEMENTATIONS divide_impl_generic) endif () diff --git a/src/Functions/erf.cpp b/src/Functions/erf.cpp index 9c7ca637736..cb9428a3dbd 100644 --- a/src/Functions/erf.cpp +++ b/src/Functions/erf.cpp @@ -7,7 +7,7 @@ namespace { struct ErfName { static constexpr auto name = "erf"; }; -using FunctionErf = FunctionMathUnary>; +using FunctionErf = FunctionMathUnary>; } diff --git a/src/Functions/erfc.cpp b/src/Functions/erfc.cpp index 8e0b462ff8e..48707ff26d6 100644 --- a/src/Functions/erfc.cpp +++ b/src/Functions/erfc.cpp @@ -7,7 +7,7 @@ namespace { struct ErfcName { static constexpr auto name = "erfc"; }; -using FunctionErfc = FunctionMathUnary>; +using FunctionErfc = FunctionMathUnary>; } diff --git a/src/Functions/formatDateTime.cpp b/src/Functions/formatDateTime.cpp index 9f303b86ad3..e2ec90f4e61 100644 --- a/src/Functions/formatDateTime.cpp +++ b/src/Functions/formatDateTime.cpp @@ -440,7 +440,7 @@ public: UInt32 scale [[maybe_unused]] = 0; if constexpr (std::is_same_v) { - scale = vec.getScale(); + scale = times->getScale(); } auto col_res = ColumnString::create(); diff --git a/src/Functions/h3EdgeAngle.cpp b/src/Functions/h3EdgeAngle.cpp index 5d5ad6cd1d3..a65fde285f1 100644 --- a/src/Functions/h3EdgeAngle.cpp +++ b/src/Functions/h3EdgeAngle.cpp @@ -8,7 +8,6 @@ #include #include #include -#include #include #include @@ -20,6 +19,7 @@ namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int ARGUMENT_OUT_OF_BOUND; + extern const int ILLEGAL_COLUMN; } namespace @@ -52,7 +52,16 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { - const auto * col_hindex = arguments[0].column.get(); + const auto * column = checkAndGetColumn(arguments[0].column.get()); + if (!column) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. 
Must be UInt8.", + arguments[0].type->getName(), + 1, + getName()); + + const auto & data = column->getData(); auto dst = ColumnVector::create(); auto & dst_data = dst->getData(); @@ -60,12 +69,14 @@ public: for (size_t row = 0; row < input_rows_count; ++row) { - const int resolution = col_hindex->getUInt(row); + const UInt8 resolution = data[row]; if (resolution > MAX_H3_RES) throw Exception( ErrorCodes::ARGUMENT_OUT_OF_BOUND, "The argument 'resolution' ({}) of function {} is out of bounds because the maximum resolution in H3 library is ", - resolution, getName(), MAX_H3_RES); + toString(resolution), + getName(), + MAX_H3_RES); // Numerical constant is 180 degrees / pi / Earth radius, Earth radius is from h3 sources Float64 res = 8.99320592271288084e-6 * getHexagonEdgeLengthAvgM(resolution); diff --git a/src/Functions/h3EdgeLengthKm.cpp b/src/Functions/h3EdgeLengthKm.cpp new file mode 100644 index 00000000000..0cc485e93b1 --- /dev/null +++ b/src/Functions/h3EdgeLengthKm.cpp @@ -0,0 +1,98 @@ +#include "config_functions.h" + +#if USE_H3 + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +namespace DB +{ +namespace ErrorCodes +{ +extern const int ILLEGAL_TYPE_OF_ARGUMENT; +extern const int ARGUMENT_OUT_OF_BOUND; +extern const int ILLEGAL_COLUMN; +} + +namespace +{ + +class FunctionH3EdgeLengthKm : public IFunction +{ +public: + static constexpr auto name = "h3EdgeLengthKm"; + + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + + std::string getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 1; } + bool useDefaultImplementationForConstants() const override { return true; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + const auto * arg = arguments[0].get(); + if (!WhichDataType(arg).isUInt8()) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of argument {} of function {}. Must be UInt8", + arg->getName(), 1, getName()); + + return std::make_shared(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + const auto * column = checkAndGetColumn(arguments[0].column.get()); + if (!column) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. 
Must be UInt8", + arguments[0].type->getName(), + 1, + getName()); + + const auto & data = column->getData(); + + auto dst = ColumnVector::create(); + auto & dst_data = dst->getData(); + dst_data.resize(input_rows_count); + + for (size_t row = 0; row < input_rows_count; ++row) + { + const UInt8 resolution = data[row]; + if (resolution > MAX_H3_RES) + throw Exception( + ErrorCodes::ARGUMENT_OUT_OF_BOUND, + "The argument 'resolution' ({}) of function {} is out of bounds because the maximum resolution in H3 library is ", + toString(resolution), + getName(), + MAX_H3_RES); + Float64 res = getHexagonEdgeLengthAvgKm(resolution); + dst_data[row] = res; + } + + return dst; + } +}; + +} + +void registerFunctionH3EdgeLengthKm(FunctionFactory & factory) +{ + factory.registerFunction(); +} + +} + +#endif diff --git a/src/Functions/h3EdgeLengthM.cpp b/src/Functions/h3EdgeLengthM.cpp index 3eef9be9345..08b78517be9 100644 --- a/src/Functions/h3EdgeLengthM.cpp +++ b/src/Functions/h3EdgeLengthM.cpp @@ -20,6 +20,7 @@ namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int ARGUMENT_OUT_OF_BOUND; + extern const int ILLEGAL_COLUMN; } namespace @@ -57,7 +58,16 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { - const auto * col_hindex = arguments[0].column.get(); + const auto * column = checkAndGetColumn(arguments[0].column.get()); + if (!column) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. Must be UInt8", + arguments[0].column->getName(), + 1, + getName()); + + const auto & data = column->getData(); auto dst = ColumnVector::create(); auto & dst_data = dst->getData(); @@ -65,12 +75,12 @@ public: for (size_t row = 0; row < input_rows_count; ++row) { - const UInt64 resolution = col_hindex->getUInt(row); + const UInt8 resolution = data[row]; if (resolution > MAX_H3_RES) throw Exception( ErrorCodes::ARGUMENT_OUT_OF_BOUND, "The argument 'resolution' ({}) of function {} is out of bounds because the maximum resolution in H3 library is ", - resolution, getName(), MAX_H3_RES); + toString(resolution), getName(), MAX_H3_RES); Float64 res = getHexagonEdgeLengthAvgM(resolution); diff --git a/src/Functions/h3ExactEdgeLengthKm.cpp b/src/Functions/h3ExactEdgeLengthKm.cpp new file mode 100644 index 00000000000..7aa9e573bed --- /dev/null +++ b/src/Functions/h3ExactEdgeLengthKm.cpp @@ -0,0 +1,90 @@ +#include "config_functions.h" + +#if USE_H3 + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +namespace DB +{ +namespace ErrorCodes +{ +extern const int ILLEGAL_TYPE_OF_ARGUMENT; +extern const int ILLEGAL_COLUMN; +} + +namespace +{ + +class FunctionH3ExactEdgeLengthKm : public IFunction +{ +public: + static constexpr auto name = "h3ExactEdgeLengthKm"; + + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + + std::string getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 1; } + bool useDefaultImplementationForConstants() const override { return true; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + const auto * arg = arguments[0].get(); + if (!WhichDataType(arg).isUInt64()) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of argument {} of function {}. 
Must be UInt64", + arg->getName(), 1, getName()); + + return std::make_shared(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + const auto * column = checkAndGetColumn(arguments[0].column.get()); + if (!column) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. Must be UInt64", + arguments[0].type->getName(), + 1, + getName()); + + const auto & data = column->getData(); + + auto dst = ColumnVector::create(); + auto & dst_data = dst->getData(); + dst_data.resize(input_rows_count); + + for (size_t row = 0; row < input_rows_count; ++row) + { + const UInt64 index = data[row]; + Float64 res = exactEdgeLengthKm(index); + dst_data[row] = res; + } + + return dst; + } +}; + +} + +void registerFunctionH3ExactEdgeLengthKm(FunctionFactory & factory) +{ + factory.registerFunction(); +} + +} + +#endif diff --git a/src/Functions/h3ExactEdgeLengthM.cpp b/src/Functions/h3ExactEdgeLengthM.cpp new file mode 100644 index 00000000000..5b7cb91e427 --- /dev/null +++ b/src/Functions/h3ExactEdgeLengthM.cpp @@ -0,0 +1,90 @@ +#include "config_functions.h" + +#if USE_H3 + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +namespace DB +{ +namespace ErrorCodes +{ +extern const int ILLEGAL_TYPE_OF_ARGUMENT; +extern const int ILLEGAL_COLUMN; +} + +namespace +{ + +class FunctionH3ExactEdgeLengthM : public IFunction +{ +public: + static constexpr auto name = "h3ExactEdgeLengthM"; + + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + + std::string getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 1; } + bool useDefaultImplementationForConstants() const override { return true; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + const auto * arg = arguments[0].get(); + if (!WhichDataType(arg).isUInt64()) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of argument {} of function {}. Must be UInt64", + arg->getName(), 1, getName()); + + return std::make_shared(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + const auto * column = checkAndGetColumn(arguments[0].column.get()); + if (!column) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. 
Must be UInt64", + arguments[0].type->getName(), + 1, + getName()); + + const auto & data = column->getData(); + + auto dst = ColumnVector::create(); + auto & dst_data = dst->getData(); + dst_data.resize(input_rows_count); + + for (size_t row = 0; row < input_rows_count; ++row) + { + const UInt64 index = data[row]; + Float64 res = exactEdgeLengthM(index); + dst_data[row] = res; + } + + return dst; + } +}; + +} + +void registerFunctionH3ExactEdgeLengthM(FunctionFactory & factory) +{ + factory.registerFunction(); +} + +} + +#endif diff --git a/src/Functions/h3ExactEdgeLengthRads.cpp b/src/Functions/h3ExactEdgeLengthRads.cpp new file mode 100644 index 00000000000..d2b9345c989 --- /dev/null +++ b/src/Functions/h3ExactEdgeLengthRads.cpp @@ -0,0 +1,90 @@ +#include "config_functions.h" + +#if USE_H3 + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +namespace DB +{ +namespace ErrorCodes +{ +extern const int ILLEGAL_TYPE_OF_ARGUMENT; +extern const int ILLEGAL_COLUMN; +} + +namespace +{ + +class FunctionH3ExactEdgeLengthRads : public IFunction +{ +public: + static constexpr auto name = "h3ExactEdgeLengthRads"; + + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + + std::string getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 1; } + bool useDefaultImplementationForConstants() const override { return true; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + const auto * arg = arguments[0].get(); + if (!WhichDataType(arg).isUInt64()) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of argument {} of function {}. Must be UInt64", + arg->getName(), 1, getName()); + + return std::make_shared(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + const auto * column = checkAndGetColumn(arguments[0].column.get()); + if (!column) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. Must be UInt64", + arguments[0].type->getName(), + 1, + getName()); + + const auto & data = column->getData(); + + auto dst = ColumnVector::create(); + auto & dst_data = dst->getData(); + dst_data.resize(input_rows_count); + + for (size_t row = 0; row < input_rows_count; ++row) + { + const UInt64 index = data[row]; + Float64 res = exactEdgeLengthRads(index); + dst_data[row] = res; + } + + return dst; + } +}; + +} + +void registerFunctionH3ExactEdgeLengthRads(FunctionFactory & factory) +{ + factory.registerFunction(); +} + +} + +#endif diff --git a/src/Functions/h3GetBaseCell.cpp b/src/Functions/h3GetBaseCell.cpp index 83978919f2c..7865f454815 100644 --- a/src/Functions/h3GetBaseCell.cpp +++ b/src/Functions/h3GetBaseCell.cpp @@ -17,6 +17,7 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int ILLEGAL_COLUMN; } namespace @@ -49,7 +50,16 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { - const auto * col_hindex = arguments[0].column.get(); + const auto * column = checkAndGetColumn(arguments[0].column.get()); + if (!column) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. 
Must be UInt64.", + arguments[0].type->getName(), + 1, + getName()); + + const auto & data = column->getData(); auto dst = ColumnVector::create(); auto & dst_data = dst->getData(); @@ -57,7 +67,7 @@ public: for (size_t row = 0; row < input_rows_count; ++row) { - const UInt64 hindex = col_hindex->getUInt(row); + const UInt64 hindex = data[row]; UInt8 res = getBaseCellNumber(hindex); diff --git a/src/Functions/h3GetResolution.cpp b/src/Functions/h3GetResolution.cpp index 02b634dac89..7cc7dab8916 100644 --- a/src/Functions/h3GetResolution.cpp +++ b/src/Functions/h3GetResolution.cpp @@ -17,6 +17,7 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int ILLEGAL_COLUMN; } namespace @@ -49,7 +50,16 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { - const auto * col_hindex = arguments[0].column.get(); + const auto * column = checkAndGetColumn(arguments[0].column.get()); + if (!column) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. Must be UInt64.", + arguments[0].type->getName(), + 1, + getName()); + + const auto & data = column->getData(); auto dst = ColumnVector::create(); auto & dst_data = dst->getData(); @@ -57,7 +67,7 @@ public: for (size_t row = 0; row < input_rows_count; ++row) { - const UInt64 hindex = col_hindex->getUInt(row); + const UInt64 hindex = data[row]; UInt8 res = getResolution(hindex); diff --git a/src/Functions/h3HexAreaKm2.cpp b/src/Functions/h3HexAreaKm2.cpp index 933fcf21424..74b74a351a1 100644 --- a/src/Functions/h3HexAreaKm2.cpp +++ b/src/Functions/h3HexAreaKm2.cpp @@ -70,12 +70,12 @@ public: for (size_t row = 0; row < input_rows_count; ++row) { - const UInt64 resolution = data[row]; + const UInt8 resolution = data[row]; if (resolution > MAX_H3_RES) throw Exception( ErrorCodes::ARGUMENT_OUT_OF_BOUND, "The argument 'resolution' ({}) of function {} is out of bounds because the maximum resolution in H3 library is ", - resolution, + toString(resolution), getName(), MAX_H3_RES); diff --git a/src/Functions/h3HexAreaM2.cpp b/src/Functions/h3HexAreaM2.cpp index 96b301806a5..ff68d01bf68 100644 --- a/src/Functions/h3HexAreaM2.cpp +++ b/src/Functions/h3HexAreaM2.cpp @@ -20,6 +20,7 @@ namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int ARGUMENT_OUT_OF_BOUND; + extern const int ILLEGAL_COLUMN; } namespace @@ -52,7 +53,16 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { - const auto * col_hindex = arguments[0].column.get(); + const auto * column = checkAndGetColumn(arguments[0].column.get()); + if (!column) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. 
Must be UInt8", + arguments[0].column->getName(), + 1, + getName()); + + const auto & data = column->getData(); auto dst = ColumnVector::create(); auto & dst_data = dst->getData(); @@ -60,12 +70,14 @@ public: for (size_t row = 0; row < input_rows_count; ++row) { - const UInt64 resolution = col_hindex->getUInt(row); + const UInt8 resolution = data[row]; if (resolution > MAX_H3_RES) throw Exception( ErrorCodes::ARGUMENT_OUT_OF_BOUND, "The argument 'resolution' ({}) of function {} is out of bounds because the maximum resolution in H3 library is ", - resolution, getName(), MAX_H3_RES); + toString(resolution), + getName(), + MAX_H3_RES); Float64 res = getHexagonAreaAvgM2(resolution); diff --git a/src/Functions/h3IndexesAreNeighbors.cpp b/src/Functions/h3IndexesAreNeighbors.cpp index 27eaacad4d6..97b8461fc5a 100644 --- a/src/Functions/h3IndexesAreNeighbors.cpp +++ b/src/Functions/h3IndexesAreNeighbors.cpp @@ -17,6 +17,7 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int ILLEGAL_COLUMN; } namespace @@ -56,8 +57,27 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { - const auto * col_hindex_origin = arguments[0].column.get(); - const auto * col_hindex_dest = arguments[1].column.get(); + const auto * col_hindex_origin = checkAndGetColumn(arguments[0].column.get()); + if (!col_hindex_origin) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. Must be UInt64.", + arguments[0].type->getName(), + 1, + getName()); + + const auto & data_hindex_origin = col_hindex_origin->getData(); + + const auto * col_hindex_dest = checkAndGetColumn(arguments[1].column.get()); + if (!col_hindex_dest) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. Must be UInt64.", + arguments[1].type->getName(), + 2, + getName()); + + const auto & data_hindex_dest = col_hindex_dest->getData(); auto dst = ColumnVector::create(); auto & dst_data = dst->getData(); @@ -65,8 +85,8 @@ public: for (size_t row = 0; row < input_rows_count; ++row) { - const UInt64 hindex_origin = col_hindex_origin->getUInt(row); - const UInt64 hindex_dest = col_hindex_dest->getUInt(row); + const UInt64 hindex_origin = data_hindex_origin[row]; + const UInt64 hindex_dest = data_hindex_dest[row]; UInt8 res = areNeighborCells(hindex_origin, hindex_dest); diff --git a/src/Functions/h3IsValid.cpp b/src/Functions/h3IsValid.cpp index aa109eee6b4..7c97e77250c 100644 --- a/src/Functions/h3IsValid.cpp +++ b/src/Functions/h3IsValid.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include @@ -17,6 +16,7 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int ILLEGAL_COLUMN; } namespace @@ -49,7 +49,16 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { - const auto * col_hindex = arguments[0].column.get(); + const auto * column = checkAndGetColumn(arguments[0].column.get()); + if (!column) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. 
Must be UInt64.", + arguments[0].type->getName(), + 1, + getName()); + + const auto & data = column->getData(); auto dst = ColumnVector::create(); auto & dst_data = dst->getData(); @@ -57,7 +66,7 @@ public: for (size_t row = 0; row < input_rows_count; ++row) { - const UInt64 hindex = col_hindex->getUInt(row); + const UInt64 hindex = data[row]; UInt8 is_valid = isValidCell(hindex) == 0 ? 0 : 1; diff --git a/src/Functions/h3NumHexagons.cpp b/src/Functions/h3NumHexagons.cpp new file mode 100644 index 00000000000..4336e441cfb --- /dev/null +++ b/src/Functions/h3NumHexagons.cpp @@ -0,0 +1,95 @@ +#include "config_functions.h" + +#if USE_H3 + +#include +#include +#include +#include +#include +#include + +#include +#include + + +namespace DB +{ +namespace ErrorCodes +{ +extern const int ILLEGAL_TYPE_OF_ARGUMENT; +extern const int ILLEGAL_COLUMN; +extern const int ARGUMENT_OUT_OF_BOUND; +} + +namespace +{ + +class FunctionH3NumHexagons : public IFunction +{ +public: + static constexpr auto name = "h3NumHexagons"; + + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + + std::string getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 1; } + bool useDefaultImplementationForConstants() const override { return true; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + const auto * arg = arguments[0].get(); + if (!WhichDataType(arg).isUInt8()) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of argument {} of function {}. Must be UInt8", + arg->getName(), 1, getName()); + + return std::make_shared(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + const auto * column = checkAndGetColumn(arguments[0].column.get()); + if (!column) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. 
Must be UInt8", + arguments[0].type->getName(), + 1, + getName()); + + const auto & data = column->getData(); + + auto dst = ColumnVector::create(); + auto & dst_data = dst->getData(); + dst_data.resize(input_rows_count); + + for (size_t row = 0; row < input_rows_count; ++row) + { + const UInt8 resolution = data[row]; + if (resolution > MAX_H3_RES) + throw Exception( + ErrorCodes::ARGUMENT_OUT_OF_BOUND, + "The argument 'resolution' ({}) of function {} is out of bounds because the maximum resolution in H3 library is ", + toString(resolution), getName(), MAX_H3_RES); + Int64 res = getNumCells(resolution); + dst_data[row] = res; + } + + return dst; + } +}; + +} + +void registerFunctionH3NumHexagons(FunctionFactory & factory) +{ + factory.registerFunction(); +} + +} + +#endif diff --git a/src/Functions/h3ToChildren.cpp b/src/Functions/h3ToChildren.cpp index fcd7465e79f..a825255495d 100644 --- a/src/Functions/h3ToChildren.cpp +++ b/src/Functions/h3ToChildren.cpp @@ -26,6 +26,7 @@ namespace ErrorCodes extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int ARGUMENT_OUT_OF_BOUND; extern const int TOO_LARGE_ARRAY_SIZE; + extern const int ILLEGAL_COLUMN; } namespace @@ -65,8 +66,28 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { - const auto * col_hindex = arguments[0].column.get(); - const auto * col_resolution = arguments[1].column.get(); + const auto * col_hindex = checkAndGetColumn(arguments[0].column.get()); + if (!col_hindex) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. Must be UInt64.", + arguments[0].type->getName(), + 1, + getName()); + + const auto & data_hindex = col_hindex->getData(); + + const auto * col_resolution = checkAndGetColumn(arguments[1].column.get()); + if (!col_resolution) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. Must be UInt8.", + arguments[1].type->getName(), + 2, + getName()); + + const auto & data_resolution = col_resolution->getData(); + auto dst = ColumnArray::create(ColumnUInt64::create()); auto & dst_data = dst->getData(); @@ -76,8 +97,8 @@ public: for (size_t row = 0; row < input_rows_count; ++row) { - const UInt64 parent_hindex = col_hindex->getUInt(row); - const UInt8 child_resolution = col_resolution->getUInt(row); + const UInt64 parent_hindex = data_hindex[row]; + const UInt8 child_resolution = data_resolution[row]; if (child_resolution > MAX_H3_RES) throw Exception( diff --git a/src/Functions/h3ToGeoBoundary.cpp b/src/Functions/h3ToGeoBoundary.cpp index e1e34d1eef3..2c892fb59ae 100644 --- a/src/Functions/h3ToGeoBoundary.cpp +++ b/src/Functions/h3ToGeoBoundary.cpp @@ -20,6 +20,7 @@ namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int INCORRECT_DATA; + extern const int ILLEGAL_COLUMN; } class FunctionH3ToGeoBoundary : public IFunction @@ -51,7 +52,16 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { - const auto * col_hindex = arguments[0].column.get(); + const auto * column = checkAndGetColumn(arguments[0].column.get()); + if (!column) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. 
Must be UInt64.", + arguments[0].type->getName(), + 1, + getName()); + + const auto & data = column->getData(); auto latitude = ColumnFloat64::create(); auto longitude = ColumnFloat64::create(); @@ -61,7 +71,7 @@ public: for (size_t row = 0; row < input_rows_count; ++row) { - H3Index h3index = col_hindex->getUInt(row); + H3Index h3index = data[row]; CellBoundary boundary{}; auto err = cellToBoundary(h3index, &boundary); diff --git a/src/Functions/h3ToParent.cpp b/src/Functions/h3ToParent.cpp index fef1b16696f..b2262d3ac22 100644 --- a/src/Functions/h3ToParent.cpp +++ b/src/Functions/h3ToParent.cpp @@ -20,6 +20,7 @@ namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int ARGUMENT_OUT_OF_BOUND; + extern const int ILLEGAL_COLUMN; } namespace @@ -59,8 +60,27 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { - const auto * col_hindex = arguments[0].column.get(); - const auto * col_resolution = arguments[1].column.get(); + const auto * col_hindex = checkAndGetColumn(arguments[0].column.get()); + if (!col_hindex) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. Must be UInt64.", + arguments[0].type->getName(), + 1, + getName()); + + const auto & data_hindex = col_hindex->getData(); + + const auto * col_resolution = checkAndGetColumn(arguments[1].column.get()); + if (!col_resolution) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. Must be UInt8.", + arguments[1].type->getName(), + 2, + getName()); + + const auto & data_resolution = col_resolution->getData(); auto dst = ColumnVector::create(); auto & dst_data = dst->getData(); @@ -68,14 +88,16 @@ public: for (size_t row = 0; row < input_rows_count; ++row) { - const UInt64 hindex = col_hindex->getUInt(row); - const UInt8 resolution = col_resolution->getUInt(row); + const UInt64 hindex = data_hindex[row]; + const UInt8 resolution = data_resolution[row]; if (resolution > MAX_H3_RES) throw Exception( ErrorCodes::ARGUMENT_OUT_OF_BOUND, "The argument 'resolution' ({}) of function {} is out of bounds because the maximum resolution in H3 library is {}", - toString(resolution), getName(), toString(MAX_H3_RES)); + toString(resolution), + getName(), + toString(MAX_H3_RES)); UInt64 res = cellToParent(hindex, resolution); diff --git a/src/Functions/h3ToString.cpp b/src/Functions/h3ToString.cpp index 25d82d2d40c..0152dd69b6a 100644 --- a/src/Functions/h3ToString.cpp +++ b/src/Functions/h3ToString.cpp @@ -3,6 +3,7 @@ #if USE_H3 #include +#include #include #include #include @@ -17,6 +18,7 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int ILLEGAL_COLUMN; } namespace @@ -51,7 +53,17 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { - const auto * col_hindex = arguments[0].column.get(); + const auto * column = checkAndGetColumn(arguments[0].column.get()); + if (!column) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. 
Must be UInt64.", + arguments[0].type->getName(), + 1, + getName()); + + const auto & data = column->getData(); + auto col_res = ColumnString::create(); auto & vec_res = col_res->getChars(); @@ -63,9 +75,9 @@ public: char * begin = reinterpret_cast(vec_res.data()); char * pos = begin; - for (size_t i = 0; i < input_rows_count; ++i) + for (size_t row = 0; row < input_rows_count; ++row) { - const UInt64 hindex = col_hindex->getUInt(i); + const UInt64 hindex = data[row]; if (!isValidCell(hindex)) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Invalid H3 index: {}", hindex); @@ -76,7 +88,7 @@ public: while (*pos != '\0') pos++; - vec_offsets[i] = ++pos - begin; + vec_offsets[row] = ++pos - begin; } vec_res.resize(pos - begin); return col_res; diff --git a/src/Functions/h3kRing.cpp b/src/Functions/h3kRing.cpp index 8b073cc266d..baa74b6698b 100644 --- a/src/Functions/h3kRing.cpp +++ b/src/Functions/h3kRing.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include @@ -23,6 +22,7 @@ namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int PARAMETER_OUT_OF_BOUND; + extern const int ILLEGAL_COLUMN; } namespace @@ -51,19 +51,41 @@ public: arg->getName(), 1, getName()); arg = arguments[1].get(); - if (!isInteger(arg)) + if (!WhichDataType(arg).isUInt16()) throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of argument {} of function {}. Must be integer", - arg->getName(), 2, getName()); + "Illegal type {} of argument {} of function {}. Must be UInt16", + arg->getName(), + 2, + getName()); return std::make_shared(std::make_shared()); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { - const auto * col_hindex = arguments[0].column.get(); - const auto * col_k = arguments[1].column.get(); + const auto * col_hindex = checkAndGetColumn(arguments[0].column.get()); + if (!col_hindex) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. Must be UInt64.", + arguments[0].type->getName(), + 1, + getName()); + + const auto & data_hindex = col_hindex->getData(); + + /// ColumnUInt16 is sufficient as the max value of 2nd arg is checked (arg > 0 < 10000) in implementation below + const auto * col_k = checkAndGetColumn(arguments[1].column.get()); + if (!col_k) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. Must be UInt16.", + arguments[1].type->getName(), + 2, + getName()); + + const auto & data_k = col_k->getData(); auto dst = ColumnArray::create(ColumnUInt64::create()); auto & dst_data = dst->getData(); @@ -73,8 +95,8 @@ public: for (size_t row = 0; row < input_rows_count; ++row) { - const H3Index origin_hindex = col_hindex->getUInt(row); - const int k = col_k->getInt(row); + const H3Index origin_hindex = data_hindex[row]; + const int k = data_k[row]; /// Overflow is possible. The function maxGridDiskSize does not check for overflow. /// The calculation is similar to square of k but several times more. @@ -82,6 +104,7 @@ public: constexpr auto max_k = 10000; if (k > max_k) throw Exception(ErrorCodes::PARAMETER_OUT_OF_BOUND, "Too large 'k' argument for {} function, maximum {}", getName(), max_k); + /// Check is already made while fetching the argument for k (to determine if it's an unsigned integer). Nevertheless, it's checked again here. 
if (k < 0) throw Exception(ErrorCodes::PARAMETER_OUT_OF_BOUND, "Argument 'k' for {} function must be non negative", getName()); diff --git a/src/Functions/h3toGeo.cpp b/src/Functions/h3toGeo.cpp index d92ac2aadbb..403bcc3c3d6 100644 --- a/src/Functions/h3toGeo.cpp +++ b/src/Functions/h3toGeo.cpp @@ -21,6 +21,7 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int ILLEGAL_COLUMN; } namespace @@ -58,7 +59,16 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { - const auto * col_index = arguments[0].column.get(); + const auto * column = checkAndGetColumn(arguments[0].column.get()); + if (!column) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. Must be UInt64.", + arguments[0].type->getName(), + 1, + getName()); + + const auto & data = column->getData(); auto latitude = ColumnFloat64::create(input_rows_count); auto longitude = ColumnFloat64::create(input_rows_count); @@ -69,7 +79,7 @@ public: for (size_t row = 0; row < input_rows_count; ++row) { - H3Index h3index = col_index->getUInt(row); + H3Index h3index = data[row]; LatLng coord{}; cellToLatLng(h3index,&coord); diff --git a/src/Functions/lgamma.cpp b/src/Functions/lgamma.cpp index e4da0d8dfbd..57ab74f8f87 100644 --- a/src/Functions/lgamma.cpp +++ b/src/Functions/lgamma.cpp @@ -21,7 +21,7 @@ Float64 lgamma_wrapper(Float64 arg) } struct LGammaName { static constexpr auto name = "lgamma"; }; -using FunctionLGamma = FunctionMathUnary>; +using FunctionLGamma = FunctionMathUnary>; } diff --git a/src/Functions/radians.cpp b/src/Functions/radians.cpp index bf7a1fe3394..4dffdc08547 100644 --- a/src/Functions/radians.cpp +++ b/src/Functions/radians.cpp @@ -12,8 +12,7 @@ namespace static constexpr auto name = "radians"; }; - template - Float64 radians(T d) + Float64 radians(Float64 d) { Float64 radians = d * (M_PI / 180); return radians; diff --git a/src/Functions/registerFunctionsGeo.cpp b/src/Functions/registerFunctionsGeo.cpp index 33b15d91e60..0501b603c57 100644 --- a/src/Functions/registerFunctionsGeo.cpp +++ b/src/Functions/registerFunctionsGeo.cpp @@ -30,6 +30,10 @@ void registerFunctionH3ToGeo(FunctionFactory &); void registerFunctionH3ToGeoBoundary(FunctionFactory &); void registerFunctionH3EdgeAngle(FunctionFactory &); void registerFunctionH3EdgeLengthM(FunctionFactory &); +void registerFunctionH3EdgeLengthKm(FunctionFactory &); +void registerFunctionH3ExactEdgeLengthM(FunctionFactory &); +void registerFunctionH3ExactEdgeLengthKm(FunctionFactory &); +void registerFunctionH3ExactEdgeLengthRads(FunctionFactory &); void registerFunctionH3GetResolution(FunctionFactory &); void registerFunctionH3IsValid(FunctionFactory &); void registerFunctionH3KRing(FunctionFactory &); @@ -47,6 +51,7 @@ void registerFunctionH3GetFaces(FunctionFactory &); void registerFunctionH3HexAreaKm2(FunctionFactory &); void registerFunctionH3CellAreaM2(FunctionFactory &); void registerFunctionH3CellAreaRads2(FunctionFactory &); +void registerFunctionH3NumHexagons(FunctionFactory &); #endif @@ -91,6 +96,10 @@ void registerFunctionsGeo(FunctionFactory & factory) registerFunctionH3ToGeoBoundary(factory); registerFunctionH3EdgeAngle(factory); registerFunctionH3EdgeLengthM(factory); + registerFunctionH3EdgeLengthKm(factory); + registerFunctionH3ExactEdgeLengthM(factory); + registerFunctionH3ExactEdgeLengthKm(factory); + registerFunctionH3ExactEdgeLengthRads(factory); 
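    /// Each H3 function registered above is defined in its own translation unit together with a
    /// one-line registration helper. A minimal sketch of that helper, using h3EdgeLengthKm as the
    /// example (the explicit template argument is an assumption, since template parameters are not
    /// visible in this patch view):
    void registerFunctionH3EdgeLengthKm(FunctionFactory & factory)
    {
        factory.registerFunction<FunctionH3EdgeLengthKm>();
    }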
registerFunctionH3GetResolution(factory); registerFunctionH3IsValid(factory); registerFunctionH3KRing(factory); @@ -108,6 +117,7 @@ void registerFunctionsGeo(FunctionFactory & factory) registerFunctionH3HexAreaKm2(factory); registerFunctionH3CellAreaM2(factory); registerFunctionH3CellAreaRads2(factory); + registerFunctionH3NumHexagons(factory); #endif #if USE_S2_GEOMETRY diff --git a/src/Functions/registerFunctionsString.cpp b/src/Functions/registerFunctionsString.cpp index 7d1673aff7c..f86043c6959 100644 --- a/src/Functions/registerFunctionsString.cpp +++ b/src/Functions/registerFunctionsString.cpp @@ -39,6 +39,9 @@ void registerFunctionEncodeXMLComponent(FunctionFactory &); void registerFunctionDecodeXMLComponent(FunctionFactory &); void registerFunctionExtractTextFromHTML(FunctionFactory &); void registerFunctionToStringCutToZero(FunctionFactory &); +void registerFunctionDetectCharset(FunctionFactory &); +void registerFunctionDetectTonality(FunctionFactory &); +void registerFunctionDetectProgrammingLanguage(FunctionFactory &); #if USE_BASE64 void registerFunctionBase64Encode(FunctionFactory &); @@ -50,6 +53,7 @@ void registerFunctionTryBase64Decode(FunctionFactory &); void registerFunctionStem(FunctionFactory &); void registerFunctionSynonyms(FunctionFactory &); void registerFunctionLemmatize(FunctionFactory &); +void registerFunctionsDetectLanguage(FunctionFactory &); #endif #if USE_ICU @@ -91,6 +95,9 @@ void registerFunctionsString(FunctionFactory & factory) registerFunctionDecodeXMLComponent(factory); registerFunctionExtractTextFromHTML(factory); registerFunctionToStringCutToZero(factory); + registerFunctionDetectCharset(factory); + registerFunctionDetectTonality(factory); + registerFunctionDetectProgrammingLanguage(factory); #if USE_BASE64 registerFunctionBase64Encode(factory); @@ -102,6 +109,7 @@ void registerFunctionsString(FunctionFactory & factory) registerFunctionStem(factory); registerFunctionSynonyms(factory); registerFunctionLemmatize(factory); + registerFunctionsDetectLanguage(factory); #endif #if USE_ICU diff --git a/src/Functions/tgamma.cpp b/src/Functions/tgamma.cpp index 8ad00bc79ff..3378d44388d 100644 --- a/src/Functions/tgamma.cpp +++ b/src/Functions/tgamma.cpp @@ -7,7 +7,7 @@ namespace { struct TGammaName { static constexpr auto name = "tgamma"; }; -using FunctionTGamma = FunctionMathUnary>; +using FunctionTGamma = FunctionMathUnary>; } diff --git a/src/IO/Lz4DeflatingWriteBuffer.cpp b/src/IO/Lz4DeflatingWriteBuffer.cpp index da954b13df9..c2eb175d5a9 100644 --- a/src/IO/Lz4DeflatingWriteBuffer.cpp +++ b/src/IO/Lz4DeflatingWriteBuffer.cpp @@ -26,7 +26,7 @@ Lz4DeflatingWriteBuffer::Lz4DeflatingWriteBuffer( 0 /* no dictID */, LZ4F_noBlockChecksum}, compression_level, /* compression level; 0 == default */ - 0, /* autoflush */ + 1, /* autoflush */ 0, /* favor decompression speed */ {0, 0, 0}, /* reserved, must be set to 0 */ }; @@ -125,6 +125,8 @@ void Lz4DeflatingWriteBuffer::nextImpl() out->position() = out->buffer().begin(); throw; } + out->next(); + out_capacity = out->buffer().end() - out->position(); } void Lz4DeflatingWriteBuffer::finalizeBefore() diff --git a/src/IO/Lz4InflatingReadBuffer.cpp b/src/IO/Lz4InflatingReadBuffer.cpp index 61e912d440c..5639badbe0e 100644 --- a/src/IO/Lz4InflatingReadBuffer.cpp +++ b/src/IO/Lz4InflatingReadBuffer.cpp @@ -70,6 +70,12 @@ bool Lz4InflatingReadBuffer::nextImpl() return !working_buffer.empty(); } + /// It may happen that we didn't get new uncompressed data + /// (for example if we read the end of frame). 
Load new data + /// in this case. + if (working_buffer.empty()) + return nextImpl(); + return true; } } diff --git a/src/IO/ReadBuffer.h b/src/IO/ReadBuffer.h index be456ea398c..b6927ffcf0e 100644 --- a/src/IO/ReadBuffer.h +++ b/src/IO/ReadBuffer.h @@ -63,7 +63,10 @@ public: if (!res) working_buffer = Buffer(pos, pos); else + { pos = working_buffer.begin() + nextimpl_working_buffer_offset; + assert(position() != working_buffer.end()); + } nextimpl_working_buffer_offset = 0; assert(position() <= working_buffer.end()); diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index 8a60f9fc42e..9eb584619e9 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include diff --git a/src/IO/examples/CMakeLists.txt b/src/IO/examples/CMakeLists.txt index d79aaa48d83..8c9723a78fb 100644 --- a/src/IO/examples/CMakeLists.txt +++ b/src/IO/examples/CMakeLists.txt @@ -68,7 +68,7 @@ add_executable (zlib_ng_bug zlib_ng_bug.cpp) target_link_libraries (zlib_ng_bug PRIVATE ch_contrib::zlib) add_executable (dragonbox_test dragonbox_test.cpp) -target_link_libraries (dragonbox_test PRIVATE dragonbox_to_chars) +target_link_libraries (dragonbox_test PRIVATE ch_contrib::dragonbox_to_chars) add_executable (zstd_buffers zstd_buffers.cpp) target_link_libraries (zstd_buffers PRIVATE clickhouse_common_io) diff --git a/src/Interpreters/AsynchronousMetrics.cpp b/src/Interpreters/AsynchronousMetrics.cpp index d1c5fbebbc7..72e49505b54 100644 --- a/src/Interpreters/AsynchronousMetrics.cpp +++ b/src/Interpreters/AsynchronousMetrics.cpp @@ -109,6 +109,23 @@ void AsynchronousMetrics::openSensors() else break; } + + file->rewind(); + Int64 temperature = 0; + try + { + readText(temperature, *file); + } + catch (const ErrnoException & e) + { + LOG_WARNING( + &Poco::Logger::get("AsynchronousMetrics"), + "Thermal monitor '{}' exists but could not be read, error {}.", + thermal_device_index, + e.getErrno()); + continue; + } + thermal.emplace_back(std::move(file)); } } @@ -222,6 +239,23 @@ void AsynchronousMetrics::openSensorsChips() std::replace(sensor_name.begin(), sensor_name.end(), ' ', '_'); } + file->rewind(); + Int64 temperature = 0; + try + { + readText(temperature, *file); + } + catch (const ErrnoException & e) + { + LOG_WARNING( + &Poco::Logger::get("AsynchronousMetrics"), + "Hardware monitor '{}', sensor '{}' exists but could not be read, error {}.", + hwmon_name, + sensor_name, + e.getErrno()); + continue; + } + hwmon_devices[hwmon_name][sensor_name] = std::move(file); } } diff --git a/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp b/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp index 0888d3645be..2b92fab15de 100644 --- a/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp +++ b/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp @@ -38,13 +38,11 @@ SelectStreamFactory::SelectStreamFactory( const Block & header_, const ColumnsDescriptionByShardNum & objects_by_shard_, const StorageSnapshotPtr & storage_snapshot_, - QueryProcessingStage::Enum processed_stage_, - bool has_virtual_shard_num_column_) + QueryProcessingStage::Enum processed_stage_) : header(header_), objects_by_shard(objects_by_shard_), storage_snapshot(storage_snapshot_), - processed_stage(processed_stage_), - has_virtual_shard_num_column(has_virtual_shard_num_column_) + processed_stage(processed_stage_) { } @@ -107,23 +105,19 @@ void SelectStreamFactory::createForShard( Shards & remote_shards, UInt32 shard_count) { - auto modified_query_ast = 
query_ast->clone(); - if (has_virtual_shard_num_column) - VirtualColumnUtils::rewriteEntityInAst(modified_query_ast, "_shard_num", shard_info.shard_num, "toUInt32"); - auto it = objects_by_shard.find(shard_info.shard_num); if (it != objects_by_shard.end()) - replaceMissedSubcolumnsByConstants(storage_snapshot->object_columns, it->second, modified_query_ast); + replaceMissedSubcolumnsByConstants(storage_snapshot->object_columns, it->second, query_ast); auto emplace_local_stream = [&]() { - local_plans.emplace_back(createLocalPlan(modified_query_ast, header, context, processed_stage, shard_info.shard_num, shard_count)); + local_plans.emplace_back(createLocalPlan(query_ast, header, context, processed_stage, shard_info.shard_num, shard_count)); }; auto emplace_remote_stream = [&](bool lazy = false, UInt32 local_delay = 0) { remote_shards.emplace_back(Shard{ - .query = modified_query_ast, + .query = query_ast, .header = header, .shard_num = shard_info.shard_num, .num_replicas = shard_info.getAllNodeCount(), diff --git a/src/Interpreters/ClusterProxy/SelectStreamFactory.h b/src/Interpreters/ClusterProxy/SelectStreamFactory.h index 42fa27649bf..731bf3acd10 100644 --- a/src/Interpreters/ClusterProxy/SelectStreamFactory.h +++ b/src/Interpreters/ClusterProxy/SelectStreamFactory.h @@ -21,8 +21,7 @@ public: const Block & header_, const ColumnsDescriptionByShardNum & objects_by_shard_, const StorageSnapshotPtr & storage_snapshot_, - QueryProcessingStage::Enum processed_stage_, - bool has_virtual_shard_num_column_); + QueryProcessingStage::Enum processed_stage_); void createForShard( const Cluster::ShardInfo & shard_info, @@ -39,8 +38,6 @@ private: const ColumnsDescriptionByShardNum objects_by_shard; const StorageSnapshotPtr storage_snapshot; QueryProcessingStage::Enum processed_stage; - - bool has_virtual_shard_num_column = false; }; } diff --git a/src/Interpreters/ExecuteScalarSubqueriesVisitor.cpp b/src/Interpreters/ExecuteScalarSubqueriesVisitor.cpp index 2117eec0063..a81d4204565 100644 --- a/src/Interpreters/ExecuteScalarSubqueriesVisitor.cpp +++ b/src/Interpreters/ExecuteScalarSubqueriesVisitor.cpp @@ -68,7 +68,7 @@ void ExecuteScalarSubqueriesMatcher::visit(ASTPtr & ast, Data & data) static bool worthConvertingToLiteral(const Block & scalar) { const auto * scalar_type_name = scalar.safeGetByPosition(0).type->getFamilyName(); - std::set useless_literal_types = {"Array", "Tuple", "AggregateFunction", "Function", "Set", "LowCardinality"}; + static const std::set useless_literal_types = {"Array", "Tuple", "AggregateFunction", "Function", "Set", "LowCardinality"}; return !useless_literal_types.count(scalar_type_name); } diff --git a/src/Interpreters/ExternalLoader.cpp b/src/Interpreters/ExternalLoader.cpp index b2cd9495feb..aab3a9e7437 100644 --- a/src/Interpreters/ExternalLoader.cpp +++ b/src/Interpreters/ExternalLoader.cpp @@ -966,14 +966,14 @@ private: /// Does the loading, possibly in the separate thread. 
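    /// When executed asynchronously, the worker attaches itself to the supplied thread group via
    /// CurrentThread::attachTo(), and the SCOPE_EXIT_SAFE guard registered beforehand detaches it
    /// again on every exit path.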
void doLoading(const String & name, size_t loading_id, bool forced_to_reload, size_t min_id_to_finish_loading_dependencies_, bool async, ThreadGroupStatusPtr thread_group = {}) { - if (thread_group) - CurrentThread::attachTo(thread_group); - SCOPE_EXIT_SAFE( if (thread_group) CurrentThread::detachQueryIfNotDetached(); ); + if (thread_group) + CurrentThread::attachTo(thread_group); + LOG_TRACE(log, "Start loading object '{}'", name); try { diff --git a/src/Interpreters/InterpreterCreateFunctionQuery.cpp b/src/Interpreters/InterpreterCreateFunctionQuery.cpp index 20114fa0d75..615fbb03403 100644 --- a/src/Interpreters/InterpreterCreateFunctionQuery.cpp +++ b/src/Interpreters/InterpreterCreateFunctionQuery.cpp @@ -60,19 +60,33 @@ void InterpreterCreateFunctionQuery::validateFunction(ASTPtr function, const Str auto & lambda_function = function->as(); auto & lambda_function_expression_list = lambda_function.arguments->children; - const auto & tuple_function_arguments = lambda_function_expression_list.at(0)->as(); + if (lambda_function_expression_list.size() != 2) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Lambda must have arguments and body"); + + const ASTFunction * tuple_function_arguments = lambda_function_expression_list[0]->as(); + + if (!tuple_function_arguments || !tuple_function_arguments->arguments) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Lambda must have valid arguments"); std::unordered_set arguments; - for (const auto & argument : tuple_function_arguments.arguments->children) + for (const auto & argument : tuple_function_arguments->arguments->children) { - const auto & argument_name = argument->as()->name(); + const auto * argument_identifier = argument->as(); + + if (!argument_identifier) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Lambda argument must be identifier"); + + const auto & argument_name = argument_identifier->name(); auto [_, inserted] = arguments.insert(argument_name); if (!inserted) throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Identifier {} already used as function parameter", argument_name); } - ASTPtr function_body = lambda_function_expression_list.at(1); + ASTPtr function_body = lambda_function_expression_list[1]; + if (!function_body) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Lambda must have valid function body"); + validateFunctionRecursiveness(function_body, name); } diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index 123ff6ba2ca..f2ac2565a7f 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -56,7 +56,6 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; extern const int TIMEOUT_EXCEEDED; extern const int TABLE_WAS_NOT_DROPPED; - extern const int NO_ZOOKEEPER; } @@ -142,12 +141,17 @@ void InterpreterSystemQuery::startStopAction(StorageActionBlockType action_type, auto manager = getContext()->getActionLocksManager(); manager->cleanExpired(); + auto access = getContext()->getAccess(); + auto required_access_type = getRequiredAccessType(action_type); + if (volume_ptr && action_type == ActionLocks::PartsMerge) { + access->checkAccess(required_access_type); volume_ptr->setAvoidMergesUserOverride(!start); } else if (table_id) { + access->checkAccess(required_access_type, table_id.database_name, table_id.table_name); auto table = DatabaseCatalog::instance().tryGetTable(table_id, getContext()); if (table) { @@ -162,7 +166,6 @@ void InterpreterSystemQuery::startStopAction(StorageActionBlockType action_type, } 
else { - auto access = getContext()->getAccess(); for (auto & elem : DatabaseCatalog::instance().getDatabases()) { for (auto iterator = elem.second->getTablesIterator(getContext()); iterator->isValid(); iterator->next()) @@ -171,14 +174,9 @@ void InterpreterSystemQuery::startStopAction(StorageActionBlockType action_type, if (!table) continue; - if (!access->isGranted(getRequiredAccessType(action_type), elem.first, iterator->name())) + if (!access->isGranted(required_access_type, elem.first, iterator->name())) { - LOG_INFO( - log, - "Access {} denied, skipping {}.{}", - toString(getRequiredAccessType(action_type)), - elem.first, - iterator->name()); + LOG_INFO(log, "Access {} denied, skipping {}.{}", toString(required_access_type), elem.first, iterator->name()); continue; } @@ -423,8 +421,7 @@ BlockIO InterpreterSystemQuery::execute() restartReplicas(system_context); break; case Type::RESTART_REPLICA: - if (!tryRestartReplica(table_id, system_context)) - throw Exception(ErrorCodes::BAD_ARGUMENTS, table_is_not_replicated.data(), table_id.getNameForLogs()); + restartReplica(table_id, system_context); break; case Type::RESTORE_REPLICA: restoreReplica(); @@ -472,12 +469,6 @@ void InterpreterSystemQuery::restoreReplica() { getContext()->checkAccess(AccessType::SYSTEM_RESTORE_REPLICA, table_id); - const zkutil::ZooKeeperPtr & zookeeper = getContext()->getZooKeeper(); - - if (zookeeper->expired()) - throw Exception(ErrorCodes::NO_ZOOKEEPER, - "Cannot restore table metadata because ZooKeeper session has expired"); - const StoragePtr table_ptr = DatabaseCatalog::instance().getTable(table_id, getContext()); auto * const table_replicated_ptr = dynamic_cast(table_ptr.get()); @@ -485,30 +476,11 @@ void InterpreterSystemQuery::restoreReplica() if (table_replicated_ptr == nullptr) throw Exception(ErrorCodes::BAD_ARGUMENTS, table_is_not_replicated.data(), table_id.getNameForLogs()); - auto & table_replicated = *table_replicated_ptr; - - StorageReplicatedMergeTree::Status status; - table_replicated.getStatus(status); - - if (!status.is_readonly) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Replica must be readonly"); - - const String replica_name = table_replicated.getReplicaName(); - const String& zk_root_path = status.zookeeper_path; - - if (String replica_path = zk_root_path + "replicas/" + replica_name; zookeeper->exists(replica_path)) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Replica path is present at {} -- nothing to restore. " - "If you are sure that metadata it lost and replica path contain some garbage, " - "then use SYSTEM DROP REPLICA query first.", replica_path); - - table_replicated.restoreMetadataInZooKeeper(); + table_replicated_ptr->restoreMetadataInZooKeeper(); } StoragePtr InterpreterSystemQuery::tryRestartReplica(const StorageID & replica, ContextMutablePtr system_context, bool need_ddl_guard) { - getContext()->checkAccess(AccessType::SYSTEM_RESTART_REPLICA, replica); - auto table_ddl_guard = need_ddl_guard ? 
DatabaseCatalog::instance().getDDLGuard(replica.getDatabaseName(), replica.getTableName()) : nullptr; @@ -553,15 +525,36 @@ StoragePtr InterpreterSystemQuery::tryRestartReplica(const StorageID & replica, return table; } +void InterpreterSystemQuery::restartReplica(const StorageID & replica, ContextMutablePtr system_context) +{ + getContext()->checkAccess(AccessType::SYSTEM_RESTART_REPLICA, replica); + if (!tryRestartReplica(replica, system_context)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, table_is_not_replicated.data(), replica.getNameForLogs()); +} + void InterpreterSystemQuery::restartReplicas(ContextMutablePtr system_context) { std::vector replica_names; auto & catalog = DatabaseCatalog::instance(); + auto access = getContext()->getAccess(); + bool access_is_granted_globally = access->isGranted(AccessType::SYSTEM_RESTART_REPLICA); + for (auto & elem : catalog.getDatabases()) + { for (auto it = elem.second->getTablesIterator(getContext()); it->isValid(); it->next()) + { if (dynamic_cast(it->table().get())) + { + if (!access_is_granted_globally && !access->isGranted(AccessType::SYSTEM_RESTART_REPLICA, elem.first, it->name())) + { + LOG_INFO(log, "Access {} denied, skipping {}.{}", "SYSTEM RESTART REPLICA", elem.first, it->name()); + continue; + } replica_names.emplace_back(it->databaseName(), it->name()); + } + } + } if (replica_names.empty()) return; @@ -607,14 +600,22 @@ void InterpreterSystemQuery::dropReplica(ASTSystemQuery & query) } else if (query.is_drop_whole_replica) { - getContext()->checkAccess(AccessType::SYSTEM_DROP_REPLICA); auto databases = DatabaseCatalog::instance().getDatabases(); + auto access = getContext()->getAccess(); + bool access_is_granted_globally = access->isGranted(AccessType::SYSTEM_DROP_REPLICA); for (auto & elem : databases) { DatabasePtr & database = elem.second; for (auto iterator = database->getTablesIterator(getContext()); iterator->isValid(); iterator->next()) + { + if (!access_is_granted_globally && !access->isGranted(AccessType::SYSTEM_DROP_REPLICA, elem.first, iterator->name())) + { + LOG_INFO(log, "Access {} denied, skipping {}.{}", "SYSTEM DROP REPLICA", elem.first, iterator->name()); + continue; + } dropReplicaImpl(query, iterator->table()); + } LOG_TRACE(log, "Dropped replica {} from database {}", query.replica, backQuoteIfNeed(database->getDatabaseName())); } } diff --git a/src/Interpreters/InterpreterSystemQuery.h b/src/Interpreters/InterpreterSystemQuery.h index 6d1ad78a991..e8e16f18d00 100644 --- a/src/Interpreters/InterpreterSystemQuery.h +++ b/src/Interpreters/InterpreterSystemQuery.h @@ -47,6 +47,7 @@ private: /// Returns pointer to a newly created table if the restart was successful StoragePtr tryRestartReplica(const StorageID & replica, ContextMutablePtr context, bool need_ddl_guard = true); + void restartReplica(const StorageID & replica, ContextMutablePtr system_context); void restartReplicas(ContextMutablePtr system_context); void syncReplica(ASTSystemQuery & query); diff --git a/src/Interpreters/MySQL/InterpretersMySQLDDLQuery.cpp b/src/Interpreters/MySQL/InterpretersMySQLDDLQuery.cpp index 9494c4133ff..df74a94ee57 100644 --- a/src/Interpreters/MySQL/InterpretersMySQLDDLQuery.cpp +++ b/src/Interpreters/MySQL/InterpretersMySQLDDLQuery.cpp @@ -108,6 +108,9 @@ static NamesAndTypesList getColumnsList(const ASTExpressionList * columns_defini data_type_function->name = type_name_upper + " UNSIGNED"; } + if (type_name_upper == "SET") + data_type_function->arguments.reset(); + /// Transforms MySQL ENUM's list of strings to ClickHouse 
string-integer pairs /// For example ENUM('a', 'b', 'c') -> ENUM('a'=1, 'b'=2, 'c'=3) /// Elements on a position further than 32767 are assigned negative values, starting with -32768. diff --git a/src/Interpreters/MySQL/tests/gtest_create_rewritten.cpp b/src/Interpreters/MySQL/tests/gtest_create_rewritten.cpp index efa0e921527..680b9bd5606 100644 --- a/src/Interpreters/MySQL/tests/gtest_create_rewritten.cpp +++ b/src/Interpreters/MySQL/tests/gtest_create_rewritten.cpp @@ -40,7 +40,8 @@ TEST(MySQLCreateRewritten, ColumnsDataType) {"TINYINT", "Int8"}, {"SMALLINT", "Int16"}, {"MEDIUMINT", "Int32"}, {"INT", "Int32"}, {"INTEGER", "Int32"}, {"BIGINT", "Int64"}, {"FLOAT", "Float32"}, {"DOUBLE", "Float64"}, {"VARCHAR(10)", "String"}, {"CHAR(10)", "String"}, {"Date", "Date"}, {"DateTime", "DateTime"}, - {"TIMESTAMP", "DateTime"}, {"BOOLEAN", "Bool"}, {"BIT", "UInt64"} + {"TIMESTAMP", "DateTime"}, {"BOOLEAN", "Bool"}, {"BIT", "UInt64"}, {"SET", "UInt64"}, + {"YEAR", "UInt16"}, {"TIME", "Int64"}, {"GEOMETRY", "String"} }; for (const auto & [test_type, mapped_type] : test_types) diff --git a/src/Interpreters/ReplaceQueryParameterVisitor.cpp b/src/Interpreters/ReplaceQueryParameterVisitor.cpp index 25051f68901..03de8aecc92 100644 --- a/src/Interpreters/ReplaceQueryParameterVisitor.cpp +++ b/src/Interpreters/ReplaceQueryParameterVisitor.cpp @@ -69,7 +69,14 @@ void ReplaceQueryParameterVisitor::visitQueryParameter(ASTPtr & ast) " because it isn't parsed completely: only {} of {} bytes was parsed: {}", value, type_name, ast_param.name, read_buffer.count(), value.size(), value.substr(0, read_buffer.count())); - ast = addTypeConversionToAST(std::make_shared(temp_column[0]), type_name); + Field literal; + /// If data type has custom serialization, we should use CAST from String, + /// because CAST from field may not work correctly (for example for type IPv6). + if (data_type->getCustomSerialization()) + literal = value; + else + literal = temp_column[0]; + ast = addTypeConversionToAST(std::make_shared(literal), type_name); /// Keep the original alias. 
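    /// (The node built above is essentially CAST(<literal>, '<type_name>'), so restoring the alias
    /// below keeps references to the original {name:Type} query parameter intact.)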
ast->setAlias(alias); diff --git a/src/Interpreters/RewriteSumIfFunctionVisitor.cpp b/src/Interpreters/RewriteSumIfFunctionVisitor.cpp index 7f725c1d8a5..50e6bec28f3 100644 --- a/src/Interpreters/RewriteSumIfFunctionVisitor.cpp +++ b/src/Interpreters/RewriteSumIfFunctionVisitor.cpp @@ -25,7 +25,8 @@ void RewriteSumIfFunctionMatcher::visit(const ASTFunction & func, ASTPtr & ast, auto lower_name = Poco::toLower(func.name); - if (lower_name != "sum" && lower_name != "sumif") + /// sumIf, SumIf or sUMIf are valid function names, but sumIF or sumiF are not + if (lower_name != "sum" && (lower_name != "sumif" || !endsWith(func.name, "If"))) return; const auto & func_arguments = func.arguments->children; diff --git a/src/Interpreters/SystemLog.cpp b/src/Interpreters/SystemLog.cpp index 66e28678ce6..ec6fd98010d 100644 --- a/src/Interpreters/SystemLog.cpp +++ b/src/Interpreters/SystemLog.cpp @@ -32,15 +32,12 @@ #include -#define DBMS_SYSTEM_LOG_QUEUE_SIZE 1048576 - namespace DB { namespace ErrorCodes { extern const int BAD_ARGUMENTS; - extern const int TIMEOUT_EXCEEDED; extern const int LOGICAL_ERROR; } @@ -114,13 +111,12 @@ std::shared_ptr createSystemLog( return std::make_shared(context, database, table, engine, flush_interval_milliseconds); } -} - -/// -/// ISystemLog -/// -ASTPtr ISystemLog::getCreateTableQueryClean(const StorageID & table_id, ContextPtr context) +/// returns CREATE TABLE query, but with removed: +/// - UUID +/// - SETTINGS (for MergeTree) +/// That way it can be used to compare with the SystemLog::getCreateTableQuery() +ASTPtr getCreateTableQueryClean(const StorageID & table_id, ContextPtr context) { DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name); ASTPtr old_ast = database->getCreateTableQuery(table_id.table_name, context); @@ -135,37 +131,8 @@ ASTPtr ISystemLog::getCreateTableQueryClean(const StorageID & table_id, ContextP return old_ast; } -void ISystemLog::stopFlushThread() -{ - { - std::lock_guard lock(mutex); - - if (!saving_thread.joinable()) - { - return; - } - - if (is_shutdown) - { - return; - } - - is_shutdown = true; - - /// Tell thread to shutdown. - flush_event.notify_all(); - } - - saving_thread.join(); } -void ISystemLog::startup() -{ - std::lock_guard lock(mutex); - saving_thread = ThreadFromGlobalPool([this] { savingThreadFunction(); }); -} - - /// /// SystemLogs /// @@ -270,77 +237,6 @@ SystemLog::SystemLog( log = &Poco::Logger::get("SystemLog (" + database_name_ + "." + table_name_ + ")"); } - -static thread_local bool recursive_add_call = false; - -template -void SystemLog::add(const LogElement & element) -{ - /// It is possible that the method will be called recursively. - /// Better to drop these events to avoid complications. - if (recursive_add_call) - return; - recursive_add_call = true; - SCOPE_EXIT({ recursive_add_call = false; }); - - /// Memory can be allocated while resizing on queue.push_back. - /// The size of allocation can be in order of a few megabytes. - /// But this should not be accounted for query memory usage. - /// Otherwise the tests like 01017_uniqCombined_memory_usage.sql will be flacky. - MemoryTrackerBlockerInThread temporarily_disable_memory_tracker(VariableContext::Global); - - /// Should not log messages under mutex. - bool queue_is_half_full = false; - - { - std::unique_lock lock(mutex); - - if (is_shutdown) - return; - - if (queue.size() == DBMS_SYSTEM_LOG_QUEUE_SIZE / 2) - { - queue_is_half_full = true; - - // The queue more than half full, time to flush. 
- // We only check for strict equality, because messages are added one - // by one, under exclusive lock, so we will see each message count. - // It is enough to only wake the flushing thread once, after the message - // count increases past half available size. - const uint64_t queue_end = queue_front_index + queue.size(); - if (requested_flush_up_to < queue_end) - requested_flush_up_to = queue_end; - - flush_event.notify_all(); - } - - if (queue.size() >= DBMS_SYSTEM_LOG_QUEUE_SIZE) - { - // Ignore all further entries until the queue is flushed. - // Log a message about that. Don't spam it -- this might be especially - // problematic in case of trace log. Remember what the front index of the - // queue was when we last logged the message. If it changed, it means the - // queue was flushed, and we can log again. - if (queue_front_index != logged_queue_full_at_index) - { - logged_queue_full_at_index = queue_front_index; - - // TextLog sets its logger level to 0, so this log is a noop and - // there is no recursive logging. - lock.unlock(); - LOG_ERROR(log, "Queue is full for system log '{}' at {}", demangle(typeid(*this).name()), queue_front_index); - } - - return; - } - - queue.push_back(element); - } - - if (queue_is_half_full) - LOG_INFO(log, "Queue is half full for system log '{}'.", demangle(typeid(*this).name())); -} - template void SystemLog::shutdown() { @@ -351,48 +247,6 @@ void SystemLog::shutdown() table->flushAndShutdown(); } -template -void SystemLog::flush(bool force) -{ - uint64_t this_thread_requested_offset; - - { - std::unique_lock lock(mutex); - - if (is_shutdown) - return; - - this_thread_requested_offset = queue_front_index + queue.size(); - - // Publish our flush request, taking care not to overwrite the requests - // made by other threads. - is_force_prepare_tables |= force; - requested_flush_up_to = std::max(requested_flush_up_to, - this_thread_requested_offset); - - flush_event.notify_all(); - } - - LOG_DEBUG(log, "Requested flush up to offset {}", - this_thread_requested_offset); - - // Use an arbitrary timeout to avoid endless waiting. 60s proved to be - // too fast for our parallel functional tests, probably because they - // heavily load the disk. 
- const int timeout_seconds = 180; - std::unique_lock lock(mutex); - bool result = flush_event.wait_for(lock, std::chrono::seconds(timeout_seconds), - [&] { return flushed_up_to >= this_thread_requested_offset - && !is_force_prepare_tables; }); - - if (!result) - { - throw Exception("Timeout exceeded (" + toString(timeout_seconds) + " s) while flushing system log '" + demangle(typeid(*this).name()) + "'.", - ErrorCodes::TIMEOUT_EXCEEDED); - } -} - - template void SystemLog::savingThreadFunction() { @@ -625,17 +479,7 @@ ASTPtr SystemLog::getCreateTableQuery() return create; } -template class SystemLog; -template class SystemLog; -template class SystemLog; -template class SystemLog; -template class SystemLog; -template class SystemLog; -template class SystemLog; -template class SystemLog; -template class SystemLog; -template class SystemLog; -template class SystemLog; -template class SystemLog; +#define INSTANTIATE_SYSTEM_LOG(ELEMENT) template class SystemLog; +SYSTEM_LOG_ELEMENTS(INSTANTIATE_SYSTEM_LOG) } diff --git a/src/Interpreters/SystemLog.h b/src/Interpreters/SystemLog.h index 3209dd2e13e..6dc2e7a5582 100644 --- a/src/Interpreters/SystemLog.h +++ b/src/Interpreters/SystemLog.h @@ -1,34 +1,12 @@ #pragma once -#include -#include -#include -#include -#include -#include +#include -#include -#include -#include -#include #include -#include -#include - - -namespace Poco -{ -class Logger; -namespace Util -{ -class AbstractConfiguration; -} -} namespace DB { - /** Allow to store structured log in system table. * * Logging is asynchronous. Data is put into queue from where it will be read by separate thread. @@ -66,44 +44,6 @@ class QueryViewsLog; class ZooKeeperLog; class SessionLog; - -class ISystemLog -{ -public: - virtual String getName() = 0; - //// force -- force table creation (used for SYSTEM FLUSH LOGS) - virtual void flush(bool force = false) = 0; - virtual void prepareTable() = 0; - - /// Start the background thread. - virtual void startup(); - - /// Stop the background flush thread before destructor. No more data will be written. - virtual void shutdown() = 0; - - virtual ~ISystemLog() = default; - - virtual void savingThreadFunction() = 0; - - /// returns CREATE TABLE query, but with removed: - /// - UUID - /// - SETTINGS (for MergeTree) - /// That way it can be used to compare with the SystemLog::getCreateTableQuery() - static ASTPtr getCreateTableQueryClean(const StorageID & table_id, ContextPtr context); - -protected: - ThreadFromGlobalPool saving_thread; - - /// Data shared between callers of add()/flush()/shutdown(), and the saving thread - std::mutex mutex; - - bool is_shutdown = false; - std::condition_variable flush_event; - - void stopFlushThread(); -}; - - /// System logs should be destroyed in destructor of the last Context and before tables, /// because SystemLog destruction makes insert query while flushing data into underlying tables struct SystemLogs @@ -136,10 +76,11 @@ struct SystemLogs template -class SystemLog : public ISystemLog, private boost::noncopyable, WithContext +class SystemLog : public SystemLogBase, private boost::noncopyable, WithContext { public: using Self = SystemLog; + using Base = SystemLogBase; /** Parameter: table name where to write log. * If table is not exists, then it get created with specified engine. @@ -156,27 +97,23 @@ public: const String & storage_def_, size_t flush_interval_milliseconds_); - /** Append a record into log. - * Writing to table will be done asynchronously and in case of failure, record could be lost. 
- */ - void add(const LogElement & element); - void shutdown() override; - /// Flush data in the buffer to disk - void flush(bool force) override; - - String getName() override - { - return LogElement::name(); - } - - ASTPtr getCreateTableQuery(); - protected: - Poco::Logger * log; + using ISystemLog::mutex; + using ISystemLog::is_shutdown; + using ISystemLog::flush_event; + using ISystemLog::stopFlushThread; + using Base::log; + using Base::queue; + using Base::queue_front_index; + using Base::is_force_prepare_tables; + using Base::requested_flush_up_to; + using Base::flushed_up_to; + using Base::logged_queue_full_at_index; private: + /* Saving thread data */ const StorageID table_id; const String storage_def; @@ -185,32 +122,17 @@ private: bool is_prepared = false; const size_t flush_interval_milliseconds; - // Queue is bounded. But its size is quite large to not block in all normal cases. - std::vector queue; - // An always-incrementing index of the first message currently in the queue. - // We use it to give a global sequential index to every message, so that we - // can wait until a particular message is flushed. This is used to implement - // synchronous log flushing for SYSTEM FLUSH LOGS. - uint64_t queue_front_index = 0; - // A flag that says we must create the tables even if the queue is empty. - bool is_force_prepare_tables = false; - // Requested to flush logs up to this index, exclusive - uint64_t requested_flush_up_to = 0; - // Flushed log up to this index, exclusive - uint64_t flushed_up_to = 0; - // Logged overflow message at this queue front index - uint64_t logged_queue_full_at_index = -1; - - void savingThreadFunction() override; - /** Creates new table if it does not exist. * Renames old table if its structure is not suitable. * This cannot be done in constructor to avoid deadlock while renaming a table under locked Context when SystemLog object is created. */ void prepareTable() override; + void savingThreadFunction() override; + /// flushImpl can be executed only in saving_thread. void flushImpl(const std::vector & to_flush, uint64_t to_flush_end); + ASTPtr getCreateTableQuery(); }; } diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index 9619acf0a11..cde92103e34 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -792,6 +792,39 @@ void markTupleLiteralsAsLegacy(ASTPtr & query) MarkTupleLiteralsAsLegacyVisitor(data).visit(query); } +/// Rewrite _shard_num -> shardNum() AS _shard_num +struct RewriteShardNum +{ + struct Data + { + }; + + static bool needChildVisit(const ASTPtr & parent, const ASTPtr & /*child*/) + { + /// ON section should not be rewritten. + return typeid_cast(parent.get()) == nullptr; + } + + static void visit(ASTPtr & ast, Data &) + { + if (auto * identifier = typeid_cast(ast.get())) + visit(*identifier, ast); + } + + static void visit(ASTIdentifier & identifier, ASTPtr & ast) + { + if (identifier.shortName() != "_shard_num") + return; + + String alias = identifier.tryGetAlias(); + if (alias.empty()) + alias = "_shard_num"; + ast = makeASTFunction("shardNum"); + ast->setAlias(alias); + } +}; +using RewriteShardNumVisitor = InDepthNodeVisitor; + } TreeRewriterResult::TreeRewriterResult( @@ -961,6 +994,7 @@ void TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select ++it; } + has_virtual_shard_num = false; /// If there are virtual columns among the unknown columns. 
Remove them from the list of unknown and add /// in columns list, so that when further processing they are also considered. if (storage) @@ -977,6 +1011,18 @@ void TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select else ++it; } + + if (is_remote_storage) + { + for (const auto & name_type : storage_virtuals) + { + if (name_type.name == "_shard_num" && storage->isVirtualColumn("_shard_num", storage_snapshot->getMetadataForQuery())) + { + has_virtual_shard_num = true; + break; + } + } + } } if (!unknown_required_source_columns.empty()) @@ -1164,6 +1210,13 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect( } } + /// Rewrite _shard_num to shardNum() + if (result.has_virtual_shard_num) + { + RewriteShardNumVisitor::Data data_rewrite_shard_num; + RewriteShardNumVisitor(data_rewrite_shard_num).visit(query); + } + result.ast_join = select_query->join(); if (result.optimize_trivial_count) diff --git a/src/Interpreters/TreeRewriter.h b/src/Interpreters/TreeRewriter.h index 7e3e9be68a2..92068207a81 100644 --- a/src/Interpreters/TreeRewriter.h +++ b/src/Interpreters/TreeRewriter.h @@ -72,6 +72,9 @@ struct TreeRewriterResult /// Cache isRemote() call for storage, because it may be too heavy. bool is_remote_storage = false; + /// Rewrite _shard_num to shardNum() + bool has_virtual_shard_num = false; + /// Results of scalar sub queries Scalars scalars; diff --git a/src/Interpreters/examples/CMakeLists.txt b/src/Interpreters/examples/CMakeLists.txt index 58b9ae3ee6f..0c0bcb88f7a 100644 --- a/src/Interpreters/examples/CMakeLists.txt +++ b/src/Interpreters/examples/CMakeLists.txt @@ -5,7 +5,7 @@ add_executable (hash_map_lookup hash_map_lookup.cpp) target_link_libraries (hash_map_lookup PRIVATE dbms) add_executable (hash_map3 hash_map3.cpp) -target_link_libraries (hash_map3 PRIVATE dbms ch_contrib::farmhash metrohash) +target_link_libraries (hash_map3 PRIVATE dbms ch_contrib::farmhash ch_contrib::metrohash) add_executable (hash_map_string hash_map_string.cpp) target_link_libraries (hash_map_string PRIVATE dbms ch_contrib::sparsehash) @@ -14,7 +14,7 @@ add_executable (hash_map_string_2 hash_map_string_2.cpp) target_link_libraries (hash_map_string_2 PRIVATE dbms) add_executable (hash_map_string_3 hash_map_string_3.cpp) -target_link_libraries (hash_map_string_3 PRIVATE dbms ch_contrib::farmhash metrohash) +target_link_libraries (hash_map_string_3 PRIVATE dbms ch_contrib::farmhash ch_contrib::metrohash) add_executable (hash_map_string_small hash_map_string_small.cpp) target_link_libraries (hash_map_string_small PRIVATE dbms ch_contrib::sparsehash) diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index 870e01d3b5c..f40d35e970b 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -61,6 +61,7 @@ #include #include +#include #include @@ -659,7 +660,7 @@ static std::tuple executeQueryImpl( if (context->query_trace_context.trace_id != UUID()) { auto * raw_interpreter_ptr = interpreter.get(); - std::string class_name(abi::__cxa_demangle(typeid(*raw_interpreter_ptr).name(), nullptr, nullptr, nullptr)); + std::string class_name(demangle(typeid(*raw_interpreter_ptr).name())); span = std::make_unique(class_name + "::execute()"); } res = interpreter->execute(); diff --git a/src/Interpreters/sortBlock.cpp b/src/Interpreters/sortBlock.cpp index edf911fa61c..c8a2d0903f2 100644 --- a/src/Interpreters/sortBlock.cpp +++ b/src/Interpreters/sortBlock.cpp @@ -1,13 +1,10 @@ #include -#include #include #include -#include -#include 
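The executeQuery.cpp change above swaps a raw abi::__cxa_demangle call for the demangle() helper. The raw call returns a malloc'ed buffer (or nullptr on failure), and the old one-liner passed it straight into a std::string without ever freeing it. A self-contained sketch of a safe wrapper, an assumption about the shape of such a helper rather than ClickHouse's actual implementation:

#include <cxxabi.h>
#include <cstdlib>
#include <iostream>
#include <string>
#include <typeinfo>

/// Safe wrapper: copy the demangled name, free the malloc'ed buffer,
/// and fall back to the mangled name when demangling fails.
std::string demangleSafe(const char * mangled)
{
    int status = 0;
    char * buf = abi::__cxa_demangle(mangled, nullptr, nullptr, &status);
    if (!buf || status != 0)
    {
        std::free(buf);
        return mangled;
    }
    std::string result(buf);
    std::free(buf);
    return result;
}

struct SomeInterpreter {};

int main()
{
    const SomeInterpreter interpreter;
    std::cout << demangleSafe(typeid(interpreter).name()) << "::execute()\n";
}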
+#include #include -#include namespace DB { @@ -17,66 +14,34 @@ namespace ErrorCodes extern const int BAD_COLLATION; } -static bool isCollationRequired(const SortColumnDescription & description) +/// Column with description for sort +struct ColumnWithSortDescription +{ + const IColumn * column = nullptr; + SortColumnDescription description; + + /// It means, that this column is ColumnConst + bool column_const = false; +}; + +using ColumnsWithSortDescriptions = std::vector; + +namespace +{ + +inline bool isCollationRequired(const SortColumnDescription & description) { return description.collator != nullptr; } - -ColumnsWithSortDescriptions getColumnsWithSortDescription(const Block & block, const SortDescription & description) -{ - size_t size = description.size(); - ColumnsWithSortDescriptions res; - res.reserve(size); - - for (size_t i = 0; i < size; ++i) - { - const IColumn * column = !description[i].column_name.empty() - ? block.getByName(description[i].column_name).column.get() - : block.safeGetByPosition(description[i].column_number).column.get(); - - res.emplace_back(ColumnWithSortDescription{column, description[i], isColumnConst(*column)}); - } - - return res; -} - - -struct PartialSortingLess +template +struct PartialSortingLessImpl { const ColumnsWithSortDescriptions & columns; - explicit PartialSortingLess(const ColumnsWithSortDescriptions & columns_) : columns(columns_) {} + explicit PartialSortingLessImpl(const ColumnsWithSortDescriptions & columns_) : columns(columns_) { } - bool operator() (size_t a, size_t b) const - { - for (const auto & elem : columns) - { - int res; - if (elem.column_const) - res = 0; - else - res = elem.description.direction * elem.column->compareAt(a, b, *elem.column, elem.description.nulls_direction); - if (res < 0) - return true; - else if (res > 0) - return false; - } - return false; - } -}; - - -struct PartialSortingLessWithCollation -{ - const ColumnsWithSortDescriptions & columns; - - explicit PartialSortingLessWithCollation(const ColumnsWithSortDescriptions & columns_) - : columns(columns_) - { - } - - bool operator() (size_t a, size_t b) const + inline bool operator()(size_t a, size_t b) const { for (const auto & elem : columns) { @@ -85,13 +50,25 @@ struct PartialSortingLessWithCollation if (elem.column_const) { res = 0; + continue; } - else if (isCollationRequired(elem.description)) + + if constexpr (check_collation) { - res = elem.column->compareAtWithCollation(a, b, *elem.column, elem.description.nulls_direction, *elem.description.collator); + if (isCollationRequired(elem.description)) + { + res = elem.column->compareAtWithCollation(a, b, *elem.column, elem.description.nulls_direction, *elem.description.collator); + } + else + { + res = elem.column->compareAt(a, b, *elem.column, elem.description.nulls_direction); + } } else + { res = elem.column->compareAt(a, b, *elem.column, elem.description.nulls_direction); + } + res *= elem.description.direction; if (res < 0) return true; @@ -102,124 +79,148 @@ struct PartialSortingLessWithCollation } }; +using PartialSortingLess = PartialSortingLessImpl; +using PartialSortingLessWithCollation = PartialSortingLessImpl; + +} + +void convertTupleColumnIntoSortDescriptions( + const ColumnTuple * tuple, const SortColumnDescription & description, ColumnsWithSortDescriptions & result) +{ + for (const auto & column : tuple->getColumns()) + { + if (const auto * subtuple = typeid_cast(column.get())) + { + convertTupleColumnIntoSortDescriptions(subtuple, description, result); + } + else + { + 
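The two near-identical comparators PartialSortingLess and PartialSortingLessWithCollation are merged into one PartialSortingLessImpl template whose collation branch is selected at compile time with if constexpr. A standalone sketch of the same idea over plain std::vector columns (SortColumn, compareCaseInsensitive and the case_insensitive flag are stand-ins for ClickHouse's IColumn/Collator machinery):

#include <algorithm>
#include <cctype>
#include <iostream>
#include <string>
#include <vector>

struct SortColumn
{
    std::vector<std::string> values;
    int direction = 1;              /// 1 ascending, -1 descending
    bool case_insensitive = false;  /// stand-in for "collation required"
};

static int compareCaseInsensitive(const std::string & a, const std::string & b)
{
    std::string la = a, lb = b;
    for (auto & c : la) c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
    for (auto & c : lb) c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
    return la.compare(lb);
}

/// One comparator template instead of two near-duplicate classes:
/// the collation branch is compiled in only when check_collation is true.
template <bool check_collation>
struct LessImpl
{
    const std::vector<SortColumn> & columns;

    bool operator()(size_t a, size_t b) const
    {
        for (const auto & col : columns)
        {
            int res;
            if constexpr (check_collation)
                res = col.case_insensitive ? compareCaseInsensitive(col.values[a], col.values[b])
                                           : col.values[a].compare(col.values[b]);
            else
                res = col.values[a].compare(col.values[b]);

            res *= col.direction;
            if (res < 0)
                return true;
            if (res > 0)
                return false;
        }
        return false;
    }
};

int main()
{
    std::vector<SortColumn> columns{{{"b", "A", "a"}, 1, true}};
    std::vector<size_t> permutation{0, 1, 2};

    std::stable_sort(permutation.begin(), permutation.end(), LessImpl<true>{columns});

    for (size_t i : permutation)
        std::cout << columns[0].values[i] << ' ';   /// A a b ("A" and "a" are equal under the "collation")
    std::cout << '\n';
}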
result.emplace_back(ColumnWithSortDescription{column.get(), description, isColumnConst(*column)}); + + if (isCollationRequired(description) && !result.back().column->isCollationSupported()) + result.back().description.collator = nullptr; + } + } +} + +ColumnsWithSortDescriptions getColumnsWithSortDescription(const Block & block, const SortDescription & description) +{ + size_t size = description.size(); + + ColumnsWithSortDescriptions result; + result.reserve(size); + + for (size_t i = 0; i < size; ++i) + { + const auto & sort_column_description = description[i]; + + const IColumn * column = !sort_column_description.column_name.empty() + ? block.getByName(sort_column_description.column_name).column.get() + : block.safeGetByPosition(sort_column_description.column_number).column.get(); + + if (isCollationRequired(sort_column_description)) + { + if (!column->isCollationSupported()) + throw Exception( + "Collations could be specified only for String, LowCardinality(String), Nullable(String) or for Array or Tuple, " + "containing them.", + ErrorCodes::BAD_COLLATION); + } + + if (const auto * tuple = typeid_cast(column)) + convertTupleColumnIntoSortDescriptions(tuple, sort_column_description, result); + else + result.emplace_back(ColumnWithSortDescription{column, sort_column_description, isColumnConst(*column)}); + } + + return result; +} + void sortBlock(Block & block, const SortDescription & description, UInt64 limit) { if (!block) return; - /// If only one column to sort by - if (description.size() == 1) + ColumnsWithSortDescriptions columns_with_sort_descriptions = getColumnsWithSortDescription(block, description); + + bool all_const = true; + for (const auto & column : columns_with_sort_descriptions) { - IColumn::Permutation perm; - bool reverse = description[0].direction == -1; - - const IColumn * column = !description[0].column_name.empty() - ? 
block.getByName(description[0].column_name).column.get() - : block.safeGetByPosition(description[0].column_number).column.get(); - - bool is_column_const = false; - if (isCollationRequired(description[0])) + if (!column.column_const) { - if (!column->isCollationSupported()) - throw Exception("Collations could be specified only for String, LowCardinality(String), Nullable(String) or for Array or Tuple, containing them.", ErrorCodes::BAD_COLLATION); + all_const = false; + break; + } + } + if (all_const) + return; - if (isColumnConst(*column)) - is_column_const = true; - else - column->getPermutationWithCollation(*description[0].collator, reverse, limit, description[0].nulls_direction, perm); - } - else if (!isColumnConst(*column)) - { - int nan_direction_hint = description[0].nulls_direction; - column->getPermutation(reverse, limit, nan_direction_hint, perm); - } + IColumn::Permutation permutation; + + /// If only one column to sort by + if (columns_with_sort_descriptions.size() == 1) + { + auto & column_with_sort_description = columns_with_sort_descriptions[0]; + + bool reverse = column_with_sort_description.description.direction == -1; + int nan_direction_hint = column_with_sort_description.description.nulls_direction; + const auto & column = column_with_sort_description.column; + + if (isCollationRequired(column_with_sort_description.description)) + column->getPermutationWithCollation( + *column_with_sort_description.description.collator, reverse, limit, nan_direction_hint, permutation); else - /// we don't need to do anything with const column - is_column_const = true; - - size_t columns = block.columns(); - for (size_t i = 0; i < columns; ++i) - { - if (!is_column_const) - block.getByPosition(i).column = block.getByPosition(i).column->permute(perm, limit); - } + column->getPermutation(reverse, limit, nan_direction_hint, permutation); } else { size_t size = block.rows(); - IColumn::Permutation perm(size); + permutation.resize(size); for (size_t i = 0; i < size; ++i) - perm[i] = i; + permutation[i] = i; if (limit >= size) limit = 0; - bool need_collation = false; - ColumnsWithSortDescriptions columns_with_sort_desc = getColumnsWithSortDescription(block, description); + EqualRanges ranges; + ranges.emplace_back(0, permutation.size()); - for (size_t i = 0, num_sort_columns = description.size(); i < num_sort_columns; ++i) + for (const auto & column_with_sort_description : columns_with_sort_descriptions) { - const IColumn * column = columns_with_sort_desc[i].column; - if (isCollationRequired(description[i])) - { - if (!column->isCollationSupported()) - throw Exception("Collations could be specified only for String, LowCardinality(String), Nullable(String) or for Array or Tuple, containing them.", ErrorCodes::BAD_COLLATION); + while (!ranges.empty() && limit && limit <= ranges.back().first) + ranges.pop_back(); - need_collation = true; + if (ranges.empty()) + break; + + if (column_with_sort_description.column_const) + continue; + + bool is_collation_required = isCollationRequired(column_with_sort_description.description); + bool reverse = column_with_sort_description.description.direction < 0; + int nan_direction_hint = column_with_sort_description.description.nulls_direction; + const auto & column = column_with_sort_description.column; + + if (is_collation_required) + { + column->updatePermutationWithCollation( + *column_with_sort_description.description.collator, reverse, limit, nan_direction_hint, permutation, ranges); + } + else + { + column->updatePermutation(reverse, limit, 
nan_direction_hint, permutation, ranges); } } + } - if (need_collation) - { - EqualRanges ranges; - ranges.emplace_back(0, perm.size()); - for (const auto & column : columns_with_sort_desc) - { - while (!ranges.empty() && limit && limit <= ranges.back().first) - ranges.pop_back(); - - if (ranges.empty()) - break; - - if (column.column_const) - continue; - - if (isCollationRequired(column.description)) - { - column.column->updatePermutationWithCollation( - *column.description.collator, column.description.direction < 0, limit, column.description.nulls_direction, perm, ranges); - } - else - { - column.column->updatePermutation( - column.description.direction < 0, limit, column.description.nulls_direction, perm, ranges); - } - } - } - else - { - EqualRanges ranges; - ranges.emplace_back(0, perm.size()); - for (const auto & column : columns_with_sort_desc) - { - while (!ranges.empty() && limit && limit <= ranges.back().first) - ranges.pop_back(); - - if (ranges.empty()) - break; - - column.column->updatePermutation( - column.description.direction < 0, limit, column.description.nulls_direction, perm, ranges); - } - } - - size_t columns = block.columns(); - for (size_t i = 0; i < columns; ++i) - block.getByPosition(i).column = block.getByPosition(i).column->permute(perm, limit); + size_t columns = block.columns(); + for (size_t i = 0; i < columns; ++i) + { + auto & column_to_sort = block.getByPosition(i).column; + column_to_sort = column_to_sort->permute(permutation, limit); } } - void stableGetPermutation(const Block & block, const SortDescription & description, IColumn::Permutation & out_permutation) { if (!block) @@ -235,7 +236,6 @@ void stableGetPermutation(const Block & block, const SortDescription & descripti std::stable_sort(out_permutation.begin(), out_permutation.end(), PartialSortingLess(columns_with_sort_desc)); } - bool isAlreadySorted(const Block & block, const SortDescription & description) { if (!block) @@ -276,12 +276,15 @@ void stableSortBlock(Block & block, const SortDescription & description) if (!block) return; - IColumn::Permutation perm; - stableGetPermutation(block, description, perm); + IColumn::Permutation permutation; + stableGetPermutation(block, description, permutation); size_t columns = block.columns(); for (size_t i = 0; i < columns; ++i) - block.safeGetByPosition(i).column = block.safeGetByPosition(i).column->permute(perm, 0); + { + auto & column_to_sort = block.safeGetByPosition(i).column; + column_to_sort = column_to_sort->permute(permutation, 0); + } } } diff --git a/src/Interpreters/sortBlock.h b/src/Interpreters/sortBlock.h index faf9384901b..31ae78e90b0 100644 --- a/src/Interpreters/sortBlock.h +++ b/src/Interpreters/sortBlock.h @@ -10,7 +10,6 @@ namespace DB /// Sort one block by `description`. If limit != 0, then the partial sort of the first `limit` rows is produced. void sortBlock(Block & block, const SortDescription & description, UInt64 limit = 0); - /** Used only in StorageMergeTree to sort the data with INSERT. * Sorting is stable. This is important for keeping the order of rows in the CollapsingMergeTree engine * - because based on the order of rows it is determined whether to delete or leave groups of rows when collapsing. @@ -23,24 +22,9 @@ void stableSortBlock(Block & block, const SortDescription & description); */ void stableGetPermutation(const Block & block, const SortDescription & description, IColumn::Permutation & out_permutation); - /** Quickly check whether the block is already sorted. 
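The rewritten multi-column path in sortBlock() builds one permutation and refines it column by column: each later column only re-sorts the ranges that the earlier keys left equal, tracked in EqualRanges. A self-contained sketch of that refinement loop (limit handling, const columns and collation are omitted, and updatePermutation here is a toy, not IColumn::updatePermutation):

#include <algorithm>
#include <iostream>
#include <numeric>
#include <string>
#include <utility>
#include <vector>

using EqualRange = std::pair<size_t, size_t>;   /// [begin, end)

/// Refine the permutation inside each equal range using the next key,
/// then recompute the ranges of rows that are still indistinguishable.
template <typename Key>
void updatePermutation(const std::vector<Key> & key, std::vector<size_t> & perm,
                       std::vector<EqualRange> & ranges)
{
    std::vector<EqualRange> new_ranges;
    for (auto [begin, end] : ranges)
    {
        std::stable_sort(perm.begin() + begin, perm.begin() + end,
                         [&](size_t a, size_t b) { return key[a] < key[b]; });

        size_t run_start = begin;
        for (size_t i = begin + 1; i <= end; ++i)
        {
            if (i == end || key[perm[i]] != key[perm[run_start]])
            {
                if (i - run_start > 1)
                    new_ranges.emplace_back(run_start, i);
                run_start = i;
            }
        }
    }
    ranges = std::move(new_ranges);
}

int main()
{
    std::vector<int> c1         = {1, 1, 0, 1};
    std::vector<std::string> c2 = {"b", "a", "z", "a"};

    std::vector<size_t> perm(c1.size());
    std::iota(perm.begin(), perm.end(), 0);
    std::vector<EqualRange> ranges{{0, perm.size()}};

    updatePermutation(c1, perm, ranges);   /// sort by the first key, remember ties
    updatePermutation(c2, perm, ranges);   /// break ties with the second key only

    for (size_t i : perm)
        std::cout << c1[i] << c2[i] << ' ';   /// 0z 1a 1a 1b
    std::cout << '\n';
}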
If the block is not sorted - returns false as fast as possible. * Collations are not supported. */ bool isAlreadySorted(const Block & block, const SortDescription & description); -/// Column with description for sort -struct ColumnWithSortDescription -{ - const IColumn * column = nullptr; - SortColumnDescription description; - - /// It means, that this column is ColumnConst - bool column_const = false; -}; - -using ColumnsWithSortDescriptions = std::vector; - -ColumnsWithSortDescriptions getColumnsWithSortDescription(const Block & block, const SortDescription & description); - } diff --git a/src/Parsers/ASTFunction.cpp b/src/Parsers/ASTFunction.cpp index 36731a3acd0..f9a5c7be75f 100644 --- a/src/Parsers/ASTFunction.cpp +++ b/src/Parsers/ASTFunction.cpp @@ -32,6 +32,15 @@ void ASTFunction::appendColumnNameImpl(WriteBuffer & ostr) const if (name == "view") throw Exception("Table function view cannot be used as an expression", ErrorCodes::UNEXPECTED_EXPRESSION); + /// If function can be converted to literal it will be parsed as literal after formatting. + /// In distributed query it may lead to mismathed column names. + /// To avoid it we check whether we can convert function to literal. + if (auto literal = toLiteral()) + { + literal->appendColumnName(ostr); + return; + } + writeString(name, ostr); if (parameters) @@ -111,31 +120,42 @@ void ASTFunction::updateTreeHashImpl(SipHash & hash_state) const IAST::updateTreeHashImpl(hash_state); } +template +static ASTPtr createLiteral(const ASTs & arguments) +{ + Container container; + + for (const auto & arg : arguments) + { + if (const auto * literal = arg->as()) + { + container.push_back(literal->value); + } + else if (auto * func = arg->as()) + { + if (auto func_literal = func->toLiteral()) + container.push_back(func_literal->as()->value); + else + return {}; + } + else + /// Some of the Array or Tuple arguments is not literal + return {}; + } + + return std::make_shared(container); +} ASTPtr ASTFunction::toLiteral() const { - if (!arguments) return {}; + if (!arguments) + return {}; if (name == "array") - { - Array array; + return createLiteral(arguments->children); - for (const auto & arg : arguments->children) - { - if (auto * literal = arg->as()) - array.push_back(literal->value); - else if (auto * func = arg->as()) - { - if (auto func_literal = func->toLiteral()) - array.push_back(func_literal->as()->value); - } - else - /// Some of the Array arguments is not literal - return {}; - } - - return std::make_shared(array); - } + if (name == "tuple") + return createLiteral(arguments->children); return {}; } diff --git a/src/Parsers/ParserDictionary.cpp b/src/Parsers/ParserDictionary.cpp index 399dda08911..ef914e2264a 100644 --- a/src/Parsers/ParserDictionary.cpp +++ b/src/Parsers/ParserDictionary.cpp @@ -188,8 +188,19 @@ bool ParserDictionary::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) ASTPtr ast_settings; /// Primary is required to be the first in dictionary definition - if (primary_key_keyword.ignore(pos) && !expression_list_p.parse(pos, primary_key, expected)) - return false; + if (primary_key_keyword.ignore(pos)) + { + bool was_open = false; + + if (open.ignore(pos, expected)) + was_open = true; + + if (!expression_list_p.parse(pos, primary_key, expected)) + return false; + + if (was_open && !close.ignore(pos, expected)) + return false; + } /// Loop is used to avoid strict order of dictionary properties while (true) diff --git a/src/Processors/Executors/CompletedPipelineExecutor.cpp 
b/src/Processors/Executors/CompletedPipelineExecutor.cpp index 45b02cba298..8ec1916f4ce 100644 --- a/src/Processors/Executors/CompletedPipelineExecutor.cpp +++ b/src/Processors/Executors/CompletedPipelineExecutor.cpp @@ -4,7 +4,6 @@ #include #include #include -#include #include namespace DB @@ -40,11 +39,6 @@ static void threadFunction(CompletedPipelineExecutor::Data & data, ThreadGroupSt if (thread_group) CurrentThread::attachTo(thread_group); - SCOPE_EXIT_SAFE( - if (thread_group) - CurrentThread::detachQueryIfNotDetached(); - ); - data.executor->execute(num_threads); } catch (...) diff --git a/src/Processors/Executors/PipelineExecutor.cpp b/src/Processors/Executors/PipelineExecutor.cpp index e722f8718f7..80aacf14fe6 100644 --- a/src/Processors/Executors/PipelineExecutor.cpp +++ b/src/Processors/Executors/PipelineExecutor.cpp @@ -301,11 +301,6 @@ void PipelineExecutor::executeImpl(size_t num_threads) if (thread_group) CurrentThread::attachTo(thread_group); - SCOPE_EXIT_SAFE( - if (thread_group) - CurrentThread::detachQueryIfNotDetached(); - ); - try { executeSingleThread(thread_num); diff --git a/src/Processors/Executors/PullingAsyncPipelineExecutor.cpp b/src/Processors/Executors/PullingAsyncPipelineExecutor.cpp index 0ba07df95a6..198d5ce5d8d 100644 --- a/src/Processors/Executors/PullingAsyncPipelineExecutor.cpp +++ b/src/Processors/Executors/PullingAsyncPipelineExecutor.cpp @@ -4,9 +4,7 @@ #include #include #include - #include -#include namespace DB { @@ -77,11 +75,6 @@ static void threadFunction(PullingAsyncPipelineExecutor::Data & data, ThreadGrou if (thread_group) CurrentThread::attachTo(thread_group); - SCOPE_EXIT_SAFE( - if (thread_group) - CurrentThread::detachQueryIfNotDetached(); - ); - data.executor->execute(num_threads); } catch (...) diff --git a/src/Processors/Executors/PushingAsyncPipelineExecutor.cpp b/src/Processors/Executors/PushingAsyncPipelineExecutor.cpp index 68898bdc2c2..6c2e62b77dc 100644 --- a/src/Processors/Executors/PushingAsyncPipelineExecutor.cpp +++ b/src/Processors/Executors/PushingAsyncPipelineExecutor.cpp @@ -2,11 +2,8 @@ #include #include #include -#include - #include #include -#include #include namespace DB @@ -107,11 +104,6 @@ static void threadFunction(PushingAsyncPipelineExecutor::Data & data, ThreadGrou if (thread_group) CurrentThread::attachTo(thread_group); - SCOPE_EXIT_SAFE( - if (thread_group) - CurrentThread::detachQueryIfNotDetached(); - ); - data.executor->execute(num_threads); } catch (...) 
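The ASTFunction::toLiteral()/createLiteral() change above folds array() and tuple() calls whose arguments are all literals into a single literal before the column name is generated, so the name computed on the initiator matches the name computed on a remote shard after the query is formatted and reparsed. A toy string-based sketch of that folding (the real code produces Field-based ASTLiteral nodes, not strings):

#include <iostream>
#include <memory>
#include <string>
#include <vector>

/// Invented mini-AST: a node is either a literal or an array()/tuple() call.
struct ToyAST
{
    std::string function;   /// "array", "tuple", or "" for a literal
    std::string literal;    /// set when function is empty
    std::vector<std::shared_ptr<ToyAST>> args;
};

/// Returns the folded literal text, or "" when some argument is not a literal
/// (mirrors createLiteral() returning an empty ASTPtr).
std::string toLiteral(const ToyAST & node)
{
    if (node.function.empty())
        return node.literal;

    if (node.function != "array" && node.function != "tuple")
        return "";

    std::string open = node.function == "array" ? "[" : "(";
    std::string close = node.function == "array" ? "]" : ")";

    std::string result = open;
    for (size_t i = 0; i < node.args.size(); ++i)
    {
        std::string arg = toLiteral(*node.args[i]);
        if (arg.empty())
            return "";   /// some argument is not a literal, keep the function form
        result += (i ? ", " : "") + arg;
    }
    return result + close;
}

int main()
{
    auto lit = [](std::string v) { return std::make_shared<ToyAST>(ToyAST{"", std::move(v), {}}); };
    ToyAST t{"tuple", "", {lit("1"), std::make_shared<ToyAST>(ToyAST{"array", "", {lit("2"), lit("3")}})}};
    std::cout << toLiteral(t) << '\n';   /// (1, [2, 3])
}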
diff --git a/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp index 692f17f843a..60408f13ff0 100644 --- a/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp @@ -93,6 +93,7 @@ void registerOutputFormatArrow(FormatFactory & factory) { return std::make_shared(buf, sample, false, format_settings); }); + factory.markFormatHasNoAppendSupport("Arrow"); factory.registerOutputFormat( "ArrowStream", @@ -103,6 +104,7 @@ void registerOutputFormatArrow(FormatFactory & factory) { return std::make_shared(buf, sample, true, format_settings); }); + factory.markFormatHasNoAppendSupport("ArrowStream"); } } diff --git a/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp b/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp index fb3389475ac..70373480920 100644 --- a/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp @@ -479,6 +479,7 @@ void registerOutputFormatAvro(FormatFactory & factory) { return std::make_shared(buf, sample, params, settings); }); + factory.markFormatHasNoAppendSupport("Avro"); } } diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowOutputFormat.cpp b/src/Processors/Formats/Impl/CustomSeparatedRowOutputFormat.cpp index 21cb549d4cb..4c8cf19b923 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/CustomSeparatedRowOutputFormat.cpp @@ -91,6 +91,11 @@ void registerOutputFormatCustomSeparated(FormatFactory & factory) }); factory.markOutputFormatSupportsParallelFormatting(format_name); + + factory.registerAppendSupportChecker(format_name, [](const FormatSettings & settings) + { + return settings.custom.result_after_delimiter.empty(); + }); }; registerWithNamesAndTypes("CustomSeparated", register_func); diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp index 263702ad20f..dcab55743cb 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp @@ -188,6 +188,16 @@ JSONCompactEachRowRowSchemaReader::JSONCompactEachRowRowSchemaReader(ReadBuffer DataTypes JSONCompactEachRowRowSchemaReader::readRowAndGetDataTypes() { + if (first_row) + first_row = false; + else + { + skipWhitespaceIfAny(in); + /// ',' and ';' are possible between the rows. 
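The markFormatHasNoAppendSupport()/registerAppendSupportChecker() calls spread through the output formats feed a small registry in FormatFactory: a format either supplies a predicate that decides from the settings whether appending to an existing file is safe, or is flagged as never appendable (formats with global headers/footers such as JSON, XML, Parquet, ORC, Arrow and Avro). ToyFormatFactory below is only a guess at the shape of that registry; the two registration names come from the diff, everything else is invented:

#include <functional>
#include <iostream>
#include <map>
#include <string>

struct FormatSettings { std::string custom_result_after_delimiter; };

class ToyFormatFactory
{
public:
    using AppendSupportChecker = std::function<bool(const FormatSettings &)>;

    void registerAppendSupportChecker(const std::string & name, AppendSupportChecker checker)
    {
        checkers[name] = std::move(checker);
    }

    /// Formats that write a header/footer around the data can never be appended to.
    void markFormatHasNoAppendSupport(const std::string & name)
    {
        registerAppendSupportChecker(name, [](const FormatSettings &) { return false; });
    }

    bool supportsAppend(const std::string & name, const FormatSettings & settings) const
    {
        auto it = checkers.find(name);
        return it == checkers.end() || it->second(settings);   /// no checker => appendable
    }

private:
    std::map<std::string, AppendSupportChecker> checkers;
};

int main()
{
    ToyFormatFactory factory;
    factory.markFormatHasNoAppendSupport("JSON");
    factory.registerAppendSupportChecker("CustomSeparated", [](const FormatSettings & s)
    {
        return s.custom_result_after_delimiter.empty();   /// mirrors the rule in the diff
    });

    std::cout << factory.supportsAppend("JSON", {}) << '\n';              /// 0
    std::cout << factory.supportsAppend("CustomSeparated", {""}) << '\n'; /// 1
}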
+ if (!in.eof() && (*in.position() == ',' || *in.position() == ';')) + ++in.position(); + } + skipWhitespaceIfAny(in); if (in.eof()) return {}; diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h index 0551aa8b64e..efa0604fc6c 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h @@ -81,6 +81,7 @@ private: DataTypes readRowAndGetDataTypes() override; JSONCompactEachRowFormatReader reader; + bool first_row = true; }; } diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp index 82cf44890a9..6d546a3b772 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp @@ -318,9 +318,24 @@ std::unordered_map JSONEachRowSchemaReader::readRowAndGetNa { skipBOMIfExists(in); skipWhitespaceIfAny(in); - checkChar('[', in); + if (checkChar('[', in)) + data_in_square_brackets = true; first_row = false; } + else + { + skipWhitespaceIfAny(in); + /// If data is in square brackets then ']' means the end of data. + if (data_in_square_brackets && checkChar(']', in)) + return {}; + + /// ';' means end of data. + if (checkChar(';', in)) + return {}; + + /// There may be optional ',' between rows. + checkChar(',', in); + } skipWhitespaceIfAny(in); if (in.eof()) diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h index 323909a7730..c711d3ef246 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h @@ -95,6 +95,7 @@ private: bool json_strings; bool first_row = true; + bool data_in_square_brackets = false; }; } diff --git a/src/Processors/Formats/Impl/JSONRowOutputFormat.cpp b/src/Processors/Formats/Impl/JSONRowOutputFormat.cpp index 8e2b2617c4c..8130b2b4cb1 100644 --- a/src/Processors/Formats/Impl/JSONRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONRowOutputFormat.cpp @@ -284,6 +284,7 @@ void registerOutputFormatJSON(FormatFactory & factory) }); factory.markOutputFormatSupportsParallelFormatting("JSON"); + factory.markFormatHasNoAppendSupport("JSON"); factory.registerOutputFormat("JSONStrings", []( WriteBuffer & buf, @@ -295,6 +296,7 @@ void registerOutputFormatJSON(FormatFactory & factory) }); factory.markOutputFormatSupportsParallelFormatting("JSONStrings"); + factory.markFormatHasNoAppendSupport("JSONStrings"); } } diff --git a/src/Processors/Formats/Impl/NativeFormat.cpp b/src/Processors/Formats/Impl/NativeFormat.cpp index 19e2ede6b65..bd95cfd6376 100644 --- a/src/Processors/Formats/Impl/NativeFormat.cpp +++ b/src/Processors/Formats/Impl/NativeFormat.cpp @@ -15,21 +15,22 @@ namespace DB class NativeInputFormat final : public IInputFormat { public: - NativeInputFormat(ReadBuffer & buf, const Block & header) - : IInputFormat(header, buf) - , reader(buf, header, 0) {} + NativeInputFormat(ReadBuffer & buf, const Block & header_) + : IInputFormat(header_, buf) + , reader(std::make_unique(buf, header_, 0)) + , header(header_) {} String getName() const override { return "Native"; } void resetParser() override { IInputFormat::resetParser(); - reader.resetParser(); + reader->resetParser(); } Chunk generate() override { - auto block = reader.read(); + auto block = reader->read(); if (!block) return {}; @@ -40,8 +41,15 @@ public: return 
Chunk(block.getColumns(), num_rows); } + void setReadBuffer(ReadBuffer & in_) override + { + reader = std::make_unique(in_, header, 0); + IInputFormat::setReadBuffer(in_); + } + private: - NativeReader reader; + std::unique_ptr reader; + Block header; }; class NativeOutputFormat final : public IOutputFormat diff --git a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp index 651b9545c81..106b71a9df5 100644 --- a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp @@ -526,6 +526,7 @@ void registerOutputFormatORC(FormatFactory & factory) { return std::make_shared(buf, sample, format_settings); }); + factory.markFormatHasNoAppendSupport("ORC"); } } diff --git a/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp b/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp index 213226c9d68..bfdb9de7d26 100644 --- a/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp @@ -2,17 +2,12 @@ #include #include #include -#include namespace DB { void ParallelParsingInputFormat::segmentatorThreadFunction(ThreadGroupStatusPtr thread_group) { - SCOPE_EXIT_SAFE( - if (thread_group) - CurrentThread::detachQueryIfNotDetached(); - ); if (thread_group) CurrentThread::attachTo(thread_group); @@ -59,12 +54,8 @@ void ParallelParsingInputFormat::segmentatorThreadFunction(ThreadGroupStatusPtr void ParallelParsingInputFormat::parserThreadFunction(ThreadGroupStatusPtr thread_group, size_t current_ticket_number) { - SCOPE_EXIT_SAFE( - if (thread_group) - CurrentThread::detachQueryIfNotDetached(); - ); if (thread_group) - CurrentThread::attachTo(thread_group); + CurrentThread::attachToIfDetached(thread_group); const auto parser_unit_number = current_ticket_number % processing_units.size(); auto & unit = processing_units[parser_unit_number]; diff --git a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp index a10858ee668..68e2ae1c6eb 100644 --- a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp @@ -85,6 +85,7 @@ void registerOutputFormatParquet(FormatFactory & factory) { return std::make_shared(buf, sample, format_settings); }); + factory.markFormatHasNoAppendSupport("Parquet"); } } diff --git a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp index d981b92e1dd..5c5b99f61da 100644 --- a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp @@ -235,5 +235,19 @@ void registerOutputFormatTemplate(FormatFactory & factory) return std::make_shared(sample, buf, settings, resultset_format, row_format, settings.template_settings.row_between_delimiter); }); + + factory.registerAppendSupportChecker("Template", [](const FormatSettings & settings) + { + if (settings.template_settings.resultset_format.empty()) + return true; + auto resultset_format = ParsedTemplateFormatString( + FormatSchemaInfo(settings.template_settings.resultset_format, "Template", false, + settings.schema.is_server, settings.schema.format_schema_path), + [&](const String & partName) + { + return static_cast(TemplateBlockOutputFormat::stringToResultsetPart(partName)); + }); + return resultset_format.delimiters.empty() || resultset_format.delimiters.back().empty(); + }); } } diff --git 
a/src/Processors/Formats/Impl/XMLRowOutputFormat.cpp b/src/Processors/Formats/Impl/XMLRowOutputFormat.cpp index d96981fc091..cc2b37189f9 100644 --- a/src/Processors/Formats/Impl/XMLRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/XMLRowOutputFormat.cpp @@ -256,6 +256,7 @@ void registerOutputFormatXML(FormatFactory & factory) }); factory.markOutputFormatSupportsParallelFormatting("XML"); + factory.markFormatHasNoAppendSupport("XML"); } } diff --git a/src/Processors/Sources/MySQLSource.cpp b/src/Processors/Sources/MySQLSource.cpp index b0cb62340e9..538aba9d1f3 100644 --- a/src/Processors/Sources/MySQLSource.cpp +++ b/src/Processors/Sources/MySQLSource.cpp @@ -19,6 +19,7 @@ #include #include #include +#include namespace DB @@ -145,8 +146,7 @@ namespace break; case ValueType::vtUInt64: { - //we don't have enum enum_field_types definition in mysqlxx/Types.h, so we use literal values directly here. - if (static_cast(mysql_type) == 16) + if (mysql_type == enum_field_types::MYSQL_TYPE_BIT) { size_t n = value.size(); UInt64 val = 0UL; @@ -175,9 +175,32 @@ namespace read_bytes_size += 4; break; case ValueType::vtInt64: - assert_cast(column).insertValue(value.getInt()); - read_bytes_size += 8; + { + if (mysql_type == enum_field_types::MYSQL_TYPE_TIME) + { + String time_str(value.data(), value.size()); + bool negative = time_str.starts_with("-"); + if (negative) time_str = time_str.substr(1); + std::vector hhmmss; + boost::split(hhmmss, time_str, [](char c) { return c == ':'; }); + Int64 v = 0; + if (hhmmss.size() == 3) + { + v = (std::stoi(hhmmss[0]) * 3600 + std::stoi(hhmmss[1]) * 60 + std::stold(hhmmss[2])) * 1000000; + } + else + throw Exception("Unsupported value format", ErrorCodes::NOT_IMPLEMENTED); + if (negative) v = -v; + assert_cast(column).insertValue(v); + read_bytes_size += value.size(); + } + else + { + assert_cast(column).insertValue(value.getInt()); + read_bytes_size += 8; + } break; + } case ValueType::vtFloat32: assert_cast(column).insertValue(value.getDouble()); read_bytes_size += 4; diff --git a/src/Processors/Transforms/AggregatingInOrderTransform.cpp b/src/Processors/Transforms/AggregatingInOrderTransform.cpp index fae1ede1f9c..857f362c4be 100644 --- a/src/Processors/Transforms/AggregatingInOrderTransform.cpp +++ b/src/Processors/Transforms/AggregatingInOrderTransform.cpp @@ -255,6 +255,8 @@ void AggregatingInOrderTransform::generate() res.getByPosition(i + res_key_columns.size()).column = std::move(res_aggregate_columns[i]); to_push_chunk = convertToChunk(res); + if (!to_push_chunk.getNumRows()) + return; /// Clear arenas to allow to free them, when chunk will reach the end of pipeline. /// It's safe clear them here, because columns with aggregate functions already holds them. 
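The new MYSQL_TYPE_TIME branch in MySQLSource parses the textual TIME value ("hh:mm:ss[.fraction]", possibly negative, with hours allowed to exceed 23) and stores it as a signed count of microseconds. A standalone re-implementation of just that conversion (std::getline splitting replaces boost::split, and the surrounding column/ValueType plumbing is omitted):

#include <cstdint>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

int64_t mysqlTimeToMicroseconds(std::string time_str)
{
    bool negative = !time_str.empty() && time_str.front() == '-';
    if (negative)
        time_str.erase(0, 1);

    std::vector<std::string> hhmmss;
    std::stringstream ss(time_str);
    for (std::string part; std::getline(ss, part, ':');)
        hhmmss.push_back(part);

    if (hhmmss.size() != 3)
        throw std::runtime_error("Unsupported TIME value format: " + time_str);

    long double seconds = std::stoi(hhmmss[0]) * 3600.0L
                        + std::stoi(hhmmss[1]) * 60.0L
                        + std::stold(hhmmss[2]);          /// keeps fractional seconds

    int64_t value = static_cast<int64_t>(seconds * 1000000.0L);
    return negative ? -value : value;
}

int main()
{
    std::cout << mysqlTimeToMicroseconds("838:59:59") << '\n';    /// 3020399000000
    std::cout << mysqlTimeToMicroseconds("-00:00:01.5") << '\n';  /// -1500000
}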
diff --git a/src/Processors/Transforms/MergingAggregatedMemoryEfficientTransform.cpp b/src/Processors/Transforms/MergingAggregatedMemoryEfficientTransform.cpp index e1fa965f025..d01a809e666 100644 --- a/src/Processors/Transforms/MergingAggregatedMemoryEfficientTransform.cpp +++ b/src/Processors/Transforms/MergingAggregatedMemoryEfficientTransform.cpp @@ -1,9 +1,9 @@ #include - -#include #include #include +#include #include +#include namespace DB { @@ -250,22 +250,30 @@ void GroupingAggregatedTransform::addChunk(Chunk chunk, size_t input) if (!info) throw Exception("Chunk info was not set for chunk in GroupingAggregatedTransform.", ErrorCodes::LOGICAL_ERROR); - const auto * agg_info = typeid_cast(info.get()); - if (!agg_info) - throw Exception("Chunk should have AggregatedChunkInfo in GroupingAggregatedTransform.", ErrorCodes::LOGICAL_ERROR); + if (const auto * agg_info = typeid_cast(info.get())) + { + Int32 bucket = agg_info->bucket_num; + bool is_overflows = agg_info->is_overflows; - Int32 bucket = agg_info->bucket_num; - bool is_overflows = agg_info->is_overflows; - - if (is_overflows) - overflow_chunks.emplace_back(std::move(chunk)); - else if (bucket < 0) + if (is_overflows) + overflow_chunks.emplace_back(std::move(chunk)); + else if (bucket < 0) + single_level_chunks.emplace_back(std::move(chunk)); + else + { + chunks_map[bucket].emplace_back(std::move(chunk)); + has_two_level = true; + last_bucket_number[input] = bucket; + } + } + else if (const auto * in_order_info = typeid_cast(info.get())) + { single_level_chunks.emplace_back(std::move(chunk)); + } else { - chunks_map[bucket].emplace_back(std::move(chunk)); - has_two_level = true; - last_bucket_number[input] = bucket; + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Chunk should have AggregatedChunkInfo/ChunkInfoWithAllocatedBytes in GroupingAggregatedTransform."); } } @@ -318,16 +326,27 @@ void MergingAggregatedBucketTransform::transform(Chunk & chunk) throw Exception("Chunk info was not set for chunk in MergingAggregatedBucketTransform.", ErrorCodes::LOGICAL_ERROR); - const auto * agg_info = typeid_cast(cur_info.get()); - if (!agg_info) - throw Exception("Chunk should have AggregatedChunkInfo in MergingAggregatedBucketTransform.", - ErrorCodes::LOGICAL_ERROR); + if (const auto * agg_info = typeid_cast(cur_info.get())) + { + Block block = header.cloneWithColumns(cur_chunk.detachColumns()); + block.info.is_overflows = agg_info->is_overflows; + block.info.bucket_num = agg_info->bucket_num; - Block block = header.cloneWithColumns(cur_chunk.detachColumns()); - block.info.is_overflows = agg_info->is_overflows; - block.info.bucket_num = agg_info->bucket_num; + blocks_list.emplace_back(std::move(block)); + } + else if (const auto * in_order_info = typeid_cast(cur_info.get())) + { + Block block = header.cloneWithColumns(cur_chunk.detachColumns()); + block.info.is_overflows = false; + block.info.bucket_num = -1; - blocks_list.emplace_back(std::move(block)); + blocks_list.emplace_back(std::move(block)); + } + else + { + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Chunk should have AggregatedChunkInfo/ChunkInfoWithAllocatedBytes in MergingAggregatedBucketTransform."); + } } auto res_info = std::make_shared(); @@ -379,7 +398,8 @@ void SortingAggregatedTransform::addChunk(Chunk chunk, size_t from_input) const auto * agg_info = typeid_cast(info.get()); if (!agg_info) - throw Exception("Chunk should have AggregatedChunkInfo in SortingAggregatedTransform.", ErrorCodes::LOGICAL_ERROR); + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Chunk should 
have AggregatedChunkInfo in SortingAggregatedTransform."); Int32 bucket = agg_info->bucket_num; bool is_overflows = agg_info->is_overflows; @@ -389,8 +409,10 @@ void SortingAggregatedTransform::addChunk(Chunk chunk, size_t from_input) else { if (chunks[bucket]) - throw Exception("SortingAggregatedTransform already got bucket with number " + toString(bucket), - ErrorCodes::LOGICAL_ERROR); + { + throw Exception(ErrorCodes::LOGICAL_ERROR, + "SortingAggregatedTransform already got bucket with number {}", bucket); + } chunks[bucket] = std::move(chunk); last_bucket_number[from_input] = bucket; diff --git a/src/Processors/Transforms/MergingAggregatedTransform.cpp b/src/Processors/Transforms/MergingAggregatedTransform.cpp index ddc58d830da..37419f55aae 100644 --- a/src/Processors/Transforms/MergingAggregatedTransform.cpp +++ b/src/Processors/Transforms/MergingAggregatedTransform.cpp @@ -23,7 +23,11 @@ void MergingAggregatedTransform::consume(Chunk chunk) LOG_TRACE(log, "Reading blocks of partially aggregated data."); } - total_input_rows += chunk.getNumRows(); + size_t input_rows = chunk.getNumRows(); + if (!input_rows) + return; + + total_input_rows += input_rows; ++total_input_blocks; const auto & info = chunk.getChunkInfo(); diff --git a/src/Server/KeeperTCPHandler.cpp b/src/Server/KeeperTCPHandler.cpp index e7da63c6927..07964c29577 100644 --- a/src/Server/KeeperTCPHandler.cpp +++ b/src/Server/KeeperTCPHandler.cpp @@ -287,7 +287,7 @@ Poco::Timespan KeeperTCPHandler::receiveHandshake(int32_t handshake_length) void KeeperTCPHandler::runImpl() { - setThreadName("TstKprHandler"); + setThreadName("KeeperHandler"); ThreadStatus thread_status; auto global_receive_timeout = global_context->getSettingsRef().receive_timeout; auto global_send_timeout = global_context->getSettingsRef().send_timeout; diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index 8a8f3ebde96..13e8597e2f2 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -14,9 +14,8 @@ #include #include -#include +#include -#include #include #include #include @@ -28,7 +27,6 @@ #include #include - #include #include #include @@ -52,7 +50,9 @@ namespace ErrorCodes { extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int ACCESS_DENIED; + extern const int DATABASE_ACCESS_DENIED; extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; + extern const int BAD_ARGUMENTS; extern const int LOGICAL_ERROR; } namespace @@ -139,20 +139,23 @@ StorageHDFS::StorageHDFS( ASTPtr partition_by_) : IStorage(table_id_) , WithContext(context_) - , uri(uri_) + , uris({uri_}) , format_name(format_name_) , compression_method(compression_method_) , distributed_processing(distributed_processing_) , partition_by(partition_by_) { - context_->getRemoteHostFilter().checkURL(Poco::URI(uri)); - checkHDFSURL(uri); + context_->getRemoteHostFilter().checkURL(Poco::URI(uri_)); + checkHDFSURL(uri_); + + String path = uri_.substr(uri_.find('/', uri_.find("//") + 2)); + is_path_with_globs = path.find_first_of("*?{") != std::string::npos; StorageInMemoryMetadata storage_metadata; if (columns_.empty()) { - auto columns = getTableStructureFromData(format_name, uri, compression_method, context_); + auto columns = getTableStructureFromData(format_name, uri_, compression_method, context_); storage_metadata.setColumns(columns); } else @@ -217,6 +220,39 @@ private: Strings::iterator uris_iter; }; +class HDFSSource::URISIterator::Impl +{ +public: + explicit Impl(const std::vector & uris_, ContextPtr context) + { + auto 
path_and_uri = getPathFromUriAndUriWithoutPath(uris_[0]); + HDFSBuilderWrapper builder = createHDFSBuilder(path_and_uri.second + "/", context->getGlobalContext()->getConfigRef()); + HDFSFSPtr fs = createHDFSFS(builder.get()); + for (const auto & uri : uris_) + { + path_and_uri = getPathFromUriAndUriWithoutPath(uri); + if (!hdfsExists(fs.get(), path_and_uri.first.c_str())) + uris.push_back(uri); + } + uris_iter = uris.begin(); + } + + String next() + { + std::lock_guard lock(mutex); + if (uris_iter == uris.end()) + return ""; + auto key = *uris_iter; + ++uris_iter; + return key; + } + +private: + std::mutex mutex; + Strings uris; + Strings::iterator uris_iter; +}; + Block HDFSSource::getHeader(const StorageMetadataPtr & metadata_snapshot, bool need_path_column, bool need_file_column) { auto header = metadata_snapshot->getSampleBlock(); @@ -249,6 +285,15 @@ String HDFSSource::DisclosedGlobIterator::next() return pimpl->next(); } +HDFSSource::URISIterator::URISIterator(const std::vector & uris_, ContextPtr context) + : pimpl(std::make_shared(uris_, context)) +{ +} + +String HDFSSource::URISIterator::next() +{ + return pimpl->next(); +} HDFSSource::HDFSSource( StorageHDFSPtr storage_, @@ -283,9 +328,8 @@ bool HDFSSource::initialize() current_path = (*file_iterator)(); if (current_path.empty()) return false; - const size_t begin_of_path = current_path.find('/', current_path.find("//") + 2); - const String path_from_uri = current_path.substr(begin_of_path); - const String uri_without_path = current_path.substr(0, begin_of_path); + + const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(current_path); auto compression = chooseCompressionMethod(path_from_uri, storage->compression_method); read_buf = wrapReadBufferWithCompressionMethod(std::make_unique(uri_without_path, path_from_uri, getContext()->getGlobalContext()->getConfigRef()), compression); @@ -468,15 +512,23 @@ Pipe StorageHDFS::read( return callback(); }); } - else + else if (is_path_with_globs) { /// Iterate through disclosed globs and make a source for each file - auto glob_iterator = std::make_shared(context_, uri); + auto glob_iterator = std::make_shared(context_, uris[0]); iterator_wrapper = std::make_shared([glob_iterator]() { return glob_iterator->next(); }); } + else + { + auto uris_iterator = std::make_shared(uris, context_); + iterator_wrapper = std::make_shared([uris_iterator]() + { + return uris_iterator->next(); + }); + } Pipes pipes; auto this_ptr = std::static_pointer_cast(shared_from_this()); @@ -503,9 +555,11 @@ Pipe StorageHDFS::read( return Pipe::unitePipes(std::move(pipes)); } -SinkToStoragePtr StorageHDFS::write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr /*context*/) +SinkToStoragePtr StorageHDFS::write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context_) { - bool has_wildcards = uri.find(PartitionedSink::PARTITION_ID_WILDCARD) != String::npos; + String current_uri = uris.back(); + + bool has_wildcards = current_uri.find(PartitionedSink::PARTITION_ID_WILDCARD) != String::npos; const auto * insert_query = dynamic_cast(query.get()); auto partition_by_ast = insert_query ? (insert_query->partition_by ? 
insert_query->partition_by : partition_by) : nullptr; bool is_partitioned_implementation = partition_by_ast && has_wildcards; @@ -514,34 +568,70 @@ SinkToStoragePtr StorageHDFS::write(const ASTPtr & query, const StorageMetadataP { return std::make_shared( partition_by_ast, - uri, + current_uri, format_name, metadata_snapshot->getSampleBlock(), - getContext(), - chooseCompressionMethod(uri, compression_method)); + context_, + chooseCompressionMethod(current_uri, compression_method)); } else { - return std::make_shared(uri, + if (is_path_with_globs) + throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, "URI '{}' contains globs, so the table is in readonly mode", uris.back()); + + const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(current_uri); + + HDFSBuilderWrapper builder = createHDFSBuilder(uri_without_path + "/", context_->getGlobalContext()->getConfigRef()); + HDFSFSPtr fs = createHDFSFS(builder.get()); + + bool truncate_on_insert = context_->getSettingsRef().hdfs_truncate_on_insert; + if (!truncate_on_insert && !hdfsExists(fs.get(), path_from_uri.c_str())) + { + if (context_->getSettingsRef().hdfs_create_new_file_on_insert) + { + auto pos = uris[0].find_first_of('.', uris[0].find_last_of('/')); + size_t index = uris.size(); + String new_uri; + do + { + new_uri = uris[0].substr(0, pos) + "." + std::to_string(index) + (pos == std::string::npos ? "" : uris[0].substr(pos)); + ++index; + } + while (!hdfsExists(fs.get(), new_uri.c_str())); + uris.push_back(new_uri); + current_uri = new_uri; + } + else + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "File with path {} already exists. If you want to overwrite it, enable setting hdfs_truncate_on_insert, " + "if you want to create new file on each insert, enable setting hdfs_create_new_file_on_insert", + path_from_uri); + } + + return std::make_shared(current_uri, format_name, metadata_snapshot->getSampleBlock(), - getContext(), - chooseCompressionMethod(uri, compression_method)); + context_, + chooseCompressionMethod(current_uri, compression_method)); } } void StorageHDFS::truncate(const ASTPtr & /* query */, const StorageMetadataPtr &, ContextPtr local_context, TableExclusiveLockHolder &) { - const size_t begin_of_path = uri.find('/', uri.find("//") + 2); - const String path = uri.substr(begin_of_path); - const String url = uri.substr(0, begin_of_path); + const size_t begin_of_path = uris[0].find('/', uris[0].find("//") + 2); + const String url = uris[0].substr(0, begin_of_path); HDFSBuilderWrapper builder = createHDFSBuilder(url + "/", local_context->getGlobalContext()->getConfigRef()); HDFSFSPtr fs = createHDFSFS(builder.get()); - int ret = hdfsDelete(fs.get(), path.data(), 0); - if (ret) - throw Exception(ErrorCodes::ACCESS_DENIED, "Unable to truncate hdfs table: {}", std::string(hdfsGetLastError())); + for (const auto & uri : uris) + { + const String path = uri.substr(begin_of_path); + int ret = hdfsDelete(fs.get(), path.data(), 0); + if (ret) + throw Exception(ErrorCodes::ACCESS_DENIED, "Unable to truncate hdfs table: {}", std::string(hdfsGetLastError())); + } } diff --git a/src/Storages/HDFS/StorageHDFS.h b/src/Storages/HDFS/StorageHDFS.h index 76cb74da935..ff487d93b92 100644 --- a/src/Storages/HDFS/StorageHDFS.h +++ b/src/Storages/HDFS/StorageHDFS.h @@ -31,7 +31,7 @@ public: size_t max_block_size, unsigned num_streams) override; - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr /*context*/) override; + SinkToStoragePtr write(const ASTPtr & query, 
const StorageMetadataPtr & metadata_snapshot, ContextPtr context) override; void truncate( const ASTPtr & query, @@ -70,11 +70,12 @@ protected: ASTPtr partition_by = nullptr); private: - const String uri; + std::vector uris; String format_name; String compression_method; const bool distributed_processing; ASTPtr partition_by; + bool is_path_with_globs; Poco::Logger * log = &Poco::Logger::get("StorageHDFS"); }; @@ -95,6 +96,17 @@ public: std::shared_ptr pimpl; }; + class URISIterator + { + public: + URISIterator(const std::vector & uris_, ContextPtr context); + String next(); + private: + class Impl; + /// shared_ptr to have copy constructor + std::shared_ptr pimpl; + }; + using IteratorWrapper = std::function; using StorageHDFSPtr = std::shared_ptr; diff --git a/src/Storages/HDFS/WriteBufferFromHDFS.cpp b/src/Storages/HDFS/WriteBufferFromHDFS.cpp index 9f5e3c1f7d2..2addfc0069f 100644 --- a/src/Storages/HDFS/WriteBufferFromHDFS.cpp +++ b/src/Storages/HDFS/WriteBufferFromHDFS.cpp @@ -15,7 +15,6 @@ namespace ErrorCodes extern const int NETWORK_ERROR; extern const int CANNOT_OPEN_FILE; extern const int CANNOT_FSYNC; -extern const int BAD_ARGUMENTS; } @@ -38,12 +37,6 @@ struct WriteBufferFromHDFS::WriteBufferFromHDFSImpl const size_t begin_of_path = hdfs_uri.find('/', hdfs_uri.find("//") + 2); const String path = hdfs_uri.substr(begin_of_path); - if (path.find_first_of("*?{") != std::string::npos) - throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "URI '{}' contains globs, so the table is in readonly mode", hdfs_uri); - - if (!hdfsExists(fs.get(), path.c_str())) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "File {} already exists", path); - fout = hdfsOpenFile(fs.get(), path.c_str(), flags, 0, replication_, 0); /// O_WRONLY meaning create or overwrite i.e., implies O_TRUNCAT here if (fout == nullptr) diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index 2f52a7d5490..9e106d8fbdc 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -218,7 +218,6 @@ public: /// Extract data from the backup and put it to the storage. virtual RestoreDataTasks restoreFromBackup(const BackupPtr & backup, const String & data_path_in_backup, const ASTs & partitions, ContextMutablePtr context); -protected: /// Returns whether the column is virtual - by default all columns are real. /// Initially reserved virtual column name may be shadowed by real column. 
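When hdfs_truncate_on_insert is off and the target file already exists, the StorageHDFS::write change above either throws or, with hdfs_create_new_file_on_insert enabled, probes file.1.ext, file.2.ext, ... until an unused name is found and appends it to uris. A standalone sketch of that probing loop; the existence check is injected instead of calling hdfsExists, and the starting index is a parameter rather than uris.size():

#include <functional>
#include <iostream>
#include <set>
#include <string>

/// Insert an increasing index before the extension of the original URI until a
/// path that does not exist yet is found.
std::string makeNewUri(const std::string & first_uri, size_t start_index,
                       const std::function<bool(const std::string &)> & exists)
{
    auto pos = first_uri.find_first_of('.', first_uri.find_last_of('/'));
    size_t index = start_index;
    std::string new_uri;
    do
    {
        new_uri = first_uri.substr(0, pos) + "." + std::to_string(index)
                + (pos == std::string::npos ? "" : first_uri.substr(pos));
        ++index;
    } while (exists(new_uri));
    return new_uri;
}

int main()
{
    std::set<std::string> existing{"hdfs://host/data/file.1.parquet"};
    auto exists = [&](const std::string & uri) { return existing.count(uri) > 0; };

    std::cout << makeNewUri("hdfs://host/data/file.parquet", 1, exists) << '\n';
    /// hdfs://host/data/file.2.parquet
}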
bool isVirtualColumn(const String & column_name, const StorageMetadataPtr & metadata_snapshot) const; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index e4ca3e00c0f..96727784046 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -1,6 +1,7 @@ #include "IMergeTreeDataPart.h" #include +#include #include #include #include @@ -416,7 +417,7 @@ std::pair IMergeTreeDataPart::getMinMaxTime() const } -void IMergeTreeDataPart::setColumns(const NamesAndTypesList & new_columns, const SerializationInfoByName & new_infos) +void IMergeTreeDataPart::setColumns(const NamesAndTypesList & new_columns) { columns = new_columns; @@ -425,21 +426,12 @@ void IMergeTreeDataPart::setColumns(const NamesAndTypesList & new_columns, const size_t pos = 0; for (const auto & column : columns) - { column_name_to_position.emplace(column.name, pos++); +} - auto it = new_infos.find(column.name); - if (it != new_infos.end()) - { - auto & old_info = serialization_infos[column.name]; - const auto & new_info = it->second; - - if (old_info) - old_info->replaceData(*new_info); - else - old_info = new_info->clone(); - } - } +void IMergeTreeDataPart::setSerializationInfos(const SerializationInfoByName & new_infos) +{ + serialization_infos = new_infos; } SerializationPtr IMergeTreeDataPart::getSerialization(const NameAndTypePair & column) const @@ -1098,7 +1090,8 @@ void IMergeTreeDataPart::loadColumns(bool require) if (volume->getDisk()->exists(path)) infos.readJSON(*volume->getDisk()->readFile(path)); - setColumns(loaded_columns, infos); + setColumns(loaded_columns); + setSerializationInfos(infos); } bool IMergeTreeDataPart::shallParticipateInMerges(const StoragePolicyPtr & storage_policy) const @@ -1638,13 +1631,21 @@ UInt32 IMergeTreeDataPart::getNumberOfRefereneces() const } -String IMergeTreeDataPart::getZeroLevelPartBlockID() const +String IMergeTreeDataPart::getZeroLevelPartBlockID(std::string_view token) const { if (info.level != 0) throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to get block id for non zero level part {}", name); SipHash hash; - checksums.computeTotalChecksumDataOnly(hash); + if (token.empty()) + { + checksums.computeTotalChecksumDataOnly(hash); + } + else + { + hash.update(token.data(), token.size()); + } + union { char bytes[16]; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 09449dc7521..c96cad4b039 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -128,11 +128,14 @@ public: String getTypeName() const { return getType().toString(); } - void setColumns(const NamesAndTypesList & new_columns, const SerializationInfoByName & new_infos = {}); + void setColumns(const NamesAndTypesList & new_columns); const NamesAndTypesList & getColumns() const { return columns; } + + void setSerializationInfos(const SerializationInfoByName & new_infos); + const SerializationInfoByName & getSerializationInfos() const { return serialization_infos; } - SerializationInfoByName & getSerializationInfos() { return serialization_infos; } + SerializationPtr getSerialization(const NameAndTypePair & column) const; /// Throws an exception if part is not stored in on-disk format. @@ -174,7 +177,8 @@ public: bool isEmpty() const { return rows_count == 0; } /// Compute part block id for zero level part. Otherwise throws an exception. 
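getZeroLevelPartBlockID() above now takes an optional token: with an empty token the block id is still a hash of the part's data checksums, otherwise it is a hash of the token, so two inserts carrying the same insert_deduplication_token map to the same id regardless of their data. A rough sketch of that idea only; std::hash stands in for the 128-bit SipHash, and the exact id format of the real code differs:

#include <functional>
#include <iostream>
#include <string>
#include <string_view>

std::string zeroLevelPartBlockID(const std::string & partition_id,
                                 std::string_view token,
                                 const std::string & data_checksums)
{
    /// Empty token: hash the data. Non-empty token: hash the token instead.
    size_t hash = token.empty() ? std::hash<std::string>{}(data_checksums)
                                : std::hash<std::string>{}(std::string(token));
    return partition_id + "_" + std::to_string(hash);
}

int main()
{
    std::cout << zeroLevelPartBlockID("202201", "",         "checksums-A") << '\n';
    std::cout << zeroLevelPartBlockID("202201", "user-tok", "checksums-A") << '\n';
    std::cout << zeroLevelPartBlockID("202201", "user-tok", "checksums-B") << '\n';
    /// the last two ids are identical: same token => deduplicated even though the data differs
}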
- String getZeroLevelPartBlockID() const; + /// If token is not empty, block id is calculated based on it instead of block data + String getZeroLevelPartBlockID(std::string_view token) const; const MergeTreeData & storage; diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index a49e775b6f4..a1155de7921 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -191,7 +191,8 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() infos.add(part->getSerializationInfos()); } - global_ctx->new_data_part->setColumns(global_ctx->storage_columns, infos); + global_ctx->new_data_part->setColumns(global_ctx->storage_columns); + global_ctx->new_data_part->setSerializationInfos(infos); const auto & local_part_min_ttl = global_ctx->new_data_part->ttl_infos.part_min_ttl; if (local_part_min_ttl && local_part_min_ttl <= global_ctx->time_of_merge) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index b38b6d3b9b2..5d6cbbe76c1 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -70,7 +70,6 @@ #include #include -#include #include #include @@ -1593,12 +1592,8 @@ void MergeTreeData::clearPartsFromFilesystem(const DataPartsVector & parts_to_re { pool.scheduleOrThrowOnError([&, thread_group = CurrentThread::getGroup()] { - SCOPE_EXIT_SAFE( - if (thread_group) - CurrentThread::detachQueryIfNotDetached(); - ); if (thread_group) - CurrentThread::attachTo(thread_group); + CurrentThread::attachToIfDetached(thread_group); LOG_DEBUG(log, "Removing part from filesystem {}", part->name); part->remove(); @@ -2445,7 +2440,12 @@ MergeTreeData::DataPartsVector MergeTreeData::getActivePartsToReplace( } -bool MergeTreeData::renameTempPartAndAdd(MutableDataPartPtr & part, SimpleIncrement * increment, Transaction * out_transaction, MergeTreeDeduplicationLog * deduplication_log) +bool MergeTreeData::renameTempPartAndAdd( + MutableDataPartPtr & part, + SimpleIncrement * increment, + Transaction * out_transaction, + MergeTreeDeduplicationLog * deduplication_log, + std::string_view deduplication_token) { if (out_transaction && &out_transaction->data != this) throw Exception("MergeTreeData::Transaction for one table cannot be used with another. It is a bug.", @@ -2454,7 +2454,7 @@ bool MergeTreeData::renameTempPartAndAdd(MutableDataPartPtr & part, SimpleIncrem DataPartsVector covered_parts; { auto lock = lockParts(); - if (!renameTempPartAndReplace(part, increment, out_transaction, lock, &covered_parts, deduplication_log)) + if (!renameTempPartAndReplace(part, increment, out_transaction, lock, &covered_parts, deduplication_log, deduplication_token)) return false; } if (!covered_parts.empty()) @@ -2466,8 +2466,13 @@ bool MergeTreeData::renameTempPartAndAdd(MutableDataPartPtr & part, SimpleIncrem bool MergeTreeData::renameTempPartAndReplace( - MutableDataPartPtr & part, SimpleIncrement * increment, Transaction * out_transaction, - std::unique_lock & lock, DataPartsVector * out_covered_parts, MergeTreeDeduplicationLog * deduplication_log) + MutableDataPartPtr & part, + SimpleIncrement * increment, + Transaction * out_transaction, + std::unique_lock & lock, + DataPartsVector * out_covered_parts, + MergeTreeDeduplicationLog * deduplication_log, + std::string_view deduplication_token) { if (out_transaction && &out_transaction->data != this) throw Exception("MergeTreeData::Transaction for one table cannot be used with another. 
It is a bug.", @@ -2529,7 +2534,7 @@ bool MergeTreeData::renameTempPartAndReplace( /// deduplication. if (deduplication_log) { - String block_id = part->getZeroLevelPartBlockID(); + String block_id = part->getZeroLevelPartBlockID(deduplication_token); auto res = deduplication_log->addPart(block_id, part_info); if (!res.second) { diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 366a8157289..9cac8ad58de 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -499,7 +499,12 @@ public: /// active set later with out_transaction->commit()). /// Else, commits the part immediately. /// Returns true if part was added. Returns false if part is covered by bigger part. - bool renameTempPartAndAdd(MutableDataPartPtr & part, SimpleIncrement * increment = nullptr, Transaction * out_transaction = nullptr, MergeTreeDeduplicationLog * deduplication_log = nullptr); + bool renameTempPartAndAdd( + MutableDataPartPtr & part, + SimpleIncrement * increment = nullptr, + Transaction * out_transaction = nullptr, + MergeTreeDeduplicationLog * deduplication_log = nullptr, + std::string_view deduplication_token = std::string_view()); /// The same as renameTempPartAndAdd but the block range of the part can contain existing parts. /// Returns all parts covered by the added part (in ascending order). @@ -509,9 +514,13 @@ public: /// Low-level version of previous one, doesn't lock mutex bool renameTempPartAndReplace( - MutableDataPartPtr & part, SimpleIncrement * increment, Transaction * out_transaction, DataPartsLock & lock, - DataPartsVector * out_covered_parts = nullptr, MergeTreeDeduplicationLog * deduplication_log = nullptr); - + MutableDataPartPtr & part, + SimpleIncrement * increment, + Transaction * out_transaction, + DataPartsLock & lock, + DataPartsVector * out_covered_parts = nullptr, + MergeTreeDeduplicationLog * deduplication_log = nullptr, + std::string_view deduplication_token = std::string_view()); /// Remove parts from working set immediately (without wait for background /// process). Transfer part state to temporary. Have very limited usage only diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index cb8ab89d34e..d5904ba9840 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -1,5 +1,4 @@ #include /// For calculations related to sampling coefficients. 
-#include #include #include @@ -991,9 +990,8 @@ RangesInDataParts MergeTreeDataSelectExecutor::filterPartsByPrimaryKeyAndSkipInd for (size_t part_index = 0; part_index < parts.size(); ++part_index) pool.scheduleOrThrowOnError([&, part_index, thread_group = CurrentThread::getGroup()] { - SCOPE_EXIT_SAFE(if (thread_group) CurrentThread::detachQueryIfNotDetached();); if (thread_group) - CurrentThread::attachTo(thread_group); + CurrentThread::attachToIfDetached(thread_group); process_part(part_index); }); diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index 01b3d0e67e2..6308074d0e7 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -376,7 +376,8 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeTempPart( SerializationInfoByName infos(columns, settings); infos.add(block); - new_data_part->setColumns(columns, infos); + new_data_part->setColumns(columns); + new_data_part->setSerializationInfos(infos); new_data_part->rows_count = block.rows(); new_data_part->partition = std::move(partition); new_data_part->minmax_idx = std::move(minmax_idx); @@ -474,7 +475,8 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeProjectionPartImpl( SerializationInfoByName infos(columns, settings); infos.add(block); - new_data_part->setColumns(columns, infos); + new_data_part->setColumns(columns); + new_data_part->setSerializationInfos(infos); if (new_data_part->isStoredOnDisk()) { diff --git a/src/Storages/MergeTree/MergeTreeSink.cpp b/src/Storages/MergeTree/MergeTreeSink.cpp index 5e97f80d849..3029fc41bd3 100644 --- a/src/Storages/MergeTree/MergeTreeSink.cpp +++ b/src/Storages/MergeTree/MergeTreeSink.cpp @@ -18,6 +18,7 @@ void MergeTreeSink::onStart() void MergeTreeSink::consume(Chunk chunk) { auto block = getHeader().cloneWithColumns(chunk.detachColumns()); + String block_dedup_token; auto part_blocks = storage.writer.splitBlockIntoParts(block, max_parts_per_block, metadata_snapshot, context); for (auto & current_block : part_blocks) @@ -31,8 +32,20 @@ void MergeTreeSink::consume(Chunk chunk) if (!part) continue; + if (storage.getDeduplicationLog()) + { + const String & dedup_token = context->getSettingsRef().insert_deduplication_token; + if (!dedup_token.empty()) + { + /// multiple blocks can be inserted within the same insert query + /// an ordinal number is added to dedup token to generate a distinctive block id for each block + block_dedup_token = fmt::format("{}_{}", dedup_token, chunk_dedup_seqnum); + ++chunk_dedup_seqnum; + } + } + /// Part can be deduplicated, so increment counters and add to part log only if it's really added - if (storage.renameTempPartAndAdd(part, &storage.increment, nullptr, storage.getDeduplicationLog())) + if (storage.renameTempPartAndAdd(part, &storage.increment, nullptr, storage.getDeduplicationLog(), block_dedup_token)) { PartLog::addNewPart(storage.getContext(), part, watch.elapsed()); diff --git a/src/Storages/MergeTree/MergeTreeSink.h b/src/Storages/MergeTree/MergeTreeSink.h index 60ac62c7592..96231fe668c 100644 --- a/src/Storages/MergeTree/MergeTreeSink.h +++ b/src/Storages/MergeTree/MergeTreeSink.h @@ -36,6 +36,7 @@ private: StorageMetadataPtr metadata_snapshot; size_t max_parts_per_block; ContextPtr context; + uint64_t chunk_dedup_seqnum = 0; /// input chunk ordinal number in case of dedup token }; } diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.cpp b/src/Storages/MergeTree/MergedBlockOutputStream.cpp index 
cbdbb2339df..5274118df29 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.cpp +++ b/src/Storages/MergeTree/MergedBlockOutputStream.cpp @@ -72,21 +72,20 @@ void MergedBlockOutputStream::writeSuffixAndFinalizePart( projection_part->checksums.getTotalSizeOnDisk(), projection_part->checksums.getTotalChecksumUInt128()); - NamesAndTypesList part_columns; - if (!total_columns_list) - part_columns = columns_list; - else - part_columns = *total_columns_list; + if (reset_columns) + { + auto part_columns = total_columns_list ? *total_columns_list : columns_list; + auto serialization_infos = new_part->getSerializationInfos(); - auto & serialization_infos = reset_columns - ? new_serialization_infos - : new_part->getSerializationInfos(); + serialization_infos.replaceData(new_serialization_infos); + removeEmptyColumnsFromPart(new_part, part_columns, serialization_infos, checksums); + + new_part->setColumns(part_columns); + new_part->setSerializationInfos(serialization_infos); + } if (new_part->isStoredOnDisk()) - finalizePartOnDisk(new_part, part_columns, serialization_infos, checksums, sync); - - if (reset_columns) - new_part->setColumns(part_columns, serialization_infos); + finalizePartOnDisk(new_part, checksums, sync); new_part->rows_count = rows_count; new_part->modification_time = time(nullptr); @@ -102,9 +101,7 @@ void MergedBlockOutputStream::writeSuffixAndFinalizePart( } void MergedBlockOutputStream::finalizePartOnDisk( - const MergeTreeData::MutableDataPartPtr & new_part, - NamesAndTypesList & part_columns, - SerializationInfoByName & serialization_infos, + const MergeTreeData::DataPartPtr & new_part, MergeTreeData::DataPart::Checksums & checksums, bool sync) { @@ -171,13 +168,11 @@ void MergedBlockOutputStream::finalizePartOnDisk( out->sync(); } - removeEmptyColumnsFromPart(new_part, part_columns, serialization_infos, checksums); - - if (!serialization_infos.empty()) + if (!new_part->getSerializationInfos().empty()) { auto out = volume->getDisk()->writeFile(part_path + IMergeTreeDataPart::SERIALIZATION_FILE_NAME, 4096); HashingWriteBuffer out_hashing(*out); - serialization_infos.writeJSON(out_hashing); + new_part->getSerializationInfos().writeJSON(out_hashing); checksums.files[IMergeTreeDataPart::SERIALIZATION_FILE_NAME].file_size = out_hashing.count(); checksums.files[IMergeTreeDataPart::SERIALIZATION_FILE_NAME].file_hash = out_hashing.getHash(); out->finalize(); @@ -188,7 +183,7 @@ void MergedBlockOutputStream::finalizePartOnDisk( { /// Write a file with a description of columns. 
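After this refactoring, finalizePartOnDisk() takes only the (now const) part and reads columns and serialization infos from it; the serialization file is streamed through a hashing buffer so its size and hash end up in the part checksums. A small Python sketch of that "write metadata file and record it in checksums" step — the literal serialization file name and the flat checksum dict are simplifying assumptions standing in for SERIALIZATION_FILE_NAME and MergeTreeDataPartChecksums.

import hashlib
import json
import os

def finalize_part_on_disk(part_path: str, serialization_infos: dict,
                          columns: list, checksums: dict) -> None:
    # Sketch: blake2b replaces HashingWriteBuffer's hash.
    if serialization_infos:
        payload = json.dumps(serialization_infos).encode("utf-8")
        with open(os.path.join(part_path, "serialization.json"), "wb") as out:
            out.write(payload)
        checksums["serialization.json"] = {
            "file_size": len(payload),
            "file_hash": hashlib.blake2b(payload, digest_size=16).hexdigest(),
        }
    # Write a plain-text description of the columns, like columns.txt above.
    with open(os.path.join(part_path, "columns.txt"), "w", encoding="utf-8") as out:
        out.write("\n".join(columns) + "\n")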
auto out = volume->getDisk()->writeFile(fs::path(part_path) / "columns.txt", 4096); - part_columns.writeText(*out); + new_part->getColumns().writeText(*out); out->finalize(); if (sync) out->sync(); diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.h b/src/Storages/MergeTree/MergedBlockOutputStream.h index ffc740bf410..21e3c794239 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.h +++ b/src/Storages/MergeTree/MergedBlockOutputStream.h @@ -47,9 +47,7 @@ private: void writeImpl(const Block & block, const IColumn::Permutation * permutation); void finalizePartOnDisk( - const MergeTreeData::MutableDataPartPtr & new_part, - NamesAndTypesList & part_columns, - SerializationInfoByName & serialization_infos, + const MergeTreeData::DataPartPtr & new_part, MergeTreeData::DataPart::Checksums & checksums, bool sync); diff --git a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp index ff79a187490..4c43e93e809 100644 --- a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp +++ b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp @@ -71,13 +71,17 @@ MergedColumnOnlyOutputStream::writeSuffixAndGetChecksums( projection_part->checksums.getTotalChecksumUInt128()); auto columns = new_part->getColumns(); + auto serialization_infos = new_part->getSerializationInfos(); + serialization_infos.replaceData(new_serialization_infos); - auto removed_files = removeEmptyColumnsFromPart(new_part, columns, new_serialization_infos, checksums); + auto removed_files = removeEmptyColumnsFromPart(new_part, columns, serialization_infos, checksums); for (const String & removed_file : removed_files) if (all_checksums.files.count(removed_file)) all_checksums.files.erase(removed_file); - new_part->setColumns(columns, new_serialization_infos); + new_part->setColumns(columns); + new_part->setSerializationInfos(serialization_infos); + return checksums; } diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index c7ce80756ea..985098bb2a3 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -1295,7 +1295,8 @@ bool MutateTask::prepare() ctx->source_part, ctx->updated_header, ctx->storage_columns, ctx->source_part->getSerializationInfos(), ctx->commands_for_part); - ctx->new_data_part->setColumns(new_columns, new_infos); + ctx->new_data_part->setColumns(new_columns); + ctx->new_data_part->setSerializationInfos(new_infos); ctx->new_data_part->partition.assign(ctx->source_part->partition); ctx->disk = ctx->new_data_part->volume->getDisk(); diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index 4d24f491551..8b0751f4bbf 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -1123,7 +1123,7 @@ bool ReplicatedMergeTreeQueue::addFuturePartIfNotCoveredByThem(const String & pa if (isNotCoveredByFuturePartsImpl(entry, part_name, reject_reason, lock)) { - CurrentlyExecuting::setActualPartName(entry, part_name, *this); + CurrentlyExecuting::setActualPartName(entry, part_name, *this, lock); return true; } @@ -1375,7 +1375,8 @@ Int64 ReplicatedMergeTreeQueue::getCurrentMutationVersion(const String & partiti } -ReplicatedMergeTreeQueue::CurrentlyExecuting::CurrentlyExecuting(const ReplicatedMergeTreeQueue::LogEntryPtr & entry_, ReplicatedMergeTreeQueue & queue_) +ReplicatedMergeTreeQueue::CurrentlyExecuting::CurrentlyExecuting( + const 
ReplicatedMergeTreeQueue::LogEntryPtr & entry_, ReplicatedMergeTreeQueue & queue_, std::lock_guard & /* state_lock */) : entry(entry_), queue(queue_) { if (entry->type == ReplicatedMergeTreeLogEntry::DROP_RANGE || entry->type == ReplicatedMergeTreeLogEntry::REPLACE_RANGE) @@ -1397,8 +1398,11 @@ ReplicatedMergeTreeQueue::CurrentlyExecuting::CurrentlyExecuting(const Replicate } -void ReplicatedMergeTreeQueue::CurrentlyExecuting::setActualPartName(ReplicatedMergeTreeQueue::LogEntry & entry, - const String & actual_part_name, ReplicatedMergeTreeQueue & queue) +void ReplicatedMergeTreeQueue::CurrentlyExecuting::setActualPartName( + ReplicatedMergeTreeQueue::LogEntry & entry, + const String & actual_part_name, + ReplicatedMergeTreeQueue & queue, + std::lock_guard & /* state_lock */) { if (!entry.actual_new_part_name.empty()) throw Exception("Entry actual part isn't empty yet. This is a bug.", ErrorCodes::LOGICAL_ERROR); @@ -1477,7 +1481,7 @@ ReplicatedMergeTreeQueue::SelectedEntryPtr ReplicatedMergeTreeQueue::selectEntry } if (entry) - return std::make_shared(entry, std::unique_ptr{ new CurrentlyExecuting(entry, *this) }); + return std::make_shared(entry, std::unique_ptr{new CurrentlyExecuting(entry, *this, lock)}); else return {}; } diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h index 133c154059e..208ce73e5f1 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h @@ -251,11 +251,18 @@ private: friend class ReplicatedMergeTreeQueue; /// Created only in the selectEntryToProcess function. It is called under mutex. - CurrentlyExecuting(const ReplicatedMergeTreeQueue::LogEntryPtr & entry_, ReplicatedMergeTreeQueue & queue_); + CurrentlyExecuting( + const ReplicatedMergeTreeQueue::LogEntryPtr & entry_, + ReplicatedMergeTreeQueue & queue_, + std::lock_guard & state_lock); /// In case of fetch, we determine actual part during the execution, so we need to update entry. It is called under state_mutex. - static void setActualPartName(ReplicatedMergeTreeQueue::LogEntry & entry, const String & actual_part_name, - ReplicatedMergeTreeQueue & queue); + static void setActualPartName( + ReplicatedMergeTreeQueue::LogEntry & entry, + const String & actual_part_name, + ReplicatedMergeTreeQueue & queue, + std::lock_guard & state_lock); + public: ~CurrentlyExecuting(); }; diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index d2bf6ba308b..c14672fe382 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -160,8 +160,16 @@ void ReplicatedMergeTreeSink::consume(Chunk chunk) { /// We add the hash from the data and partition identifier to deduplication ID. /// That is, do not insert the same data to the same partition twice. 
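Both sinks use the new insert_deduplication_token setting the same way: one INSERT can be split into several blocks, so a per-chunk ordinal is appended to the token to give every block a distinct id. A hedged Python sketch of that numbering (class and method names are simplified for illustration):

class InsertSink:
    """Sketch of the per-chunk dedup-token numbering used by MergeTreeSink
    and ReplicatedMergeTreeSink."""
    def __init__(self, insert_deduplication_token: str = ""):
        self.token = insert_deduplication_token
        self.chunk_dedup_seqnum = 0   # ordinal of the chunk within this INSERT

    def block_dedup_token(self) -> str:
        if not self.token:
            return ""                 # fall back to content-based block ids
        token = f"{self.token}_{self.chunk_dedup_seqnum}"
        self.chunk_dedup_seqnum += 1
        return token

sink = InsertSink("my-insert-token")
print([sink.block_dedup_token() for _ in range(3)])
# ['my-insert-token_0', 'my-insert-token_1', 'my-insert-token_2']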
- block_id = part->getZeroLevelPartBlockID(); + String block_dedup_token = context->getSettingsRef().insert_deduplication_token; + if (!block_dedup_token.empty()) + { + /// multiple blocks can be inserted within the same insert query + /// an ordinal number is added to dedup token to generate a distinctive block id for each block + block_dedup_token += fmt::format("_{}", chunk_dedup_seqnum); + ++chunk_dedup_seqnum; + } + block_id = part->getZeroLevelPartBlockID(block_dedup_token); LOG_DEBUG(log, "Wrote block with ID '{}', {} rows", block_id, current_block.block.rows()); } else diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.h b/src/Storages/MergeTree/ReplicatedMergeTreeSink.h index 7df82fd397e..300791ff25b 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.h @@ -82,13 +82,14 @@ private: bool is_attach = false; bool quorum_parallel = false; - bool deduplicate = true; + const bool deduplicate = true; bool last_block_is_duplicate = false; using Logger = Poco::Logger; Poco::Logger * log; ContextPtr context; + UInt64 chunk_dedup_seqnum = 0; /// input chunk ordinal number in case of dedup token }; } diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 30355f8de2f..af4d8509b8f 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -309,7 +309,7 @@ NamesAndTypesList StorageDistributed::getVirtuals() const NameAndTypePair("_part_uuid", std::make_shared()), NameAndTypePair("_partition_id", std::make_shared()), NameAndTypePair("_sample_factor", std::make_shared()), - NameAndTypePair("_shard_num", std::make_shared()), + NameAndTypePair("_shard_num", std::make_shared()), /// deprecated }; } @@ -638,7 +638,7 @@ Pipe StorageDistributed::read( void StorageDistributed::read( QueryPlan & query_plan, - const Names & column_names, + const Names &, const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, @@ -668,10 +668,6 @@ void StorageDistributed::read( return; } - bool has_virtual_shard_num_column = std::find(column_names.begin(), column_names.end(), "_shard_num") != column_names.end(); - if (has_virtual_shard_num_column && !isVirtualColumn("_shard_num", storage_snapshot->metadata)) - has_virtual_shard_num_column = false; - StorageID main_table = StorageID::createEmpty(); if (!remote_table_function_ptr) main_table = StorageID{remote_database, remote_table}; @@ -682,8 +678,7 @@ void StorageDistributed::read( header, snapshot_data.objects_by_shard, storage_snapshot, - processed_stage, - has_virtual_shard_num_column); + processed_stage); ClusterProxy::executeQuery( query_plan, header, processed_stage, diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 0cd8ea9e2b7..a7a050783cc 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -65,6 +65,7 @@ namespace ErrorCodes extern const int INCOMPATIBLE_COLUMNS; extern const int CANNOT_STAT; extern const int LOGICAL_ERROR; + extern const int CANNOT_APPEND_TO_FILE; extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } @@ -217,8 +218,33 @@ Strings StorageFile::getPathsList(const String & table_path, const String & user return paths; } +ColumnsDescription StorageFile::getTableStructureFromFileDescriptor(ContextPtr context) +{ + /// If we want to read schema from file descriptor we should create + /// a read buffer from fd, create a checkpoint, read some data required + /// for schema inference, rollback to checkpoint and then 
use the created + /// peekable read buffer on the first read from storage. It's needed because + /// in case of file descriptor we have a stream of data and we cannot + /// start reading data from the beginning after reading some data for + /// schema inference. + auto read_buffer_creator = [&]() + { + /// We will use PeekableReadBuffer to create a checkpoint, so we need a place + /// where we can store the original read buffer. + read_buffer_from_fd = createReadBuffer("", true, getName(), table_fd, compression_method, context); + auto read_buf = std::make_unique(*read_buffer_from_fd); + read_buf->setCheckpoint(); + return read_buf; + }; -ColumnsDescription StorageFile::getTableStructureFromData( + auto columns = readSchemaFromFormat(format_name, format_settings, read_buffer_creator, context, peekable_read_buffer_from_fd); + if (peekable_read_buffer_from_fd) + /// If we have created read buffer in readSchemaFromFormat we should rollback to checkpoint. + assert_cast(peekable_read_buffer_from_fd.get())->rollbackToCheckpoint(); + return columns; +} + +ColumnsDescription StorageFile::getTableStructureFromFile( const String & format, const std::vector & paths, const String & compression_method, @@ -271,8 +297,6 @@ StorageFile::StorageFile(int table_fd_, CommonArguments args) throw Exception("Using file descriptor as source of storage isn't allowed for server daemons", ErrorCodes::DATABASE_ACCESS_DENIED); if (args.format_name == "Distributed") throw Exception("Distributed format is allowed only with explicit file path", ErrorCodes::INCORRECT_FILE_NAME); - if (args.columns.empty()) - throw Exception("Automatic schema inference is not allowed when using file descriptor as source of storage", ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE); is_db_table = false; use_table_fd = true; @@ -285,6 +309,7 @@ StorageFile::StorageFile(const std::string & table_path_, const std::string & us { is_db_table = false; paths = getPathsList(table_path_, user_files_path, args.getContext(), total_bytes_to_read); + is_path_with_globs = paths.size() > 1; path_for_partitioned_write = table_path_; setStorageMetadata(args); } @@ -321,9 +346,15 @@ void StorageFile::setStorageMetadata(CommonArguments args) if (args.format_name == "Distributed" || args.columns.empty()) { - auto columns = getTableStructureFromData(format_name, paths, compression_method, format_settings, args.getContext()); - if (!args.columns.empty() && args.columns != columns) - throw Exception("Table structure and file structure are different", ErrorCodes::INCOMPATIBLE_COLUMNS); + ColumnsDescription columns; + if (use_table_fd) + columns = getTableStructureFromFileDescriptor(args.getContext()); + else + { + columns = getTableStructureFromFile(format_name, paths, compression_method, format_settings, args.getContext()); + if (!args.columns.empty() && args.columns != columns) + throw Exception("Table structure and file structure are different", ErrorCodes::INCOMPATIBLE_COLUMNS); + } storage_metadata.setColumns(columns); } else @@ -394,11 +425,13 @@ public: ContextPtr context_, UInt64 max_block_size_, FilesInfoPtr files_info_, - ColumnsDescription columns_description_) + ColumnsDescription columns_description_, + std::unique_ptr read_buf_) : SourceWithProgress(getBlockForSource(storage_, storage_snapshot_, columns_description_, files_info_)) , storage(std::move(storage_)) , storage_snapshot(storage_snapshot_) , files_info(std::move(files_info_)) + , read_buf(std::move(read_buf_)) , columns_description(std::move(columns_description_)) , context(context_) , 
max_block_size(max_block_size_) @@ -440,7 +473,8 @@ public: } } - read_buf = createReadBuffer(current_path, storage->use_table_fd, storage->getName(), storage->table_fd, storage->compression_method, context); + if (!read_buf) + read_buf = createReadBuffer(current_path, storage->use_table_fd, storage->getName(), storage->table_fd, storage->compression_method, context); auto get_block_for_format = [&]() -> Block { @@ -586,7 +620,7 @@ Pipe StorageFile::read( }; pipes.emplace_back(std::make_shared( - this_ptr, storage_snapshot, context, max_block_size, files_info, get_columns_for_format())); + this_ptr, storage_snapshot, context, max_block_size, files_info, get_columns_for_format(), std::move(peekable_read_buffer_from_fd))); } return Pipe::unitePipes(std::move(pipes)); @@ -602,7 +636,7 @@ public: int table_fd_, bool use_table_fd_, std::string base_path_, - std::vector paths_, + std::string path_, const CompressionMethod compression_method_, const std::optional & format_settings_, const String format_name_, @@ -614,7 +648,7 @@ public: , table_fd(table_fd_) , use_table_fd(use_table_fd_) , base_path(base_path_) - , paths(paths_) + , path(path_) , compression_method(compression_method_) , format_name(format_name_) , format_settings(format_settings_) @@ -631,7 +665,7 @@ public: int table_fd_, bool use_table_fd_, std::string base_path_, - std::vector paths_, + const std::string & path_, const CompressionMethod compression_method_, const std::optional & format_settings_, const String format_name_, @@ -643,7 +677,7 @@ public: , table_fd(table_fd_) , use_table_fd(use_table_fd_) , base_path(base_path_) - , paths(paths_) + , path(path_) , compression_method(compression_method_) , format_name(format_name_) , format_settings(format_settings_) @@ -665,10 +699,8 @@ public: } else { - if (paths.size() != 1) - throw Exception("Table '" + table_name_for_log + "' is in readonly mode because of globs in filepath", ErrorCodes::DATABASE_ACCESS_DENIED); flags |= O_WRONLY | O_APPEND | O_CREAT; - naked_buffer = std::make_unique(paths[0], DBMS_DEFAULT_BUFFER_SIZE, flags); + naked_buffer = std::make_unique(path, DBMS_DEFAULT_BUFFER_SIZE, flags); } /// In case of formats with prefixes if file is not empty we have already written prefix. 
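The StorageFile changes above make schema inference work on a file descriptor by reading a sample through a peekable buffer, checkpointing, and rolling back so the same buffer is reused for the actual read. A rough Python analogue of that trick — io.BufferedReader.peek stands in for PeekableReadBuffer's checkpoint/rollback, and the CSV header parsing is only an illustrative stand-in for format-specific schema inference:

import csv
import io

def infer_columns_from_fd(fd: int, sample_size: int = 64 * 1024):
    """Peek at a non-seekable stream for schema inference without losing data."""
    raw = open(fd, "rb", buffering=0, closefd=False)
    buffered = io.BufferedReader(raw, buffer_size=sample_size)
    sample = buffered.peek(sample_size)            # may return less; fine for a sketch
    header = sample.split(b"\n", 1)[0].decode("utf-8", errors="replace")
    columns = next(csv.reader([header]))           # pretend the format is CSVWithNames
    return columns, buffered                       # reuse `buffered` for the real read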
@@ -708,7 +740,7 @@ private: int table_fd; bool use_table_fd; std::string base_path; - std::vector paths; + std::string path; CompressionMethod compression_method; std::string format_name; std::optional format_settings; @@ -751,7 +783,6 @@ public: { auto partition_path = PartitionedSink::replaceWildcards(path, partition_id); PartitionedSink::validatePartitionKey(partition_path, true); - Strings result_paths = {partition_path}; checkCreationIsAllowed(context, context->getUserFilesPath(), partition_path); return std::make_shared( metadata_snapshot, @@ -759,7 +790,7 @@ public: -1, /* use_table_fd */false, base_path, - result_paths, + partition_path, compression_method, format_settings, format_name, @@ -793,7 +824,6 @@ SinkToStoragePtr StorageFile::write( int flags = 0; - std::string path; if (context->getSettingsRef().engine_file_truncate_on_insert) flags |= O_TRUNC; @@ -814,7 +844,7 @@ SinkToStoragePtr StorageFile::write( std::unique_lock{rwlock, getLockTimeout(context)}, base_path, path_for_partitioned_write, - chooseCompressionMethod(path, compression_method), + chooseCompressionMethod(path_for_partitioned_write, compression_method), format_settings, format_name, context, @@ -822,10 +852,41 @@ SinkToStoragePtr StorageFile::write( } else { + String path; if (!paths.empty()) { - path = paths[0]; + if (is_path_with_globs) + throw Exception("Table '" + getStorageID().getNameForLogs() + "' is in readonly mode because of globs in filepath", ErrorCodes::DATABASE_ACCESS_DENIED); + + path = paths.back(); fs::create_directories(fs::path(path).parent_path()); + + if (!context->getSettingsRef().engine_file_truncate_on_insert && !is_path_with_globs + && !FormatFactory::instance().checkIfFormatSupportAppend(format_name, context, format_settings) && fs::exists(paths.back()) + && fs::file_size(paths.back()) != 0) + { + if (context->getSettingsRef().engine_file_allow_create_multiple_files) + { + auto pos = paths[0].find_first_of('.', paths[0].find_last_of('/')); + size_t index = paths.size(); + String new_path; + do + { + new_path = paths[0].substr(0, pos) + "." + std::to_string(index) + (pos == std::string::npos ? "" : paths[0].substr(pos)); + ++index; + } + while (fs::exists(new_path)); + paths.push_back(new_path); + path = new_path; + } + else + throw Exception( + ErrorCodes::CANNOT_APPEND_TO_FILE, + "Cannot append data in format {} to file, because this format doesn't support appends." 
+ " You can allow to create a new file " + "on each insert by enabling setting engine_file_allow_create_multiple_files", + format_name); + } } return std::make_shared( @@ -835,7 +896,7 @@ SinkToStoragePtr StorageFile::write( table_fd, use_table_fd, base_path, - paths, + path, chooseCompressionMethod(path, compression_method), format_settings, format_name, @@ -881,7 +942,7 @@ void StorageFile::truncate( ContextPtr /* context */, TableExclusiveLockHolder &) { - if (paths.size() != 1) + if (is_path_with_globs) throw Exception("Can't truncate table '" + getStorageID().getNameForLogs() + "' in readonly mode", ErrorCodes::DATABASE_ACCESS_DENIED); if (use_table_fd) @@ -891,11 +952,14 @@ void StorageFile::truncate( } else { - if (!fs::exists(paths[0])) - return; + for (const auto & path : paths) + { + if (!fs::exists(path)) + continue; - if (0 != ::truncate(paths[0].c_str(), 0)) - throwFromErrnoWithPath("Cannot truncate file " + paths[0], paths[0], ErrorCodes::CANNOT_TRUNCATE_FILE); + if (0 != ::truncate(path.c_str(), 0)) + throwFromErrnoWithPath("Cannot truncate file " + path, path, ErrorCodes::CANNOT_TRUNCATE_FILE); + } } } diff --git a/src/Storages/StorageFile.h b/src/Storages/StorageFile.h index ca2f51e5073..e6c92386990 100644 --- a/src/Storages/StorageFile.h +++ b/src/Storages/StorageFile.h @@ -71,7 +71,9 @@ public: bool supportsPartitionBy() const override { return true; } - static ColumnsDescription getTableStructureFromData( + ColumnsDescription getTableStructureFromFileDescriptor(ContextPtr context); + + static ColumnsDescription getTableStructureFromFile( const String & format, const std::vector & paths, const String & compression_method, @@ -120,6 +122,13 @@ private: size_t total_bytes_to_read = 0; String path_for_partitioned_write; + + bool is_path_with_globs = false; + + /// These buffers are needed for schema inference when data source + /// is file descriptor. See getTableStructureFromFileDescriptor. + std::unique_ptr read_buffer_from_fd; + std::unique_ptr peekable_read_buffer_from_fd; }; } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 19d7a38b9ab..d52f20d9340 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -774,7 +774,8 @@ void StorageReplicatedMergeTree::drop() /// or metadata of staled replica were removed manually, /// in this case, has_metadata_in_zookeeper = false, and we also permit to drop the table. - if (has_metadata_in_zookeeper) + bool maybe_has_metadata_in_zookeeper = !has_metadata_in_zookeeper.has_value() || *has_metadata_in_zookeeper; + if (maybe_has_metadata_in_zookeeper) { /// Table can be shut down, restarting thread is not active /// and calling StorageReplicatedMergeTree::getZooKeeper()/getAuxiliaryZooKeeper() won't suffice. @@ -4811,12 +4812,22 @@ bool StorageReplicatedMergeTree::getFakePartCoveringAllPartsInPartition(const St void StorageReplicatedMergeTree::restoreMetadataInZooKeeper() { LOG_INFO(log, "Restoring replica metadata"); + if (!is_readonly) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Replica must be readonly"); - if (!is_readonly || has_metadata_in_zookeeper) - throw Exception(ErrorCodes::LOGICAL_ERROR, "It's a bug: replica is not readonly"); + if (getZooKeeper()->exists(replica_path)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Replica path is present at {} - nothing to restore. 
" + "If you are sure that metadata is lost and that replica path contains some garbage, " + "then use SYSTEM DROP REPLICA query first.", replica_path); + + if (has_metadata_in_zookeeper.has_value() && *has_metadata_in_zookeeper) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Replica has metadata in ZooKeeper: " + "it's either a bug or it's a result of manual intervention to ZooKeeper"); if (are_restoring_replica.exchange(true)) throw Exception(ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED, "Replica restoration in progress"); + SCOPE_EXIT({ are_restoring_replica.store(false); }); auto metadata_snapshot = getInMemoryMetadataPtr(); @@ -4857,8 +4868,6 @@ void StorageReplicatedMergeTree::restoreMetadataInZooKeeper() LOG_INFO(log, "Attached all partitions, starting table"); startup(); - - are_restoring_replica.store(false); } void StorageReplicatedMergeTree::dropPartNoWaitNoThrow(const String & part_name) diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 2d89a8b8aa0..b56e9a2e965 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -322,8 +322,9 @@ private: /// If true, the table is offline and can not be written to it. std::atomic_bool is_readonly {false}; + /// If nullopt - ZooKeeper is not available, so we don't know if there is table metadata. /// If false - ZooKeeper is available, but there is no table metadata. It's safe to drop table in this case. - bool has_metadata_in_zookeeper = true; + std::optional has_metadata_in_zookeeper; static constexpr auto default_zookeeper_name = "default"; String zookeeper_name; diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index b483208028a..cbd919d5b75 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -68,7 +68,7 @@ namespace ErrorCodes extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int S3_ERROR; extern const int UNEXPECTED_EXPRESSION; - extern const int CANNOT_OPEN_FILE; + extern const int DATABASE_ACCESS_DENIED; extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } @@ -82,8 +82,6 @@ public: Impl(Aws::S3::S3Client & client_, const S3::URI & globbed_uri_) : client(client_), globbed_uri(globbed_uri_) { - std::lock_guard lock(mutex); - if (globbed_uri.bucket.find_first_of("*?{") != globbed_uri.bucket.npos) throw Exception("Expression can not have wildcards inside bucket name", ErrorCodes::UNEXPECTED_EXPRESSION); @@ -176,6 +174,37 @@ String StorageS3Source::DisclosedGlobIterator::next() return pimpl->next(); } +class StorageS3Source::KeysIterator::Impl +{ +public: + explicit Impl(const std::vector & keys_) : keys(keys_), keys_iter(keys.begin()) + { + } + + String next() + { + std::lock_guard lock(mutex); + if (keys_iter == keys.end()) + return ""; + auto key = *keys_iter; + ++keys_iter; + return key; + } + +private: + std::mutex mutex; + Strings keys; + Strings::iterator keys_iter; +}; + +StorageS3Source::KeysIterator::KeysIterator(const std::vector & keys_) : pimpl(std::make_shared(keys_)) +{ +} + +String StorageS3Source::KeysIterator::next() +{ + return pimpl->next(); +} Block StorageS3Source::getHeader(Block sample_block, bool with_path_column, bool with_file_column) { @@ -296,6 +325,39 @@ Chunk StorageS3Source::generate() return generate(); } +static bool checkIfObjectExists(const std::shared_ptr & client, const String & bucket, const String & key) +{ + bool is_finished = false; + Aws::S3::Model::ListObjectsV2Request request; + Aws::S3::Model::ListObjectsV2Outcome outcome; + + 
request.SetBucket(bucket); + request.SetPrefix(key); + while (!is_finished) + { + outcome = client->ListObjectsV2(request); + if (!outcome.IsSuccess()) + throw Exception( + ErrorCodes::S3_ERROR, + "Could not list objects in bucket {} with key {}, S3 exception: {}, message: {}", + quoteString(bucket), + quoteString(key), + backQuote(outcome.GetError().GetExceptionName()), + quoteString(outcome.GetError().GetMessage())); + + const auto & result_batch = outcome.GetResult().GetContents(); + for (const auto & obj : result_batch) + { + if (obj.GetKey() == key) + return true; + } + + request.SetContinuationToken(outcome.GetResult().GetNextContinuationToken()); + is_finished = !outcome.GetResult().GetIsTruncated(); + } + + return false; +} class StorageS3Sink : public SinkToStorage { @@ -315,9 +377,6 @@ public: , sample_block(sample_block_) , format_settings(format_settings_) { - if (key.find_first_of("*?{") != std::string::npos) - throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "S3 key '{}' contains globs, so the table is in readonly mode", key); - write_buf = wrapWriteBufferWithCompressionMethod( std::make_unique(client, bucket, key, min_upload_part_size, max_single_part_upload_size), compression_method, 3); writer = FormatFactory::instance().getOutputFormatParallelIfPossible(format, *write_buf, sample_block, context, {}, format_settings); @@ -419,7 +478,6 @@ private: std::optional format_settings; ExpressionActionsPtr partition_by_expr; - String partition_by_column_name; static void validateBucket(const String & str) { @@ -468,6 +526,7 @@ StorageS3::StorageS3( ASTPtr partition_by_) : IStorage(table_id_) , client_auth{uri_, access_key_id_, secret_access_key_, max_connections_, {}, {}} /// Client and settings will be updated later + , keys({uri_.key}) , format_name(format_name_) , max_single_read_retries(max_single_read_retries_) , min_upload_part_size(min_upload_part_size_) @@ -477,6 +536,7 @@ StorageS3::StorageS3( , distributed_processing(distributed_processing_) , format_settings(format_settings_) , partition_by(partition_by_) + , is_key_with_globs(uri_.key.find_first_of("*?{") != std::string::npos) { context_->getGlobalContext()->getRemoteHostFilter().checkURL(uri_.uri); StorageInMemoryMetadata storage_metadata; @@ -484,7 +544,7 @@ StorageS3::StorageS3( updateClientAndAuthSettings(context_, client_auth); if (columns_.empty()) { - auto columns = getTableStructureFromDataImpl(format_name, client_auth, max_single_read_retries_, compression_method, distributed_processing_, format_settings, context_); + auto columns = getTableStructureFromDataImpl(format_name, client_auth, max_single_read_retries_, compression_method, distributed_processing_, is_key_with_globs, format_settings, context_); storage_metadata.setColumns(columns); } else @@ -495,9 +555,8 @@ StorageS3::StorageS3( setInMemoryMetadata(storage_metadata); } -std::shared_ptr StorageS3::createFileIterator(const ClientAuthentication & client_auth, bool distributed_processing, ContextPtr local_context) +std::shared_ptr StorageS3::createFileIterator(const ClientAuthentication & client_auth, const std::vector & keys, bool is_key_with_globs, bool distributed_processing, ContextPtr local_context) { - std::shared_ptr iterator_wrapper{nullptr}; if (distributed_processing) { return std::make_shared( @@ -505,13 +564,23 @@ std::shared_ptr StorageS3::createFileIterator( return callback(); }); } - - /// Iterate through disclosed globs and make a source for each file - auto glob_iterator = std::make_shared(*client_auth.client, client_auth.uri); - return 
std::make_shared([glob_iterator]() + else if (is_key_with_globs) { - return glob_iterator->next(); - }); + /// Iterate through disclosed globs and make a source for each file + auto glob_iterator = std::make_shared(*client_auth.client, client_auth.uri); + return std::make_shared([glob_iterator]() + { + return glob_iterator->next(); + }); + } + else + { + auto keys_iterator = std::make_shared(keys); + return std::make_shared([keys_iterator]() + { + return keys_iterator->next(); + }); + } } Pipe StorageS3::read( @@ -536,7 +605,7 @@ Pipe StorageS3::read( need_file_column = true; } - std::shared_ptr iterator_wrapper = createFileIterator(client_auth, distributed_processing, local_context); + std::shared_ptr iterator_wrapper = createFileIterator(client_auth, keys, is_key_with_globs, distributed_processing, local_context); for (size_t i = 0; i < num_streams; ++i) { @@ -567,8 +636,8 @@ SinkToStoragePtr StorageS3::write(const ASTPtr & query, const StorageMetadataPtr updateClientAndAuthSettings(local_context, client_auth); auto sample_block = metadata_snapshot->getSampleBlock(); - auto chosen_compression_method = chooseCompressionMethod(client_auth.uri.key, compression_method); - bool has_wildcards = client_auth.uri.bucket.find(PARTITION_ID_WILDCARD) != String::npos || client_auth.uri.key.find(PARTITION_ID_WILDCARD) != String::npos; + auto chosen_compression_method = chooseCompressionMethod(keys.back(), compression_method); + bool has_wildcards = client_auth.uri.bucket.find(PARTITION_ID_WILDCARD) != String::npos || keys.back().find(PARTITION_ID_WILDCARD) != String::npos; auto insert_query = std::dynamic_pointer_cast(query); auto partition_by_ast = insert_query ? (insert_query->partition_by ? insert_query->partition_by : partition_by) : nullptr; @@ -585,12 +654,41 @@ SinkToStoragePtr StorageS3::write(const ASTPtr & query, const StorageMetadataPtr chosen_compression_method, client_auth.client, client_auth.uri.bucket, - client_auth.uri.key, + keys.back(), min_upload_part_size, max_single_part_upload_size); } else { + if (is_key_with_globs) + throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, "S3 key '{}' contains globs, so the table is in readonly mode", client_auth.uri.key); + + bool truncate_in_insert = local_context->getSettingsRef().s3_truncate_on_insert; + + if (!truncate_in_insert && checkIfObjectExists(client_auth.client, client_auth.uri.bucket, keys.back())) + { + if (local_context->getSettingsRef().s3_create_new_file_on_insert) + { + size_t index = keys.size(); + auto pos = keys[0].find_first_of('.'); + String new_key; + do + { + new_key = keys[0].substr(0, pos) + "." + std::to_string(index) + (pos == std::string::npos ? "" : keys[0].substr(pos)); + ++index; + } + while (checkIfObjectExists(client_auth.client, client_auth.uri.bucket, new_key)); + keys.push_back(new_key); + } + else + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Object in bucket {} with key {} already exists. 
If you want to overwrite it, enable setting s3_truncate_on_insert, if you " + "want to create a new file on each insert, enable setting s3_create_new_file_on_insert", + client_auth.uri.bucket, + keys.back()); + } + return std::make_shared( format_name, sample_block, @@ -599,7 +697,7 @@ SinkToStoragePtr StorageS3::write(const ASTPtr & query, const StorageMetadataPtr chosen_compression_method, client_auth.client, client_auth.uri.bucket, - client_auth.uri.key, + keys.back(), min_upload_part_size, max_single_part_upload_size); } @@ -610,11 +708,17 @@ void StorageS3::truncate(const ASTPtr & /* query */, const StorageMetadataPtr &, { updateClientAndAuthSettings(local_context, client_auth); - Aws::S3::Model::ObjectIdentifier obj; - obj.SetKey(client_auth.uri.key); + if (is_key_with_globs) + throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, "S3 key '{}' contains globs, so the table is in readonly mode", client_auth.uri.key); Aws::S3::Model::Delete delkeys; - delkeys.AddObjects(std::move(obj)); + + for (const auto & key : keys) + { + Aws::S3::Model::ObjectIdentifier obj; + obj.SetKey(key); + delkeys.AddObjects(std::move(obj)); + } Aws::S3::Model::DeleteObjectsRequest request; request.SetBucket(client_auth.uri.bucket); @@ -734,7 +838,7 @@ ColumnsDescription StorageS3::getTableStructureFromData( { ClientAuthentication client_auth{uri, access_key_id, secret_access_key, max_connections, {}, {}}; updateClientAndAuthSettings(ctx, client_auth); - return getTableStructureFromDataImpl(format, client_auth, max_single_read_retries, compression_method, distributed_processing, format_settings, ctx); + return getTableStructureFromDataImpl(format, client_auth, max_single_read_retries, compression_method, distributed_processing, uri.key.find_first_of("*?{") != std::string::npos, format_settings, ctx); } ColumnsDescription StorageS3::getTableStructureFromDataImpl( @@ -743,12 +847,14 @@ ColumnsDescription StorageS3::getTableStructureFromDataImpl( UInt64 max_single_read_retries, const String & compression_method, bool distributed_processing, + bool is_key_with_globs, const std::optional & format_settings, ContextPtr ctx) { + std::vector keys = {client_auth.uri.key}; auto read_buffer_creator = [&]() { - auto file_iterator = createFileIterator(client_auth, distributed_processing, ctx); + auto file_iterator = createFileIterator(client_auth, keys, is_key_with_globs, distributed_processing, ctx); String current_key = (*file_iterator)(); if (current_key.empty()) throw Exception( diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index edb5e4181f9..b93040c3ee7 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -44,6 +44,18 @@ public: std::shared_ptr pimpl; }; + class KeysIterator + { + public: + explicit KeysIterator(const std::vector & keys_); + String next(); + + private: + class Impl; + /// shared_ptr to have copy constructor + std::shared_ptr pimpl; + }; + using IteratorWrapper = std::function; static Block getHeader(Block sample_block, bool with_path_column, bool with_file_column); @@ -174,6 +186,7 @@ private: }; ClientAuthentication client_auth; + std::vector keys; String format_name; UInt64 max_single_read_retries; @@ -184,10 +197,11 @@ private: const bool distributed_processing; std::optional format_settings; ASTPtr partition_by; + bool is_key_with_globs = false; static void updateClientAndAuthSettings(ContextPtr, ClientAuthentication &); - static std::shared_ptr createFileIterator(const ClientAuthentication & client_auth, bool distributed_processing, ContextPtr local_context); + 
static std::shared_ptr createFileIterator(const ClientAuthentication & client_auth, const std::vector & keys, bool is_key_with_globs, bool distributed_processing, ContextPtr local_context); static ColumnsDescription getTableStructureFromDataImpl( const String & format, @@ -195,6 +209,7 @@ private: UInt64 max_single_read_retries, const String & compression_method, bool distributed_processing, + bool is_key_with_globs, const std::optional & format_settings, ContextPtr ctx); }; diff --git a/src/Storages/StorageSQLite.cpp b/src/Storages/StorageSQLite.cpp index 636bca5d1b0..bc4e2b1dfe8 100644 --- a/src/Storages/StorageSQLite.cpp +++ b/src/Storages/StorageSQLite.cpp @@ -40,7 +40,6 @@ StorageSQLite::StorageSQLite( , WithContext(context_->getGlobalContext()) , remote_table_name(remote_table_name_) , database_path(database_path_) - , global_context(context_) , sqlite_db(sqlite_db_) , log(&Poco::Logger::get("StorageSQLite (" + table_id_.table_name + ")")) { diff --git a/src/Storages/StorageSQLite.h b/src/Storages/StorageSQLite.h index c87d3d00b47..367e6ee9e80 100644 --- a/src/Storages/StorageSQLite.h +++ b/src/Storages/StorageSQLite.h @@ -48,7 +48,6 @@ public: private: String remote_table_name; String database_path; - ContextPtr global_context; SQLitePtr sqlite_db; Poco::Logger * log; }; diff --git a/src/TableFunctions/TableFunctionFile.cpp b/src/TableFunctions/TableFunctionFile.cpp index 6e288f9fa1e..192846f7f11 100644 --- a/src/TableFunctions/TableFunctionFile.cpp +++ b/src/TableFunctions/TableFunctionFile.cpp @@ -38,7 +38,7 @@ ColumnsDescription TableFunctionFile::getActualTableStructure(ContextPtr context { size_t total_bytes_to_read = 0; Strings paths = StorageFile::getPathsList(filename, context->getUserFilesPath(), context, total_bytes_to_read); - return StorageFile::getTableStructureFromData(format, paths, compression_method, std::nullopt, context); + return StorageFile::getTableStructureFromFile(format, paths, compression_method, std::nullopt, context); } return parseColumnsListFromString(structure, context); diff --git a/tests/ci/build_download_helper.py b/tests/ci/build_download_helper.py index 1df96731270..67e1c6ee85d 100644 --- a/tests/ci/build_download_helper.py +++ b/tests/ci/build_download_helper.py @@ -5,36 +5,66 @@ import json import logging import sys import time +from typing import Optional -import requests +import requests # type: ignore from ci_config import CI_CONFIG DOWNLOAD_RETRIES_COUNT = 5 + +def get_with_retries( + url: str, + retries: int = DOWNLOAD_RETRIES_COUNT, + sleep: int = 3, + **kwargs, +) -> requests.Response: + logging.info("Getting URL with %i and sleep %i in between: %s", retries, sleep, url) + exc = None # type: Optional[Exception] + for i in range(DOWNLOAD_RETRIES_COUNT): + try: + response = requests.get(url, **kwargs) + response.raise_for_status() + break + except Exception as e: + if i + 1 < DOWNLOAD_RETRIES_COUNT: + logging.info("Exception '%s' while getting, retry %i", e, i + 1) + time.sleep(sleep) + + exc = e + else: + raise Exception(exc) + + return response + + def get_build_name_for_check(check_name): - return CI_CONFIG['tests_config'][check_name]['required_build'] + return CI_CONFIG["tests_config"][check_name]["required_build"] + def get_build_urls(build_name, reports_path): for root, _, files in os.walk(reports_path): for f in files: - if build_name in f : + if build_name in f: logging.info("Found build report json %s", f) - with open(os.path.join(root, f), 'r', encoding='utf-8') as file_handler: + with open(os.path.join(root, f), "r", 
encoding="utf-8") as file_handler: build_report = json.load(file_handler) - return build_report['build_urls'] + return build_report["build_urls"] return [] + def dowload_build_with_progress(url, path): logging.info("Downloading from %s to temp path %s", url, path) for i in range(DOWNLOAD_RETRIES_COUNT): try: - with open(path, 'wb') as f: - response = requests.get(url, stream=True) - response.raise_for_status() - total_length = response.headers.get('content-length') + with open(path, "wb") as f: + response = get_with_retries(url, retries=1, stream=True) + total_length = response.headers.get("content-length") if total_length is None or int(total_length) == 0: - logging.info("No content-length, will download file without progress") + logging.info( + "No content-length, will download file without progress" + ) f.write(response.content) else: dl = 0 @@ -46,32 +76,38 @@ def dowload_build_with_progress(url, path): if sys.stdout.isatty(): done = int(50 * dl / total_length) percent = int(100 * float(dl) / total_length) - eq_str = '=' * done - space_str = ' ' * (50 - done) + eq_str = "=" * done + space_str = " " * (50 - done) sys.stdout.write(f"\r[{eq_str}{space_str}] {percent}%") sys.stdout.flush() break - except Exception as ex: - sys.stdout.write("\n") - time.sleep(3) - logging.info("Exception while downloading %s, retry %s", ex, i + 1) + except Exception: + if sys.stdout.isatty(): + sys.stdout.write("\n") + if i + 1 < DOWNLOAD_RETRIES_COUNT: + time.sleep(3) + if os.path.exists(path): os.remove(path) else: raise Exception(f"Cannot download dataset from {url}, all retries exceeded") - sys.stdout.write("\n") + if sys.stdout.isatty(): + sys.stdout.write("\n") logging.info("Downloading finished") def download_builds(result_path, build_urls, filter_fn): for url in build_urls: if filter_fn(url): - fname = os.path.basename(url.replace('%2B', '+').replace('%20', ' ')) + fname = os.path.basename(url.replace("%2B", "+").replace("%20", " ")) logging.info("Will download %s to %s", fname, result_path) dowload_build_with_progress(url, os.path.join(result_path, fname)) -def download_builds_filter(check_name, reports_path, result_path, filter_fn=lambda _: True): + +def download_builds_filter( + check_name, reports_path, result_path, filter_fn=lambda _: True +): build_name = get_build_name_for_check(check_name) urls = get_build_urls(build_name, reports_path) print(urls) @@ -81,17 +117,32 @@ def download_builds_filter(check_name, reports_path, result_path, filter_fn=lamb download_builds(result_path, urls, filter_fn) + def download_all_deb_packages(check_name, reports_path, result_path): - download_builds_filter(check_name, reports_path, result_path, lambda x: x.endswith('deb')) + download_builds_filter( + check_name, reports_path, result_path, lambda x: x.endswith("deb") + ) + def download_shared_build(check_name, reports_path, result_path): - download_builds_filter(check_name, reports_path, result_path, lambda x: x.endswith('shared_build.tgz')) + download_builds_filter( + check_name, reports_path, result_path, lambda x: x.endswith("shared_build.tgz") + ) + def download_unit_tests(check_name, reports_path, result_path): - download_builds_filter(check_name, reports_path, result_path, lambda x: x.endswith('unit_tests_dbms')) + download_builds_filter( + check_name, reports_path, result_path, lambda x: x.endswith("unit_tests_dbms") + ) + def download_clickhouse_binary(check_name, reports_path, result_path): - download_builds_filter(check_name, reports_path, result_path, lambda x: x.endswith('clickhouse')) + 
download_builds_filter( + check_name, reports_path, result_path, lambda x: x.endswith("clickhouse") + ) + def download_performance_build(check_name, reports_path, result_path): - download_builds_filter(check_name, reports_path, result_path, lambda x: x.endswith('performance.tgz')) + download_builds_filter( + check_name, reports_path, result_path, lambda x: x.endswith("performance.tgz") + ) diff --git a/tests/ci/cancel_and_rerun_workflow_lambda/app.py b/tests/ci/cancel_and_rerun_workflow_lambda/app.py index b79eb292dc6..ab5702569d4 100644 --- a/tests/ci/cancel_and_rerun_workflow_lambda/app.py +++ b/tests/ci/cancel_and_rerun_workflow_lambda/app.py @@ -5,22 +5,23 @@ import json import time import jwt -import requests -import boto3 +import requests # type: ignore +import boto3 # type: ignore NEED_RERUN_OR_CANCELL_WORKFLOWS = { - 13241696, # PR - 15834118, # Docs - 15516108, # ReleaseCI - 15797242, # BackportPR + "PullRequestCI", + "Docs", + "DocsRelease", + "BackportPR", } # https://docs.github.com/en/rest/reference/actions#cancel-a-workflow-run # -API_URL = 'https://api.github.com/repos/ClickHouse/ClickHouse' +API_URL = "https://api.github.com/repos/ClickHouse/ClickHouse" MAX_RETRY = 5 + def get_installation_id(jwt_token): headers = { "Authorization": f"Bearer {jwt_token}", @@ -29,29 +30,33 @@ def get_installation_id(jwt_token): response = requests.get("https://api.github.com/app/installations", headers=headers) response.raise_for_status() data = response.json() - return data[0]['id'] + return data[0]["id"] + def get_access_token(jwt_token, installation_id): headers = { "Authorization": f"Bearer {jwt_token}", "Accept": "application/vnd.github.v3+json", } - response = requests.post(f"https://api.github.com/app/installations/{installation_id}/access_tokens", headers=headers) + response = requests.post( + f"https://api.github.com/app/installations/{installation_id}/access_tokens", + headers=headers, + ) response.raise_for_status() data = response.json() - return data['token'] + return data["token"] + def get_key_and_app_from_aws(): secret_name = "clickhouse_github_secret_key" session = boto3.session.Session() client = session.client( - service_name='secretsmanager', + service_name="secretsmanager", ) - get_secret_value_response = client.get_secret_value( - SecretId=secret_name - ) - data = json.loads(get_secret_value_response['SecretString']) - return data['clickhouse-app-key'], int(data['clickhouse-app-id']) + get_secret_value_response = client.get_secret_value(SecretId=secret_name) + data = json.loads(get_secret_value_response["SecretString"]) + return data["clickhouse-app-key"], int(data["clickhouse-app-id"]) + def get_token_from_aws(): private_key, app_id = get_key_and_app_from_aws() @@ -65,6 +70,7 @@ def get_token_from_aws(): installation_id = get_installation_id(encoded_jwt) return get_access_token(encoded_jwt, installation_id) + def _exec_get_with_retry(url): for i in range(MAX_RETRY): try: @@ -78,20 +84,25 @@ def _exec_get_with_retry(url): raise Exception("Cannot execute GET request with retries") -WorkflowDescription = namedtuple('WorkflowDescription', - ['run_id', 'status', 'rerun_url', 'cancel_url']) +WorkflowDescription = namedtuple( + "WorkflowDescription", ["run_id", "status", "rerun_url", "cancel_url"] +) def get_workflows_description_for_pull_request(pull_request_event): - head_branch = pull_request_event['head']['ref'] - print("PR", pull_request_event['number'], "has head ref", head_branch) + head_branch = pull_request_event["head"]["ref"] + print("PR", 
pull_request_event["number"], "has head ref", head_branch) workflows_data = [] - workflows = _exec_get_with_retry(API_URL + f"/actions/runs?branch={head_branch}&event=pull_request&page=1") - workflows_data += workflows['workflow_runs'] + workflows = _exec_get_with_retry( + API_URL + f"/actions/runs?branch={head_branch}&event=pull_request&page=1" + ) + workflows_data += workflows["workflow_runs"] i = 2 - while len(workflows['workflow_runs']) > 0: - workflows = _exec_get_with_retry(API_URL + f"/actions/runs?branch={head_branch}&event=pull_request&page={i}") - workflows_data += workflows['workflow_runs'] + while len(workflows["workflow_runs"]) > 0: + workflows = _exec_get_with_retry( + API_URL + f"/actions/runs?branch={head_branch}&event=pull_request&page={i}" + ) + workflows_data += workflows["workflow_runs"] i += 1 if i > 30: print("Too many workflows found") @@ -99,29 +110,37 @@ def get_workflows_description_for_pull_request(pull_request_event): workflow_descriptions = [] for workflow in workflows_data: - # unfortunately we cannot filter workflows from forks in request to API so doing it manually - if (workflow['head_repository']['full_name'] == pull_request_event['head']['repo']['full_name'] - and workflow['workflow_id'] in NEED_RERUN_OR_CANCELL_WORKFLOWS): - workflow_descriptions.append(WorkflowDescription( - run_id=workflow['id'], - status=workflow['status'], - rerun_url=workflow['rerun_url'], - cancel_url=workflow['cancel_url'])) + # unfortunately we cannot filter workflows from forks in request to API + # so doing it manually + if ( + workflow["head_repository"]["full_name"] + == pull_request_event["head"]["repo"]["full_name"] + and workflow["name"] in NEED_RERUN_OR_CANCELL_WORKFLOWS + ): + workflow_descriptions.append( + WorkflowDescription( + run_id=workflow["id"], + status=workflow["status"], + rerun_url=workflow["rerun_url"], + cancel_url=workflow["cancel_url"], + ) + ) return workflow_descriptions + def get_workflow_description(workflow_id): workflow = _exec_get_with_retry(API_URL + f"/actions/runs/{workflow_id}") return WorkflowDescription( - run_id=workflow['id'], - status=workflow['status'], - rerun_url=workflow['rerun_url'], - cancel_url=workflow['cancel_url']) + run_id=workflow["id"], + status=workflow["status"], + rerun_url=workflow["rerun_url"], + cancel_url=workflow["cancel_url"], + ) + def _exec_post_with_retry(url, token): - headers = { - "Authorization": f"token {token}" - } + headers = {"Authorization": f"token {token}"} for i in range(MAX_RETRY): try: response = requests.post(url, headers=headers) @@ -133,32 +152,34 @@ def _exec_post_with_retry(url, token): raise Exception("Cannot execute POST request with retry") + def exec_workflow_url(urls_to_cancel, token): for url in urls_to_cancel: print("Post for workflow workflow using url", url) _exec_post_with_retry(url, token) print("Workflow post finished") + def main(event): token = get_token_from_aws() - event_data = json.loads(event['body']) + event_data = json.loads(event["body"]) - print("Got event for PR", event_data['number']) - action = event_data['action'] - print("Got action", event_data['action']) - pull_request = event_data['pull_request'] - labels = { l['name'] for l in pull_request['labels'] } + print("Got event for PR", event_data["number"]) + action = event_data["action"] + print("Got action", event_data["action"]) + pull_request = event_data["pull_request"] + labels = {label["name"] for label in pull_request["labels"]} print("PR has labels", labels) - if action == 'closed' or 'do not test' in labels: + if 
action == "closed" or "do not test" in labels: print("PR merged/closed or manually labeled 'do not test' will kill workflows") workflow_descriptions = get_workflows_description_for_pull_request(pull_request) urls_to_cancel = [] for workflow_description in workflow_descriptions: - if workflow_description.status != 'completed': + if workflow_description.status != "completed": urls_to_cancel.append(workflow_description.cancel_url) print(f"Found {len(urls_to_cancel)} workflows to cancel") exec_workflow_url(urls_to_cancel, token) - elif action == 'labeled' and 'can be tested' in labels: + elif action == "labeled" and "can be tested" in labels: print("PR marked with can be tested label, rerun workflow") workflow_descriptions = get_workflows_description_for_pull_request(pull_request) if not workflow_descriptions: @@ -168,7 +189,7 @@ def main(event): sorted_workflows = list(sorted(workflow_descriptions, key=lambda x: x.run_id)) most_recent_workflow = sorted_workflows[-1] print("Latest workflow", most_recent_workflow) - if most_recent_workflow.status != 'completed': + if most_recent_workflow.status != "completed": print("Latest workflow is not completed, cancelling") exec_workflow_url([most_recent_workflow.cancel_url], token) print("Cancelled") @@ -176,7 +197,7 @@ def main(event): for _ in range(30): latest_workflow_desc = get_workflow_description(most_recent_workflow.run_id) print("Checking latest workflow", latest_workflow_desc) - if latest_workflow_desc.status in ('completed', 'cancelled'): + if latest_workflow_desc.status in ("completed", "cancelled"): print("Finally latest workflow done, going to rerun") exec_workflow_url([most_recent_workflow.rerun_url], token) print("Rerun finished, exiting") @@ -187,5 +208,6 @@ def main(event): else: print("Nothing to do") + def handler(event, _): main(event) diff --git a/tests/ci/docker_images_check.py b/tests/ci/docker_images_check.py index d698d18a58b..a908f5fe11c 100644 --- a/tests/ci/docker_images_check.py +++ b/tests/ci/docker_images_check.py @@ -6,7 +6,7 @@ import os import shutil import subprocess import time -from typing import List, Tuple +from typing import List, Optional, Set, Tuple, Union from github import Github @@ -24,13 +24,54 @@ NAME = "Push to Dockerhub (actions)" TEMP_PATH = os.path.join(RUNNER_TEMP, "docker_images_check") +class DockerImage: + def __init__( + self, + path: str, + repo: str, + parent: Optional["DockerImage"] = None, + gh_repo_path: str = GITHUB_WORKSPACE, + ): + self.path = path + self.full_path = os.path.join(gh_repo_path, path) + self.repo = repo + self.parent = parent + self.built = False + + def __eq__(self, other) -> bool: # type: ignore + """Is used to check if DockerImage is in a set or not""" + return self.path == other.path and self.repo == self.repo + + def __lt__(self, other) -> bool: + if not isinstance(other, DockerImage): + return False + if self.parent and not other.parent: + return False + if not self.parent and other.parent: + return True + if self.path < other.path: + return True + if self.repo < other.repo: + return True + return False + + def __hash__(self): + return hash(self.path) + + def __str__(self): + return self.repo + + def __repr__(self): + return f"DockerImage(path={self.path},repo={self.repo},parent={self.parent})" + + def get_changed_docker_images( pr_info: PRInfo, repo_path: str, image_file_path: str -) -> List[Tuple[str, str]]: +) -> Set[DockerImage]: images_dict = {} path_to_images_file = os.path.join(repo_path, image_file_path) if os.path.exists(path_to_images_file): - with 
open(path_to_images_file, "r") as dict_file: + with open(path_to_images_file, "rb") as dict_file: images_dict = json.load(dict_file) else: logging.info( @@ -38,7 +79,7 @@ def get_changed_docker_images( ) if not images_dict: - return [] + return set() files_changed = pr_info.changed_files @@ -54,14 +95,15 @@ def get_changed_docker_images( for dockerfile_dir, image_description in images_dict.items(): for f in files_changed: if f.startswith(dockerfile_dir): + name = image_description["name"] logging.info( "Found changed file '%s' which affects " "docker image '%s' with path '%s'", f, - image_description["name"], + name, dockerfile_dir, ) - changed_images.append(dockerfile_dir) + changed_images.append(DockerImage(dockerfile_dir, name)) break # The order is important: dependents should go later than bases, so that @@ -69,14 +111,16 @@ def get_changed_docker_images( index = 0 while index < len(changed_images): image = changed_images[index] - for dependent in images_dict[image]["dependent"]: + for dependent in images_dict[image.path]["dependent"]: logging.info( "Marking docker image '%s' as changed because it " "depends on changed docker image '%s'", dependent, image, ) - changed_images.append(dependent) + changed_images.append( + DockerImage(dependent, images_dict[dependent]["name"], image) + ) index += 1 if index > 5 * len(images_dict): # Sanity check to prevent infinite loop. @@ -84,19 +128,9 @@ def get_changed_docker_images( f"Too many changed docker images, this is a bug. {changed_images}" ) - # If a dependent image was already in the list because its own files - # changed, but then it was added as a dependent of a changed base, we - # must remove the earlier entry so that it doesn't go earlier than its - # base. This way, the dependent will be rebuilt later than the base, and - # will correctly use the updated version of the base. 
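# Illustrative sketch, not part of the patch: the removed comment above describes a
# "keep only the last occurrence" de-duplication, which guarantees that a dependent
# image that was re-added after its changed base is rebuilt after that base. The
# helper below is a self-contained rendition of that idiom; the names are made up.
def dedup_keep_last_occurrence(items):
    seen = set()
    kept_reversed = []
    for item in reversed(items):  # walk from the end so the last occurrence wins
        if item not in seen:
            seen.add(item)
            kept_reversed.append(item)
    return list(reversed(kept_reversed))  # restore the base-before-dependent order

# For example, the loop above may first add "stateless-test" for its own changed
# files and later re-add it as a dependent of the changed "test-base":
assert dedup_keep_last_occurrence(
    ["stateless-test", "test-base", "stateless-test"]
) == ["test-base", "stateless-test"]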
- seen = set() - no_dups_reversed = [] - for x in reversed(changed_images): - if x not in seen: - seen.add(x) - no_dups_reversed.append(x) - - result = [(x, images_dict[x]["name"]) for x in reversed(no_dups_reversed)] + # With reversed changed_images set will use images with parents first, and + # images without parents then + result = set(reversed(changed_images)) logging.info( "Changed docker images for PR %s @ %s: '%s'", pr_info.number, @@ -106,66 +140,109 @@ def get_changed_docker_images( return result +def gen_versions( + pr_info: PRInfo, suffix: Optional[str] +) -> Tuple[List[str], Union[str, List[str]]]: + pr_commit_version = str(pr_info.number) + "-" + pr_info.sha + # The order is important, PR number is used as cache during the build + versions = [str(pr_info.number), pr_commit_version] + result_version = pr_commit_version + if pr_info.number == 0: + # First get the latest for cache + versions.insert(0, "latest") + + if suffix: + # We should build architecture specific images separately and merge a + # manifest lately in a different script + versions = [f"{v}-{suffix}" for v in versions] + # changed_images_{suffix}.json should contain all changed images + result_version = versions + + return versions, result_version + + def build_and_push_one_image( - path_to_dockerfile_folder: str, image_name: str, version_string: str, push: bool + image: DockerImage, + version_string: str, + push: bool, + child: bool, ) -> Tuple[bool, str]: - path = path_to_dockerfile_folder logging.info( "Building docker image %s with version %s from path %s", - image_name, + image.repo, version_string, - path, + image.full_path, ) build_log = os.path.join( - TEMP_PATH, - "build_and_push_log_{}_{}".format( - str(image_name).replace("/", "_"), version_string - ), + TEMP_PATH, f"build_and_push_log_{image.repo.replace('/', '_')}_{version_string}" ) push_arg = "" if push: push_arg = "--push " - with open(build_log, "w") as bl: + from_tag_arg = "" + if child: + from_tag_arg = f"--build-arg FROM_TAG={version_string} " + + with open(build_log, "wb") as bl: cmd = ( "docker buildx build --builder default " - f"--build-arg FROM_TAG={version_string} " + f"{from_tag_arg}" f"--build-arg BUILDKIT_INLINE_CACHE=1 " - f"--tag {image_name}:{version_string} " - f"--cache-from type=registry,ref={image_name}:{version_string} " + f"--tag {image.repo}:{version_string} " + f"--cache-from type=registry,ref={image.repo}:{version_string} " f"{push_arg}" - f"--progress plain {path}" + f"--progress plain {image.full_path}" ) logging.info("Docker command to run: %s", cmd) - retcode = subprocess.Popen(cmd, shell=True, stderr=bl, stdout=bl).wait() + with subprocess.Popen(cmd, shell=True, stderr=bl, stdout=bl) as proc: + retcode = proc.wait() + if retcode != 0: return False, build_log - logging.info("Processing of %s successfully finished", image_name) + logging.info("Processing of %s successfully finished", image.repo) return True, build_log def process_single_image( - versions: List[str], path_to_dockerfile_folder: str, image_name: str, push: bool + image: DockerImage, + versions: List[str], + push: bool, + child: bool, ) -> List[Tuple[str, str, str]]: logging.info("Image will be pushed with versions %s", ", ".join(versions)) result = [] for ver in versions: for i in range(5): - success, build_log = build_and_push_one_image( - path_to_dockerfile_folder, image_name, ver, push - ) + success, build_log = build_and_push_one_image(image, ver, push, child) if success: - result.append((image_name + ":" + ver, build_log, "OK")) + 
result.append((image.repo + ":" + ver, build_log, "OK")) break logging.info( "Got error will retry %s time and sleep for %s seconds", i, i * 5 ) time.sleep(i * 5) else: - result.append((image_name + ":" + ver, build_log, "FAIL")) + result.append((image.repo + ":" + ver, build_log, "FAIL")) logging.info("Processing finished") + image.built = True + return result + + +def process_image_with_parents( + image: DockerImage, versions: List[str], push: bool, child: bool = False +) -> List[Tuple[str, str, str]]: + result = [] # type: List[Tuple[str,str,str]] + if image.built: + return result + + if image.parent is not None: + result += process_image_with_parents(image.parent, versions, push, False) + child = True + + result += process_single_image(image, versions, push, child) return result @@ -182,7 +259,7 @@ def process_test_results( build_url = s3_client.upload_test_report_to_s3( build_log, s3_path_prefix + "/" + os.path.basename(build_log) ) - url_part += '<a href="{}">build_log</a>'.format(build_url) + url_part += f'<a href="{build_url}">build_log</a>' if url_part: test_name = image + " (" + url_part + ")" else: @@ -255,8 +332,6 @@ def main(): shell=True, ) - repo_path = GITHUB_WORKSPACE - if os.path.exists(TEMP_PATH): shutil.rmtree(TEMP_PATH) os.makedirs(TEMP_PATH) @@ -267,43 +342,30 @@ def main(): else: pr_info = PRInfo(need_changed_files=True) - changed_images = get_changed_docker_images(pr_info, repo_path, "docker/images.json") - logging.info( - "Has changed images %s", ", ".join([str(image[0]) for image in changed_images]) + changed_images = get_changed_docker_images( + pr_info, GITHUB_WORKSPACE, "docker/images.json" ) - pr_commit_version = str(pr_info.number) + "-" + pr_info.sha - # The order is important, PR number is used as cache during the build - versions = [str(pr_info.number), pr_commit_version] - result_version = pr_commit_version - if pr_info.number == 0: - # First get the latest for cache - versions.insert(0, "latest") + logging.info("Has changed images %s", ", ".join([im.path for im in changed_images])) - if args.suffix: - # We should build architecture specific images separately and merge a - # manifest lately in a different script - versions = [f"{v}-{args.suffix}" for v in versions] - # changed_images_{suffix}.json should contain all changed images - result_version = versions + image_versions, result_version = gen_versions(pr_info, args.suffix) result_images = {} images_processing_result = [] - for rel_path, image_name in changed_images: - full_path = os.path.join(repo_path, rel_path) - images_processing_result += process_single_image( - versions, full_path, image_name, push + for image in changed_images: + images_processing_result += process_image_with_parents( + image, image_versions, push ) - result_images[image_name] = result_version + result_images[image.repo] = result_version if changed_images: - description = "Updated " + ",".join([im[1] for im in changed_images]) + description = "Updated " + ",".join([im.repo for im in changed_images]) else: description = "Nothing to update" if len(description) >= 140: description = description[:136] + "..."
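# Illustrative sketch, not part of the patch: the parent-first, build-once pattern
# that the process_image_with_parents helper added above relies on. "Node" merely
# stands in for DockerImage; all names and values here are invented for the example.
class Node:
    def __init__(self, name, parent=None):
        self.name = name
        self.parent = parent
        self.built = False

def build_with_parents(node, build_order):
    """Build the whole parent chain before the node itself, each node only once."""
    if node.built:
        return
    if node.parent is not None:
        build_with_parents(node.parent, build_order)
    build_order.append(node.name)  # stands in for the real docker buildx invocation
    node.built = True

base = Node("test-base")
child = Node("stateless-test", parent=base)
order = []
for node in (child, base, child):  # the order in which builds are requested is irrelevant
    build_with_parents(node, order)
assert order == ["test-base", "stateless-test"]  # parent built first, nothing rebuilt twice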
- with open(changed_json, "w") as images_file: + with open(changed_json, "w", encoding="utf-8") as images_file: json.dump(result_images, images_file) s3_helper = S3Helper("https://s3.amazonaws.com") @@ -317,8 +379,8 @@ def main(): url = upload_results(s3_helper, pr_info.number, pr_info.sha, test_results, [], NAME) - print("::notice ::Report url: {}".format(url)) - print('::set-output name=url_output::"{}"'.format(url)) + print(f"::notice ::Report url: {url}") + print(f'::set-output name=url_output::"{url}"') if args.no_reports: return diff --git a/tests/ci/docker_test.py b/tests/ci/docker_test.py new file mode 100644 index 00000000000..4392641b215 --- /dev/null +++ b/tests/ci/docker_test.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python + +import os +import unittest +from unittest.mock import patch + +from pr_info import PRInfo +import docker_images_check as di + +# di.logging.basicConfig(level=di.logging.INFO) + + +class TestDockerImageCheck(unittest.TestCase): + docker_images_path = os.path.join( + os.path.dirname(__file__), "tests/docker_images.json" + ) + + def test_get_changed_docker_images(self): + pr_info = PRInfo(PRInfo.default_event.copy()) + pr_info.changed_files = { + "docker/test/stateless", + "docker/test/base", + "docker/docs/builder", + } + images = sorted( + list(di.get_changed_docker_images(pr_info, "/", self.docker_images_path)) + ) + self.maxDiff = None + expected = sorted( + [ + di.DockerImage("docker/test/base", "clickhouse/test-base"), + di.DockerImage("docker/docs/builder", "clickhouse/docs-builder"), + di.DockerImage( + "docker/test/stateless", + "clickhouse/stateless-test", + "clickhouse/test-base", + ), + di.DockerImage( + "docker/test/integration/base", + "clickhouse/integration-test", + "clickhouse/test-base", + ), + di.DockerImage( + "docker/test/fuzzer", "clickhouse/fuzzer", "clickhouse/test-base" + ), + di.DockerImage( + "docker/test/keeper-jepsen", + "clickhouse/keeper-jepsen-test", + "clickhouse/test-base", + ), + di.DockerImage( + "docker/docs/check", + "clickhouse/docs-check", + "clickhouse/docs-builder", + ), + di.DockerImage( + "docker/docs/release", + "clickhouse/docs-release", + "clickhouse/docs-builder", + ), + di.DockerImage( + "docker/test/stateful", + "clickhouse/stateful-test", + "clickhouse/stateless-test", + ), + di.DockerImage( + "docker/test/unit", + "clickhouse/unit-test", + "clickhouse/stateless-test", + ), + di.DockerImage( + "docker/test/stress", + "clickhouse/stress-test", + "clickhouse/stateful-test", + ), + ] + ) + self.assertEqual(images, expected) + + def test_gen_version(self): + pr_info = PRInfo(PRInfo.default_event.copy()) + versions, result_version = di.gen_versions(pr_info, None) + self.assertEqual(versions, ["latest", "0", "0-HEAD"]) + self.assertEqual(result_version, "0-HEAD") + versions, result_version = di.gen_versions(pr_info, "suffix") + self.assertEqual(versions, ["latest-suffix", "0-suffix", "0-HEAD-suffix"]) + self.assertEqual(result_version, versions) + pr_info.number = 1 + versions, result_version = di.gen_versions(pr_info, None) + self.assertEqual(versions, ["1", "1-HEAD"]) + self.assertEqual(result_version, "1-HEAD") + + @patch("builtins.open") + @patch("subprocess.Popen") + def test_build_and_push_one_image(self, mock_popen, mock_open): + mock_popen.return_value.__enter__.return_value.wait.return_value = 0 + image = di.DockerImage("path", "name", gh_repo_path="") + + result, _ = di.build_and_push_one_image(image, "version", True, True) + mock_open.assert_called_once() + mock_popen.assert_called_once() + self.assertIn( + 
"docker buildx build --builder default --build-arg FROM_TAG=version " + "--build-arg BUILDKIT_INLINE_CACHE=1 --tag name:version --cache-from " + "type=registry,ref=name:version --push --progress plain path", + mock_popen.call_args.args, + ) + self.assertTrue(result) + + mock_open.reset() + mock_popen.reset() + mock_popen.return_value.__enter__.return_value.wait.return_value = 0 + result, _ = di.build_and_push_one_image(image, "version2", False, True) + self.assertIn( + "docker buildx build --builder default --build-arg FROM_TAG=version2 " + "--build-arg BUILDKIT_INLINE_CACHE=1 --tag name:version2 --cache-from " + "type=registry,ref=name:version2 --progress plain path", + mock_popen.call_args.args, + ) + self.assertTrue(result) + + mock_popen.return_value.__enter__.return_value.wait.return_value = 1 + result, _ = di.build_and_push_one_image(image, "version2", False, False) + self.assertIn( + "docker buildx build --builder default " + "--build-arg BUILDKIT_INLINE_CACHE=1 --tag name:version2 --cache-from " + "type=registry,ref=name:version2 --progress plain path", + mock_popen.call_args.args, + ) + self.assertFalse(result) + + @patch("docker_images_check.build_and_push_one_image") + def test_process_image_with_parents(self, mock_build): + mock_build.side_effect = lambda w, x, y, z: (True, f"{w.repo}_{x}.log") + im1 = di.DockerImage("path1", "repo1") + im2 = di.DockerImage("path2", "repo2", im1) + im3 = di.DockerImage("path3", "repo3", im2) + im4 = di.DockerImage("path4", "repo4", im1) + # We use list to have determined order of image builgings + images = [im4, im1, im3, im2, im1] + results = [ + di.process_image_with_parents(im, ["v1", "v2", "latest"], True) + for im in images + ] + + expected = [ + [ # repo4 -> repo1 + ("repo1:v1", "repo1_v1.log", "OK"), + ("repo1:v2", "repo1_v2.log", "OK"), + ("repo1:latest", "repo1_latest.log", "OK"), + ("repo4:v1", "repo4_v1.log", "OK"), + ("repo4:v2", "repo4_v2.log", "OK"), + ("repo4:latest", "repo4_latest.log", "OK"), + ], + [], # repo1 is built + [ # repo3 -> repo2 -> repo1 + ("repo2:v1", "repo2_v1.log", "OK"), + ("repo2:v2", "repo2_v2.log", "OK"), + ("repo2:latest", "repo2_latest.log", "OK"), + ("repo3:v1", "repo3_v1.log", "OK"), + ("repo3:v2", "repo3_v2.log", "OK"), + ("repo3:latest", "repo3_latest.log", "OK"), + ], + [], # repo2 -> repo1 are built + [], # repo1 is built + ] + self.assertEqual(results, expected) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/ci/metrics_lambda/app.py b/tests/ci/metrics_lambda/app.py index 1b63af30b59..0d84e7690a8 100644 --- a/tests/ci/metrics_lambda/app.py +++ b/tests/ci/metrics_lambda/app.py @@ -174,6 +174,7 @@ def group_runners_by_tag(listed_runners): "fuzzer-unit-tester", "stress-tester", "style-checker", + "style-checker-aarch64", ] for runner in listed_runners: for tag in runner.tags: diff --git a/tests/ci/performance_comparison_check.py b/tests/ci/performance_comparison_check.py index 90c5034bfa7..ea2f3c5196a 100644 --- a/tests/ci/performance_comparison_check.py +++ b/tests/ci/performance_comparison_check.py @@ -17,6 +17,7 @@ from get_robot_token import get_best_robot_token from docker_pull_helper import get_image_with_version from commit_status_helper import get_commit, post_commit_status from tee_popen import TeePopen +from rerun_helper import RerunHelper IMAGE_NAME = 'clickhouse/performance-comparison' @@ -82,6 +83,11 @@ if __name__ == "__main__": else: check_name_with_group = check_name + rerun_helper = RerunHelper(gh, pr_info, check_name_with_group) + if 
rerun_helper.is_already_finished_by_status(): + logging.info("Check is already finished according to github status, exiting") + sys.exit(0) + docker_image = get_image_with_version(reports_path, IMAGE_NAME) #with RamDrive(ramdrive_path, ramdrive_size): diff --git a/tests/ci/pr_info.py b/tests/ci/pr_info.py index a155786d815..1a6ed5645de 100644 --- a/tests/ci/pr_info.py +++ b/tests/ci/pr_info.py @@ -2,26 +2,51 @@ import json import os -import requests # type: ignore from unidiff import PatchSet # type: ignore -from env_helper import GITHUB_REPOSITORY, GITHUB_SERVER_URL, GITHUB_RUN_ID, GITHUB_EVENT_PATH +from build_download_helper import get_with_retries +from env_helper import ( + GITHUB_REPOSITORY, + GITHUB_SERVER_URL, + GITHUB_RUN_ID, + GITHUB_EVENT_PATH, +) + +DIFF_IN_DOCUMENTATION_EXT = [ + ".html", + ".md", + ".yml", + ".txt", + ".css", + ".js", + ".xml", + ".ico", + ".conf", + ".svg", + ".png", + ".jpg", + ".py", + ".sh", + ".json", +] +RETRY_SLEEP = 0 -DIFF_IN_DOCUMENTATION_EXT = [".html", ".md", ".yml", ".txt", ".css", ".js", ".xml", ".ico", ".conf", ".svg", ".png", - ".jpg", ".py", ".sh", ".json"] def get_pr_for_commit(sha, ref): - try_get_pr_url = f"https://api.github.com/repos/{GITHUB_REPOSITORY}/commits/{sha}/pulls" + if not ref: + return None + try_get_pr_url = ( + f"https://api.github.com/repos/{GITHUB_REPOSITORY}/commits/{sha}/pulls" + ) try: - response = requests.get(try_get_pr_url) - response.raise_for_status() + response = get_with_retries(try_get_pr_url, sleep=RETRY_SLEEP) data = response.json() if len(data) > 1: print("Got more than one pr for commit", sha) for pr in data: # refs for pushes looks like refs/head/XX # refs for RPs looks like XX - if pr['head']['ref'] in ref: + if pr["head"]["ref"] in ref: return pr print("Cannot find PR with required ref", ref, "returning first one") first_pr = data[0] @@ -32,38 +57,58 @@ def get_pr_for_commit(sha, ref): class PRInfo: - def __init__(self, github_event=None, need_orgs=False, need_changed_files=False, labels_from_api=False): + default_event = { + "commits": 1, + "before": "HEAD~", + "after": "HEAD", + "ref": None, + } + + def __init__( + self, + github_event=None, + need_orgs=False, + need_changed_files=False, + pr_event_from_api=False, + ): if not github_event: if GITHUB_EVENT_PATH: - with open(GITHUB_EVENT_PATH, 'r', encoding='utf-8') as event_file: + with open(GITHUB_EVENT_PATH, "r", encoding="utf-8") as event_file: github_event = json.load(event_file) else: - github_event = { - 'commits': 1, - 'before': 'HEAD~', - 'after': 'HEAD', - 'ref': None, - } + github_event = PRInfo.default_event.copy() self.event = github_event self.changed_files = set([]) self.body = "" ref = github_event.get("ref", "refs/head/master") - if ref.startswith('refs/heads/'): + if ref and ref.startswith("refs/heads/"): ref = ref[11:] # workflow completed event, used for PRs only - if 'action' in github_event and github_event['action'] == 'completed': - self.sha = github_event['workflow_run']['head_sha'] - prs_for_sha = requests.get(f"https://api.github.com/repos/{GITHUB_REPOSITORY}/commits/{self.sha}/pulls").json() + if "action" in github_event and github_event["action"] == "completed": + self.sha = github_event["workflow_run"]["head_sha"] + prs_for_sha = get_with_retries( + f"https://api.github.com/repos/{GITHUB_REPOSITORY}/commits/{self.sha}" + "/pulls", + sleep=RETRY_SLEEP, + ).json() if len(prs_for_sha) != 0: - github_event['pull_request'] = prs_for_sha[0] + github_event["pull_request"] = prs_for_sha[0] - if 'pull_request' in github_event: # pull 
request and other similar events - self.number = github_event['pull_request']['number'] - if 'after' in github_event: - self.sha = github_event['after'] + if "pull_request" in github_event: # pull request and other similar events + self.number = github_event["pull_request"]["number"] + if pr_event_from_api: + response = get_with_retries( + f"https://api.github.com/repos/{GITHUB_REPOSITORY}" + f"/pulls/{self.number}", + sleep=RETRY_SLEEP, + ) + github_event["pull_request"] = response.json() + + if "after" in github_event: + self.sha = github_event["after"] else: - self.sha = github_event['pull_request']['head']['sha'] + self.sha = github_event["pull_request"]["head"]["sha"] repo_prefix = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}" self.task_url = f"{repo_prefix}/actions/runs/{GITHUB_RUN_ID or '0'}" @@ -72,35 +117,35 @@ class PRInfo: self.commit_html_url = f"{repo_prefix}/commits/{self.sha}" self.pr_html_url = f"{repo_prefix}/pull/{self.number}" - self.base_ref = github_event['pull_request']['base']['ref'] - self.base_name = github_event['pull_request']['base']['repo']['full_name'] - self.head_ref = github_event['pull_request']['head']['ref'] - self.head_name = github_event['pull_request']['head']['repo']['full_name'] - self.body = github_event['pull_request']['body'] + self.base_ref = github_event["pull_request"]["base"]["ref"] + self.base_name = github_event["pull_request"]["base"]["repo"]["full_name"] + self.head_ref = github_event["pull_request"]["head"]["ref"] + self.head_name = github_event["pull_request"]["head"]["repo"]["full_name"] + self.body = github_event["pull_request"]["body"] + self.labels = { + label["name"] for label in github_event["pull_request"]["labels"] + } - if labels_from_api: - response = requests.get(f"https://api.github.com/repos/{GITHUB_REPOSITORY}/issues/{self.number}/labels") - self.labels = {l['name'] for l in response.json()} - else: - self.labels = {l['name'] for l in github_event['pull_request']['labels']} - - self.user_login = github_event['pull_request']['user']['login'] + self.user_login = github_event["pull_request"]["user"]["login"] self.user_orgs = set([]) if need_orgs: - user_orgs_response = requests.get(github_event['pull_request']['user']['organizations_url']) + user_orgs_response = get_with_retries( + github_event["pull_request"]["user"]["organizations_url"], + sleep=RETRY_SLEEP, + ) if user_orgs_response.ok: response_json = user_orgs_response.json() - self.user_orgs = set(org['id'] for org in response_json) + self.user_orgs = set(org["id"] for org in response_json) - self.diff_url = github_event['pull_request']['diff_url'] - elif 'commits' in github_event: - self.sha = github_event['after'] - pull_request = get_pr_for_commit(self.sha, github_event['ref']) + self.diff_url = github_event["pull_request"]["diff_url"] + elif "commits" in github_event: + self.sha = github_event["after"] + pull_request = get_pr_for_commit(self.sha, github_event["ref"]) repo_prefix = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}" self.task_url = f"{repo_prefix}/actions/runs/{GITHUB_RUN_ID or '0'}" self.commit_html_url = f"{repo_prefix}/commits/{self.sha}" self.repo_full_name = GITHUB_REPOSITORY - if pull_request is None or pull_request['state'] == 'closed': + if pull_request is None or pull_request["state"] == "closed": # it's merged PR to master self.number = 0 self.labels = {} @@ -109,25 +154,25 @@ class PRInfo: self.base_name = self.repo_full_name self.head_ref = ref self.head_name = self.repo_full_name - self.diff_url = \ - 
f"https://api.github.com/repos/{GITHUB_REPOSITORY}/compare/{github_event['before']}...{self.sha}" + self.diff_url = ( + f"https://api.github.com/repos/{GITHUB_REPOSITORY}/" + f"compare/{github_event['before']}...{self.sha}" + ) else: - self.number = pull_request['number'] - if labels_from_api: - response = requests.get(f"https://api.github.com/repos/{GITHUB_REPOSITORY}/issues/{self.number}/labels") - self.labels = {l['name'] for l in response.json()} - else: - self.labels = {l['name'] for l in pull_request['labels']} + self.labels = {label["name"] for label in pull_request["labels"]} - self.base_ref = pull_request['base']['ref'] - self.base_name = pull_request['base']['repo']['full_name'] - self.head_ref = pull_request['head']['ref'] - self.head_name = pull_request['head']['repo']['full_name'] - self.pr_html_url = pull_request['html_url'] - if 'pr-backport' in self.labels: - self.diff_url = f"https://github.com/{GITHUB_REPOSITORY}/compare/master...{self.head_ref}.diff" + self.base_ref = pull_request["base"]["ref"] + self.base_name = pull_request["base"]["repo"]["full_name"] + self.head_ref = pull_request["head"]["ref"] + self.head_name = pull_request["head"]["repo"]["full_name"] + self.pr_html_url = pull_request["html_url"] + if "pr-backport" in self.labels: + self.diff_url = ( + f"https://github.com/{GITHUB_REPOSITORY}/" + f"compare/master...{self.head_ref}.diff" + ) else: - self.diff_url = pull_request['diff_url'] + self.diff_url = pull_request["diff_url"] else: print(json.dumps(github_event, sort_keys=True, indent=4)) self.sha = os.getenv("GITHUB_SHA") @@ -150,24 +195,27 @@ class PRInfo: if not self.diff_url: raise Exception("Diff URL cannot be find for event") - response = requests.get(self.diff_url) + response = get_with_retries( + self.diff_url, + sleep=RETRY_SLEEP, + ) response.raise_for_status() - if 'commits' in self.event and self.number == 0: + if "commits" in self.event and self.number == 0: diff = response.json() - if 'files' in diff: - self.changed_files = [f['filename'] for f in diff['files']] + if "files" in diff: + self.changed_files = [f["filename"] for f in diff["files"]] else: diff_object = PatchSet(response.text) self.changed_files = {f.path for f in diff_object} def get_dict(self): return { - 'sha': self.sha, - 'number': self.number, - 'labels': self.labels, - 'user_login': self.user_login, - 'user_orgs': self.user_orgs, + "sha": self.sha, + "number": self.number, + "labels": self.labels, + "user_login": self.user_login, + "user_orgs": self.user_orgs, } def has_changes_in_documentation(self): @@ -178,49 +226,63 @@ class PRInfo: for f in self.changed_files: _, ext = os.path.splitext(f) - path_in_docs = 'docs' in f - path_in_website = 'website' in f - if (ext in DIFF_IN_DOCUMENTATION_EXT and (path_in_docs or path_in_website)) or 'docker/docs' in f: + path_in_docs = "docs" in f + path_in_website = "website" in f + if ( + ext in DIFF_IN_DOCUMENTATION_EXT and (path_in_docs or path_in_website) + ) or "docker/docs" in f: return True return False def can_skip_builds_and_use_version_from_master(self): - if 'force tests' in self.labels: + # TODO: See a broken loop + if "force tests" in self.labels: return False if self.changed_files is None or not self.changed_files: return False for f in self.changed_files: - if (not f.startswith('tests/queries') - or not f.startswith('tests/integration') - or not f.startswith('tests/performance')): + # TODO: this logic is broken, should be fixed before using + if ( + not f.startswith("tests/queries") + or not f.startswith("tests/integration") 
+ or not f.startswith("tests/performance") + ): return False return True def can_skip_integration_tests(self): - if 'force tests' in self.labels: + # TODO: See a broken loop + if "force tests" in self.labels: return False if self.changed_files is None or not self.changed_files: return False for f in self.changed_files: - if not f.startswith('tests/queries') or not f.startswith('tests/performance'): + # TODO: this logic is broken, should be fixed before using + if not f.startswith("tests/queries") or not f.startswith( + "tests/performance" + ): return False return True def can_skip_functional_tests(self): - if 'force tests' in self.labels: + # TODO: See a broken loop + if "force tests" in self.labels: return False if self.changed_files is None or not self.changed_files: return False for f in self.changed_files: - if not f.startswith('tests/integration') or not f.startswith('tests/performance'): + # TODO: this logic is broken, should be fixed before using + if not f.startswith("tests/integration") or not f.startswith( + "tests/performance" + ): return False return True diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 3e93d240256..452824d58be 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -204,7 +204,7 @@ def check_pr_description(pr_info): if __name__ == "__main__": logging.basicConfig(level=logging.INFO) - pr_info = PRInfo(need_orgs=True, labels_from_api=True) + pr_info = PRInfo(need_orgs=True, pr_event_from_api=True) can_run, description = should_run_checks_for_pr(pr_info) gh = Github(get_best_robot_token()) commit = get_commit(gh, pr_info.sha) @@ -212,6 +212,9 @@ if __name__ == "__main__": description_report = check_pr_description(pr_info)[:139] if description_report: print("::notice ::Cannot run, description does not match the template") + logging.info( + "PR body doesn't match the template: (start)\n%s\n(end)", pr_info.body + ) url = ( f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/" "blob/master/.github/PULL_REQUEST_TEMPLATE.md?plain=1" diff --git a/tests/ci/tests/docker_images.json b/tests/ci/tests/docker_images.json new file mode 100644 index 00000000000..354bdaa8728 --- /dev/null +++ b/tests/ci/tests/docker_images.json @@ -0,0 +1,166 @@ +{ + "docker/packager/deb": { + "name": "clickhouse/deb-builder", + "dependent": [] + }, + "docker/packager/binary": { + "name": "clickhouse/binary-builder", + "dependent": [ + "docker/test/split_build_smoke_test", + "docker/test/pvs", + "docker/test/codebrowser" + ] + }, + "docker/test/compatibility/centos": { + "name": "clickhouse/test-old-centos", + "dependent": [] + }, + "docker/test/compatibility/ubuntu": { + "name": "clickhouse/test-old-ubuntu", + "dependent": [] + }, + "docker/test/integration/base": { + "name": "clickhouse/integration-test", + "dependent": [] + }, + "docker/test/fuzzer": { + "name": "clickhouse/fuzzer", + "dependent": [] + }, + "docker/test/performance-comparison": { + "name": "clickhouse/performance-comparison", + "dependent": [] + }, + "docker/test/pvs": { + "name": "clickhouse/pvs-test", + "dependent": [] + }, + "docker/test/util": { + "name": "clickhouse/test-util", + "dependent": [ + "docker/test/base", + "docker/test/fasttest" + ] + }, + "docker/test/stateless": { + "name": "clickhouse/stateless-test", + "dependent": [ + "docker/test/stateful", + "docker/test/unit" + ] + }, + "docker/test/stateful": { + "name": "clickhouse/stateful-test", + "dependent": [ + "docker/test/stress" + ] + }, + "docker/test/unit": { + "name": "clickhouse/unit-test", + "dependent": [] + }, + "docker/test/stress": { 
+ "name": "clickhouse/stress-test", + "dependent": [] + }, + "docker/test/split_build_smoke_test": { + "name": "clickhouse/split-build-smoke-test", + "dependent": [] + }, + "docker/test/codebrowser": { + "name": "clickhouse/codebrowser", + "dependent": [] + }, + "docker/test/integration/runner": { + "name": "clickhouse/integration-tests-runner", + "dependent": [] + }, + "docker/test/testflows/runner": { + "name": "clickhouse/testflows-runner", + "dependent": [] + }, + "docker/test/fasttest": { + "name": "clickhouse/fasttest", + "dependent": [] + }, + "docker/test/style": { + "name": "clickhouse/style-test", + "dependent": [] + }, + "docker/test/integration/s3_proxy": { + "name": "clickhouse/s3-proxy", + "dependent": [] + }, + "docker/test/integration/resolver": { + "name": "clickhouse/python-bottle", + "dependent": [] + }, + "docker/test/integration/helper_container": { + "name": "clickhouse/integration-helper", + "dependent": [] + }, + "docker/test/integration/mysql_golang_client": { + "name": "clickhouse/mysql-golang-client", + "dependent": [] + }, + "docker/test/integration/dotnet_client": { + "name": "clickhouse/dotnet-client", + "dependent": [] + }, + "docker/test/integration/mysql_java_client": { + "name": "clickhouse/mysql-java-client", + "dependent": [] + }, + "docker/test/integration/mysql_js_client": { + "name": "clickhouse/mysql-js-client", + "dependent": [] + }, + "docker/test/integration/mysql_php_client": { + "name": "clickhouse/mysql-php-client", + "dependent": [] + }, + "docker/test/integration/postgresql_java_client": { + "name": "clickhouse/postgresql-java-client", + "dependent": [] + }, + "docker/test/integration/kerberos_kdc": { + "name": "clickhouse/kerberos-kdc", + "dependent": [] + }, + "docker/test/base": { + "name": "clickhouse/test-base", + "dependent": [ + "docker/test/stateless", + "docker/test/integration/base", + "docker/test/fuzzer", + "docker/test/keeper-jepsen" + ] + }, + "docker/test/integration/kerberized_hadoop": { + "name": "clickhouse/kerberized-hadoop", + "dependent": [] + }, + "docker/test/sqlancer": { + "name": "clickhouse/sqlancer-test", + "dependent": [] + }, + "docker/test/keeper-jepsen": { + "name": "clickhouse/keeper-jepsen-test", + "dependent": [] + }, + "docker/docs/builder": { + "name": "clickhouse/docs-builder", + "dependent": [ + "docker/docs/check", + "docker/docs/release" + ] + }, + "docker/docs/check": { + "name": "clickhouse/docs-check", + "dependent": [] + }, + "docker/docs/release": { + "name": "clickhouse/docs-release", + "dependent": [] + } +} diff --git a/tests/ci/workflow_approve_rerun_lambda/app.py b/tests/ci/workflow_approve_rerun_lambda/app.py index be5a3d2c5cc..9cea6db3da2 100644 --- a/tests/ci/workflow_approve_rerun_lambda/app.py +++ b/tests/ci/workflow_approve_rerun_lambda/app.py @@ -22,8 +22,12 @@ SUSPICIOUS_PATTERNS = [ "release", ] +# Number of retries for API calls. MAX_RETRY = 5 -MAX_WORKFLOW_RERUN = 7 + +# Number of times a check can re-run as a whole. +# It is needed, because we are using AWS "spot" instances, that are terminated very frequently. 
+MAX_WORKFLOW_RERUN = 20 WorkflowDescription = namedtuple( "WorkflowDescription", diff --git a/tests/integration/CMakeLists.txt b/tests/integration/CMakeLists.txt index f57ade79471..68c695f57a0 100644 --- a/tests/integration/CMakeLists.txt +++ b/tests/integration/CMakeLists.txt @@ -10,7 +10,7 @@ find_program(PYTEST_CMD pytest) find_program(SUDO_CMD sudo) # will mount only one binary to docker container - build with .so cant work -if(MAKE_STATIC_LIBRARIES AND DOCKER_CMD) +if(USE_STATIC_LIBRARIES AND DOCKER_CMD) if(INTEGRATION_USE_RUNNER AND SUDO_CMD) add_test(NAME integration-runner WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND ${SUDO_CMD} ${CMAKE_CURRENT_SOURCE_DIR}/runner --binary ${ClickHouse_BINARY_DIR}/programs/clickhouse --configs-dir ${ClickHouse_SOURCE_DIR}/programs/server/) message(STATUS "Using tests in docker with runner SUDO=${SUDO_CMD}; DOCKER=${DOCKER_CMD};") diff --git a/tests/integration/ci-runner.py b/tests/integration/ci-runner.py index 6058a332c29..40cb2c6fdd7 100755 --- a/tests/integration/ci-runner.py +++ b/tests/integration/ci-runner.py @@ -64,38 +64,45 @@ def chunks(lst, n): for i in range(0, len(lst), n): yield lst[i:i + n] -def parse_test_results_output(fname): - read = False - description_output = [] - with open(fname, 'r') as out: - for line in out: - if read and line.strip() and not line.startswith('=='): - description_output.append(line.strip()) - if 'short test summary info' in line: - read = True - return description_output - - -def get_counters(output): +def get_counters(fname): counters = { - "ERROR": set([]), - "PASSED": set([]), - "FAILED": set([]), + "ERROR": set([]), + "PASSED": set([]), + "FAILED": set([]), + "SKIPPED": set([]), } - for line in output: - if '.py' in line: + with open(fname, 'r') as out: + for line in out: + line = line.strip() + # Example of log: + # + # test_mysql_protocol/test.py::test_golang_client + # [gw0] [ 7%] ERROR test_mysql_protocol/test.py::test_golang_client + # + # And only the line with test status should be matched + if not('.py::' in line and ' ' in line): + continue + line_arr = line.strip().split(' ') - state = line_arr[0] - test_name = ' '.join(line_arr[1:]) - if ' - ' in test_name: - test_name = test_name[:test_name.find(' - ')] + if len(line_arr) < 2: + logging.debug("Strange line %s", line) + continue + + # Lines like: + # [gw0] [ 7%] ERROR test_mysql_protocol/test.py::test_golang_client + state = line_arr[-2] + test_name = line_arr[-1] + if state in counters: counters[state].add(test_name) else: - logging.info("Strange line %s", line) - else: - logging.info("Strange line %s", line) + # will skip lines line: + # 30.76s call test_host_ip_change/test.py::test_ip_change_drop_dns_cache + # 5.71s teardown test_host_ip_change/test.py::test_user_access_ip_change[node1] + # and similar + logging.debug("Strange state in line %s", line) + return {k: list(v) for k, v in counters.items()} @@ -459,7 +466,12 @@ class ClickhouseIntegrationTestsRunner: test_cmd = ' '.join([test for test in sorted(test_names)]) parallel_cmd = " --parallel {} ".format(num_workers) if num_workers > 0 else "" - cmd = "cd {}/tests/integration && timeout -s 9 1h ./runner {} {} -t {} {} '-rfEp --run-id={} --color=no --durations=0 {}' | tee {}".format( + # -r -- show extra test summary: + # -f -- (f)ailed + # -E -- (E)rror + # -p -- (p)assed + # -s -- (s)kipped + cmd = "cd {}/tests/integration && timeout -s 9 1h ./runner {} {} -t {} {} '-rfEps --run-id={} --color=no --durations=0 {}' | tee {}".format( repo_path, self._get_runner_opts(), image_cmd, 
test_cmd, parallel_cmd, i, _get_deselect_option(self.should_skip_tests()), info_path) log_basename = test_group_str + "_" + str(i) + ".log" @@ -490,8 +502,9 @@ class ClickhouseIntegrationTestsRunner: if os.path.exists(info_path): extra_logs_names.append(info_basename) - lines = parse_test_results_output(info_path) - new_counters = get_counters(lines) + new_counters = get_counters(info_path) + for state, tests in new_counters.items(): + logging.info("Tests with %s state (%s): %s", state, len(tests), tests) times_lines = parse_test_times(info_path) new_tests_times = get_test_times(times_lines) self._update_counters(counters, new_counters) @@ -521,6 +534,7 @@ class ClickhouseIntegrationTestsRunner: for test in tests_in_group: if (test not in counters["PASSED"] and test not in counters["ERROR"] and + test not in counters["SKIPPED"] and test not in counters["FAILED"] and '::' in test): counters["ERROR"].append(test) diff --git a/tests/integration/test_cgroup_limit/__init__.py b/tests/integration/test_cgroup_limit/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_cgroup_limit/test.py b/tests/integration/test_cgroup_limit/test.py new file mode 100644 index 00000000000..c3a92bee032 --- /dev/null +++ b/tests/integration/test_cgroup_limit/test.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 + +import os +import math +import subprocess +from tempfile import NamedTemporaryFile +import pytest + +def run_command_in_container(cmd, *args): + # /clickhouse is mounted by integration tests runner + alternative_binary = os.getenv('CLICKHOUSE_BINARY', '/clickhouse') + if alternative_binary: + args += ( + '--volume', f'{alternative_binary}:/usr/bin/clickhouse', + ) + + return subprocess.check_output(['docker', 'run', '--rm', + *args, + 'ubuntu:20.04', + 'sh', '-c', cmd, + ]) + +def run_with_cpu_limit(cmd, num_cpus, *args): + args += ( + '--cpus', f'{num_cpus}', + ) + return run_command_in_container(cmd, *args) + +def test_cgroup_cpu_limit(): + for num_cpus in (1, 2, 4, 2.8): + result = run_with_cpu_limit('clickhouse local -q "select value from system.settings where name=\'max_threads\'"', num_cpus) + expect_output = (r"\'auto({})\'".format(math.ceil(num_cpus))).encode() + assert result.strip() == expect_output, f"fail for cpu limit={num_cpus}, result={result.strip()}, expect={expect_output}" + +# For manual run +if __name__ == '__main__': + test_cgroup_cpu_limit() diff --git a/tests/integration/test_jemalloc_percpu_arena/test.py b/tests/integration/test_jemalloc_percpu_arena/test.py index bdd0ada966f..6a4522c1b76 100755 --- a/tests/integration/test_jemalloc_percpu_arena/test.py +++ b/tests/integration/test_jemalloc_percpu_arena/test.py @@ -12,7 +12,7 @@ CPU_ID = 4 def run_command_in_container(cmd, *args): - # /clickhouse is mounted by interation tests runner + # /clickhouse is mounted by integration tests runner alternative_binary = os.getenv('CLICKHOUSE_BINARY', '/clickhouse') if alternative_binary: args+=( diff --git a/tests/integration/test_keeper_and_access_storage/__init__.py b/tests/integration/test_keeper_and_access_storage/__init__.py new file mode 100644 index 00000000000..e5a0d9b4834 --- /dev/null +++ b/tests/integration/test_keeper_and_access_storage/__init__.py @@ -0,0 +1 @@ +#!/usr/bin/env python3 diff --git a/tests/integration/test_keeper_and_access_storage/configs/keeper.xml b/tests/integration/test_keeper_and_access_storage/configs/keeper.xml new file mode 100644 index 00000000000..6dd54aebed1 --- /dev/null +++ 
b/tests/integration/test_keeper_and_access_storage/configs/keeper.xml @@ -0,0 +1,36 @@ + + + + 9181 + 1 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + 5000 + trace + 10000 + + + + true + node1 + 1 + 2888 + 1 + + + + + + + /clickhouse/access + + + + + + node1 + 9181 + + + diff --git a/tests/integration/test_keeper_and_access_storage/test.py b/tests/integration/test_keeper_and_access_storage/test.py new file mode 100644 index 00000000000..3a3c7535a85 --- /dev/null +++ b/tests/integration/test_keeper_and_access_storage/test.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 + +import pytest + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) + +node1 = cluster.add_instance('node1', main_configs=['configs/keeper.xml'], stay_alive=True) + +# test that server is able to start +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + +def test_create_replicated(started_cluster): + assert node1.query("SELECT 1") == "1\n" diff --git a/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py b/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py index 1528103e1cb..377a48be7ed 100644 --- a/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py +++ b/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py @@ -1141,14 +1141,14 @@ def materialized_database_support_all_kinds_of_mysql_datatype(clickhouse_node, m `v19` datetime(6) DEFAULT CURRENT_TIMESTAMP(6), `v20` TIMESTAMP DEFAULT CURRENT_TIMESTAMP, `v21` TIMESTAMP(6) DEFAULT CURRENT_TIMESTAMP(6), - /* todo support */ - # `v22` YEAR, - # `v23` TIME, - # `v24` TIME(3), - # `v25` GEOMETRY, + `v22` YEAR, + `v23` TIME, + `v24` TIME(6), + `v25` GEOMETRY, `v26` bit(4), + /* todo support */ # `v27` JSON DEFAULT NULL, - # `v28` set('a', 'c', 'f', 'd', 'e', 'b'), + `v28` set('a', 'c', 'f', 'd', 'e', 'b'), `v29` mediumint(4) unsigned NOT NULL DEFAULT '0', `v30` varbinary(255) DEFAULT NULL COMMENT 'varbinary support', `v31` binary(200) DEFAULT NULL, @@ -1158,8 +1158,9 @@ def materialized_database_support_all_kinds_of_mysql_datatype(clickhouse_node, m """) mysql_node.query(""" - INSERT INTO test_database_datatype.t1 (v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v26, v29, v30, v31, v32) values - (1, 11, 9223372036854775807, -1, 1, 11, 18446744073709551615, -1.1, 1.1, -1.111, 1.111, 1.1111, '2021-10-06', 'text', 'varchar', 'BLOB', '2021-10-06 18:32:57', '2021-10-06 18:32:57.482786', '2021-10-06 18:32:57', '2021-10-06 18:32:57.482786', b'1010', 11, 'varbinary', 'binary', 'RED'); + INSERT INTO test_database_datatype.t1 (v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v28, v29, v30, v31, v32) values + (1, 11, 9223372036854775807, -1, 1, 11, 18446744073709551615, -1.1, 1.1, -1.111, 1.111, 1.1111, '2021-10-06', 'text', 'varchar', 'BLOB', '2021-10-06 18:32:57', + '2021-10-06 18:32:57.482786', '2021-10-06 18:32:57', '2021-10-06 18:32:57.482786', '2021', '838:59:59', '838:59:59.000000', ST_GeometryFromText('point(0.0 0.0)'), b'1010', 'a', 11, 'varbinary', 'binary', 'RED'); """) clickhouse_node.query( "CREATE DATABASE test_database_datatype ENGINE = MaterializeMySQL('{}:3306', 'test_database_datatype', 'root', 'clickhouse')".format( @@ -1167,14 +1168,18 @@ def materialized_database_support_all_kinds_of_mysql_datatype(clickhouse_node, m 
check_query(clickhouse_node, "SELECT name FROM system.tables WHERE database = 'test_database_datatype'", "t1\n") # full synchronization check - check_query(clickhouse_node, "SELECT v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v26, v29, v30, v32 FROM test_database_datatype.t1 FORMAT TSV", - "1\t1\t11\t9223372036854775807\t-1\t1\t11\t18446744073709551615\t-1.1\t1.1\t-1.111\t1.111\t1.1111\t2021-10-06\ttext\tvarchar\tBLOB\t2021-10-06 18:32:57\t2021-10-06 18:32:57.482786\t2021-10-06 18:32:57\t2021-10-06 18:32:57.482786\t10\t11\tvarbinary\tRED\n") + check_query(clickhouse_node, "SELECT v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, hex(v25), v26, v28, v29, v30, v32 FROM test_database_datatype.t1 FORMAT TSV", + "1\t1\t11\t9223372036854775807\t-1\t1\t11\t18446744073709551615\t-1.1\t1.1\t-1.111\t1.111\t1.1111\t2021-10-06\ttext\tvarchar\tBLOB\t2021-10-06 18:32:57\t2021-10-06 18:32:57.482786\t2021-10-06 18:32:57" + + "\t2021-10-06 18:32:57.482786\t2021\t3020399000000\t3020399000000\t00000000010100000000000000000000000000000000000000\t10\t1\t11\tvarbinary\tRED\n") mysql_node.query(""" - INSERT INTO test_database_datatype.t1 (v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v26, v29, v30, v31, v32) values - (2, 22, 9223372036854775807, -2, 2, 22, 18446744073709551615, -2.2, 2.2, -2.22, 2.222, 2.2222, '2021-10-07', 'text', 'varchar', 'BLOB', '2021-10-07 18:32:57', '2021-10-07 18:32:57.482786', '2021-10-07 18:32:57', '2021-10-07 18:32:57.482786', b'1011', 22, 'varbinary', 'binary', 'GREEN' ); + INSERT INTO test_database_datatype.t1 (v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v28, v29, v30, v31, v32) values + (2, 22, 9223372036854775807, -2, 2, 22, 18446744073709551615, -2.2, 2.2, -2.22, 2.222, 2.2222, '2021-10-07', 'text', 'varchar', 'BLOB', '2021-10-07 18:32:57', + '2021-10-07 18:32:57.482786', '2021-10-07 18:32:57', '2021-10-07 18:32:57.482786', '2021', '-838:59:59', '-12:59:58.000001', ST_GeometryFromText('point(120.153576 30.287459)'), b'1011', 'a,c', 22, 'varbinary', 'binary', 'GREEN' ); """) # increment synchronization check - check_query(clickhouse_node, "SELECT v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v26, v29, v30, v32 FROM test_database_datatype.t1 ORDER BY v1 FORMAT TSV", - "1\t1\t11\t9223372036854775807\t-1\t1\t11\t18446744073709551615\t-1.1\t1.1\t-1.111\t1.111\t1.1111\t2021-10-06\ttext\tvarchar\tBLOB\t2021-10-06 18:32:57\t2021-10-06 18:32:57.482786\t2021-10-06 18:32:57\t2021-10-06 18:32:57.482786\t10\t11\tvarbinary\tRED\n" + - "2\t2\t22\t9223372036854775807\t-2\t2\t22\t18446744073709551615\t-2.2\t2.2\t-2.22\t2.222\t2.2222\t2021-10-07\ttext\tvarchar\tBLOB\t2021-10-07 18:32:57\t2021-10-07 18:32:57.482786\t2021-10-07 18:32:57\t2021-10-07 18:32:57.482786\t11\t22\tvarbinary\tGREEN\n") + check_query(clickhouse_node, "SELECT v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, hex(v25), v26, v28, v29, v30, v32 FROM test_database_datatype.t1 FORMAT TSV", + "1\t1\t11\t9223372036854775807\t-1\t1\t11\t18446744073709551615\t-1.1\t1.1\t-1.111\t1.111\t1.1111\t2021-10-06\ttext\tvarchar\tBLOB\t2021-10-06 18:32:57\t2021-10-06 18:32:57.482786\t2021-10-06 18:32:57\t2021-10-06 18:32:57.482786" + + 
"\t2021\t3020399000000\t3020399000000\t00000000010100000000000000000000000000000000000000\t10\t1\t11\tvarbinary\tRED\n" + + "2\t2\t22\t9223372036854775807\t-2\t2\t22\t18446744073709551615\t-2.2\t2.2\t-2.22\t2.222\t2.2222\t2021-10-07\ttext\tvarchar\tBLOB\t2021-10-07 18:32:57\t2021-10-07 18:32:57.482786\t2021-10-07 18:32:57\t2021-10-07 18:32:57.482786" + + "\t2021\t-3020399000000\t-46798000001\t000000000101000000D55C6E30D4095E40DCF0BBE996493E40\t11\t3\t22\tvarbinary\tGREEN\n") diff --git a/tests/integration/test_postgresql_database_engine/test.py b/tests/integration/test_postgresql_database_engine/test.py index 656f655cfb3..7cd632cae6e 100644 --- a/tests/integration/test_postgresql_database_engine/test.py +++ b/tests/integration/test_postgresql_database_engine/test.py @@ -1,5 +1,4 @@ import pytest -import time import psycopg2 from helpers.cluster import ClickHouseCluster @@ -10,10 +9,14 @@ cluster = ClickHouseCluster(__file__) node1 = cluster.add_instance('node1', main_configs=["configs/named_collections.xml"], with_postgres=True) postgres_table_template = """ - CREATE TABLE IF NOT EXISTS {} ( + CREATE TABLE {} ( id Integer NOT NULL, value Integer, PRIMARY KEY (id)) """ +postgres_drop_table_template = """ + DROP TABLE {} + """ + def get_postgres_conn(cluster, database=False): if database == True: conn_string = f"host={cluster.postgres_ip} port={cluster.postgres_port} dbname='test_database' user='postgres' password='mysecretpassword'" @@ -31,6 +34,9 @@ def create_postgres_table(cursor, table_name): # database was specified in connection string cursor.execute(postgres_table_template.format(table_name)) +def drop_postgres_table(cursor, table_name): + # database was specified in connection string + cursor.execute(postgres_drop_table_template.format(table_name)) @pytest.fixture(scope="module") def started_cluster(): @@ -66,6 +72,8 @@ def test_postgres_database_engine_with_postgres_ddl(started_cluster): node1.query("DROP DATABASE test_database") assert 'test_database' not in node1.query('SHOW DATABASES') + drop_postgres_table(cursor, 'test_table') + def test_postgresql_database_engine_with_clickhouse_ddl(started_cluster): conn = get_postgres_conn(started_cluster, True) @@ -92,6 +100,8 @@ def test_postgresql_database_engine_with_clickhouse_ddl(started_cluster): node1.query("DROP DATABASE test_database") assert 'test_database' not in node1.query('SHOW DATABASES') + drop_postgres_table(cursor, 'test_table') + def test_postgresql_database_engine_queries(started_cluster): conn = get_postgres_conn(started_cluster, True) @@ -106,7 +116,7 @@ def test_postgresql_database_engine_queries(started_cluster): node1.query("INSERT INTO test_database.test_table SELECT number, number from numbers(10000)") assert node1.query("SELECT count() FROM test_database.test_table").rstrip() == '10000' - cursor.execute('DROP TABLE test_table;') + drop_postgres_table(cursor, 'test_table') assert 'test_table' not in node1.query('SHOW TABLES FROM test_database') node1.query("DROP DATABASE test_database") @@ -121,7 +131,7 @@ def test_get_create_table_query_with_multidim_arrays(started_cluster): "CREATE DATABASE test_database ENGINE = PostgreSQL('postgres1:5432', 'test_database', 'postgres', 'mysecretpassword')") cursor.execute(""" - CREATE TABLE IF NOT EXISTS array_columns ( + CREATE TABLE array_columns ( b Integer[][][] NOT NULL, c Integer[][][] )""") @@ -144,6 +154,7 @@ def test_get_create_table_query_with_multidim_arrays(started_cluster): node1.query("DROP DATABASE test_database") assert 'test_database' not in node1.query('SHOW 
DATABASES') + drop_postgres_table(cursor, 'array_columns') def test_postgresql_database_engine_table_cache(started_cluster): @@ -187,9 +198,6 @@ def test_postgresql_database_with_schema(started_cluster): conn = get_postgres_conn(started_cluster, True) cursor = conn.cursor() - cursor.execute('DROP SCHEMA IF EXISTS test_schema CASCADE') - cursor.execute('DROP SCHEMA IF EXISTS "test.nice.schema" CASCADE') - cursor.execute('CREATE SCHEMA test_schema') cursor.execute('CREATE TABLE test_schema.table1 (a integer)') cursor.execute('CREATE TABLE test_schema.table2 (a integer)') @@ -207,6 +215,9 @@ def test_postgresql_database_with_schema(started_cluster): assert node1.query("SELECT count() FROM test_database.table1").rstrip() == '10000' node1.query("DROP DATABASE test_database") + cursor.execute('DROP SCHEMA test_schema CASCADE') + cursor.execute('DROP TABLE table3') + def test_predefined_connection_configuration(started_cluster): cursor = started_cluster.postgres_conn.cursor() @@ -218,7 +229,6 @@ def test_predefined_connection_configuration(started_cluster): node1.query("INSERT INTO postgres_database.test_table SELECT number, number from numbers(100)") assert (node1.query(f"SELECT count() FROM postgres_database.test_table").rstrip() == '100') - cursor.execute('DROP SCHEMA IF EXISTS test_schema') cursor.execute('CREATE SCHEMA test_schema') cursor.execute('CREATE TABLE test_schema.test_table (a integer)') @@ -236,6 +246,8 @@ def test_predefined_connection_configuration(started_cluster): node1.query("DROP DATABASE postgres_database") cursor.execute(f'DROP TABLE test_table ') + cursor.execute('DROP SCHEMA IF EXISTS test_schema CASCADE') + if __name__ == '__main__': diff --git a/tests/integration/test_replica_is_active/test.py b/tests/integration/test_replica_is_active/test.py index 14046ea7f7d..f786ff71958 100644 --- a/tests/integration/test_replica_is_active/test.py +++ b/tests/integration/test_replica_is_active/test.py @@ -1,6 +1,7 @@ import pytest from helpers.client import QueryRuntimeException from helpers.cluster import ClickHouseCluster +from ast import literal_eval cluster = ClickHouseCluster(__file__) node1 = cluster.add_instance('node1', with_zookeeper=True) @@ -30,12 +31,12 @@ def start_cluster(): def test_replica_is_active(start_cluster): query_result = node1.query("select replica_is_active from system.replicas where table = 'test_table'") - assert query_result == '{\'node1\':1,\'node2\':1,\'node3\':1}\n' + assert literal_eval(query_result) == {'node1': 1, 'node2': 1, 'node3': 1} node3.stop() query_result = node1.query("select replica_is_active from system.replicas where table = 'test_table'") - assert query_result == '{\'node1\':1,\'node2\':1,\'node3\':0}\n' + assert literal_eval(query_result) == {'node1': 1, 'node2': 1, 'node3': 0} node2.stop() query_result = node1.query("select replica_is_active from system.replicas where table = 'test_table'") - assert query_result == '{\'node1\':1,\'node2\':0,\'node3\':0}\n' + assert literal_eval(query_result) == {'node1': 1, 'node2': 0, 'node3': 0} diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 3c7104749a9..b0836a38c9e 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -366,6 +366,43 @@ def test_hdfs_directory_not_exist(started_cluster): node1.query(ddl) assert "" == node1.query("select * from HDFSStorageWithNotExistDir") +def test_overwrite(started_cluster): + hdfs_api = started_cluster.hdfs_api + + table_function = 
f"hdfs('hdfs://hdfs1:9000/data', 'Parquet', 'a Int32, b String')" + node1.query(f"create table test_overwrite as {table_function}") + node1.query(f"insert into test_overwrite select number, randomString(100) from numbers(5)") + node1.query_and_get_error(f"insert into test_overwrite select number, randomString(100) FROM numbers(10)") + node1.query(f"insert into test_overwrite select number, randomString(100) from numbers(10) settings hdfs_truncate_on_insert=1") + + result = node1.query(f"select count() from test_overwrite") + assert(int(result) == 10) + + +def test_multiple_inserts(started_cluster): + hdfs_api = started_cluster.hdfs_api + + table_function = f"hdfs('hdfs://hdfs1:9000/data_multiple_inserts', 'Parquet', 'a Int32, b String')" + node1.query(f"create table test_multiple_inserts as {table_function}") + node1.query(f"insert into test_multiple_inserts select number, randomString(100) from numbers(10)") + node1.query(f"insert into test_multiple_inserts select number, randomString(100) from numbers(20) settings hdfs_create_new_file_on_insert=1") + node1.query(f"insert into test_multiple_inserts select number, randomString(100) from numbers(30) settings hdfs_create_new_file_on_insert=1") + + result = node1.query(f"select count() from test_multiple_inserts") + assert(int(result) == 60) + + result = node1.query(f"drop table test_multiple_inserts") + + table_function = f"hdfs('hdfs://hdfs1:9000/data_multiple_inserts.gz', 'Parquet', 'a Int32, b String')" + node1.query(f"create table test_multiple_inserts as {table_function}") + node1.query(f"insert into test_multiple_inserts select number, randomString(100) FROM numbers(10)") + node1.query(f"insert into test_multiple_inserts select number, randomString(100) FROM numbers(20) settings hdfs_create_new_file_on_insert=1") + node1.query(f"insert into test_multiple_inserts select number, randomString(100) FROM numbers(30) settings hdfs_create_new_file_on_insert=1") + + result = node1.query(f"select count() from test_multiple_inserts") + assert(int(result) == 60) + + def test_format_detection(started_cluster): node1.query(f"create table arrow_table (x UInt64) engine=HDFS('hdfs://hdfs1:9000/data.arrow')") node1.query(f"insert into arrow_table select 1") diff --git a/tests/integration/test_storage_s3/configs/named_collections.xml b/tests/integration/test_storage_s3/configs/named_collections.xml index ef21ced4d0c..f22440d17c9 100644 --- a/tests/integration/test_storage_s3/configs/named_collections.xml +++ b/tests/integration/test_storage_s3/configs/named_collections.xml @@ -10,6 +10,11 @@ minio minio123 + + http://minio1:9001/root/test_parquet_gz + minio + minio123 + http://minio1:9001/root/test_orc minio diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 0584ccf79b0..a804053d4fd 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -136,7 +136,7 @@ def test_put(started_cluster, maybe_auth, positive, compression): values_csv = "1,2,3\n3,2,1\n78,43,45\n" filename = "test.csv" put_query = f"""insert into table function s3('http://{started_cluster.minio_ip}:{started_cluster.minio_port}/{bucket}/{filename}', - {maybe_auth}'CSV', '{table_format}', {compression}) values {values}""" + {maybe_auth}'CSV', '{table_format}', {compression}) values settings s3_truncate_on_insert=1 {values}""" try: run_query(instance, put_query) @@ -298,7 +298,7 @@ def test_put_csv(started_cluster, maybe_auth, positive): instance = started_cluster.instances["dummy"] # type: 
ClickHouseInstance table_format = "column1 UInt32, column2 UInt32, column3 UInt32" filename = "test.csv" - put_query = "insert into table function s3('http://{}:{}/{}/{}', {}'CSV', '{}') format CSV".format( + put_query = "insert into table function s3('http://{}:{}/{}/{}', {}'CSV', '{}') format CSV settings s3_truncate_on_insert=1".format( started_cluster.minio_ip, MINIO_INTERNAL_PORT, bucket, filename, maybe_auth, table_format) csv_data = "8,9,16\n11,18,13\n22,14,2\n" @@ -322,7 +322,7 @@ def test_put_get_with_redirect(started_cluster): values = "(1, 1, 1), (1, 1, 1), (11, 11, 11)" values_csv = "1,1,1\n1,1,1\n11,11,11\n" filename = "test.csv" - query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format( + query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values settings s3_truncate_on_insert=1 {}".format( started_cluster.minio_redirect_host, started_cluster.minio_redirect_port, bucket, filename, table_format, values) run_query(instance, query) @@ -350,12 +350,12 @@ def test_put_with_zero_redirect(started_cluster): filename = "test.csv" # Should work without redirect - query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format( + query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values settings s3_truncate_on_insert=1 {}".format( started_cluster.minio_ip, MINIO_INTERNAL_PORT, bucket, filename, table_format, values) run_query(instance, query) # Should not work with redirect - query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format( + query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values settings s3_truncate_on_insert=1 {}".format( started_cluster.minio_redirect_host, started_cluster.minio_redirect_port, bucket, filename, table_format, values) exception_raised = False try: @@ -805,13 +805,13 @@ def test_seekable_formats(started_cluster): instance = started_cluster.instances["dummy"] # type: ClickHouseInstance table_function = f"s3(s3_parquet, structure='a Int32, b String', format='Parquet')" - instance.query(f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(5000000)") + instance.query(f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(5000000) settings s3_truncate_on_insert=1") result = instance.query(f"SELECT count() FROM {table_function}") assert(int(result) == 5000000) table_function = f"s3(s3_orc, structure='a Int32, b String', format='ORC')" - exec_query_with_retry(instance, f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(5000000)") + exec_query_with_retry(instance, f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(5000000) settings s3_truncate_on_insert=1") result = instance.query(f"SELECT count() FROM {table_function}") assert(int(result) == 5000000) @@ -827,14 +827,14 @@ def test_seekable_formats_url(started_cluster): instance = started_cluster.instances["dummy"] table_function = f"s3(s3_parquet, structure='a Int32, b String', format='Parquet')" - instance.query(f"insert into table function {table_function} select number, randomString(100) from numbers(5000000)") + instance.query(f"insert into table function {table_function} select number, randomString(100) from numbers(5000000) settings s3_truncate_on_insert=1") table_function = f"url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_parquet', 'Parquet', 'a Int32, 
b String')" result = instance.query(f"SELECT count() FROM {table_function}") assert(int(result) == 5000000) table_function = f"s3(s3_orc, structure='a Int32, b String', format='ORC')" - exec_query_with_retry(instance, f"insert into table function {table_function} select number, randomString(100) from numbers(5000000)") + exec_query_with_retry(instance, f"insert into table function {table_function} select number, randomString(100) from numbers(5000000) settings s3_truncate_on_insert=1") table_function = f"url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_orc', 'ORC', 'a Int32, b String')" result = instance.query(f"SELECT count() FROM {table_function}") @@ -917,6 +917,48 @@ def test_empty_file(started_cluster): assert(int(result) == 0) +def test_overwrite(started_cluster): + bucket = started_cluster.minio_bucket + instance = started_cluster.instances["dummy"] + + table_function = f"s3(s3_parquet, structure='a Int32, b String', format='Parquet')" + instance.query(f"create table test_overwrite as {table_function}") + instance.query(f"truncate table test_overwrite") + instance.query(f"insert into test_overwrite select number, randomString(100) from numbers(50) settings s3_truncate_on_insert=1") + instance.query_and_get_error(f"insert into test_overwrite select number, randomString(100) from numbers(100)") + instance.query(f"insert into test_overwrite select number, randomString(100) from numbers(200) settings s3_truncate_on_insert=1") + + result = instance.query(f"select count() from test_overwrite") + assert(int(result) == 200) + + +def test_create_new_files_on_insert(started_cluster): + bucket = started_cluster.minio_bucket + instance = started_cluster.instances["dummy"] + + table_function = f"s3(s3_parquet, structure='a Int32, b String', format='Parquet')" + instance.query(f"create table test_multiple_inserts as {table_function}") + instance.query(f"truncate table test_multiple_inserts") + instance.query(f"insert into test_multiple_inserts select number, randomString(100) from numbers(10) settings s3_truncate_on_insert=1") + instance.query(f"insert into test_multiple_inserts select number, randomString(100) from numbers(20) settings s3_create_new_file_on_insert=1") + instance.query(f"insert into test_multiple_inserts select number, randomString(100) from numbers(30) settings s3_create_new_file_on_insert=1") + + result = instance.query(f"select count() from test_multiple_inserts") + assert(int(result) == 60) + + instance.query(f"drop table test_multiple_inserts") + + table_function = f"s3(s3_parquet_gz, structure='a Int32, b String', format='Parquet')" + instance.query(f"create table test_multiple_inserts as {table_function}") + instance.query(f"truncate table test_multiple_inserts") + instance.query(f"insert into test_multiple_inserts select number, randomString(100) from numbers(10) settings s3_truncate_on_insert=1") + instance.query(f"insert into test_multiple_inserts select number, randomString(100) from numbers(20) settings s3_create_new_file_on_insert=1") + instance.query(f"insert into test_multiple_inserts select number, randomString(100) from numbers(30) settings s3_create_new_file_on_insert=1") + + result = instance.query(f"select count() from test_multiple_inserts") + assert(int(result) == 60) + + def test_format_detection(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] diff --git a/tests/performance/classification.xml b/tests/performance/classification.xml new file mode 100644 index 
00000000000..370e2c49d29 --- /dev/null +++ b/tests/performance/classification.xml @@ -0,0 +1,20 @@ + + + 1 + + + + hits_100m_single + + + SELECT detectLanguage(SearchPhrase) FROM hits_100m_single FORMAT Null + SELECT detectLanguageMixed(SearchPhrase) FROM hits_100m_single FORMAT Null + SELECT detectTonality(SearchPhrase) FROM hits_100m_single FORMAT Null + + + SELECT detectProgrammingLanguage(SearchPhrase) FROM hits_100m_single FORMAT Null + SELECT detectLanguageUnknown(SearchPhrase) FROM hits_100m_single FORMAT Null + SELECT detectCharset(SearchPhrase) FROM hits_100m_single FORMAT Null + + diff --git a/tests/performance/order_by_tuple.xml b/tests/performance/order_by_tuple.xml new file mode 100644 index 00000000000..72fb1812bbc --- /dev/null +++ b/tests/performance/order_by_tuple.xml @@ -0,0 +1,8 @@ + + + sorting + comparison + + + select * from numbers(300000000) order by (1 - number , number + 1 , number) limit 10; + diff --git a/tests/queries/0_stateless/00312_position_case_insensitive_utf8.reference b/tests/queries/0_stateless/00312_position_case_insensitive_utf8.reference index 86745bb6299..f56ae0763f7 100644 --- a/tests/queries/0_stateless/00312_position_case_insensitive_utf8.reference +++ b/tests/queries/0_stateless/00312_position_case_insensitive_utf8.reference @@ -90,3 +90,31 @@ 21 22 23 +6 +7 +7 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 diff --git a/tests/queries/0_stateless/00312_position_case_insensitive_utf8.sql b/tests/queries/0_stateless/00312_position_case_insensitive_utf8.sql index 010a1d78dfa..8bb8512fa4b 100644 --- a/tests/queries/0_stateless/00312_position_case_insensitive_utf8.sql +++ b/tests/queries/0_stateless/00312_position_case_insensitive_utf8.sql @@ -93,3 +93,34 @@ SELECT position(concat(' иголка.ру', arrayStringConcat SELECT position(concat(' иголка.ру', arrayStringConcat(arrayMap(x -> ' ', range(20000)))), 'иголка.ру') AS res; SELECT position(concat(' иголка.ру', arrayStringConcat(arrayMap(x -> ' ', range(20000)))), 'иголка.ру') AS res; SELECT position(concat(' иголка.ру', arrayStringConcat(arrayMap(x -> ' ', range(20000)))), 'иголка.ру') AS res; + +SELECT positionCaseInsensitiveUTF8(materialize('test ß test'), 'ß') AS res; +SELECT positionCaseInsensitiveUTF8(materialize('test AaßAa test'), 'aßa') AS res; +SELECT positionCaseInsensitiveUTF8(materialize('test A1ß2a test'), '1ß2') AS res; +SELECT positionCaseInsensitiveUTF8(materialize('xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest'), 'aa1ẞ1Yzßxßß1ßß1BC') AS res; + +SELECT positionCaseInsensitiveUTF8(materialize(concat('test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res; +SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res; +SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res; +SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res; +SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res; +SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res; +SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res; +SELECT 
positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res; +SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res; +SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res; +SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res; +SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res; + +SELECT positionCaseInsensitiveUTF8(materialize(concat('xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res; +SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res; +SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res; +SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res; +SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res; +SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res; +SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res; +SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res; +SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res; +SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res; +SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res; +SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res; diff --git a/tests/queries/0_stateless/00985_merge_stack_overflow.sql b/tests/queries/0_stateless/00985_merge_stack_overflow.sql index 3a3e5640a38..1f114f4710d 100644 --- a/tests/queries/0_stateless/00985_merge_stack_overflow.sql +++ b/tests/queries/0_stateless/00985_merge_stack_overflow.sql @@ -1,11 +1,14 @@ +-- Tags: no-parallel +-- ^^^^^^^^^^^ otherwise you may hit TOO_DEEP_RECURSION error during querying system.columns + DROP TABLE IF EXISTS merge1; DROP TABLE IF EXISTS merge2; CREATE TABLE IF NOT EXISTS merge1 (x UInt64) ENGINE = Merge(currentDatabase(), '^merge\\d$'); CREATE TABLE IF NOT EXISTS merge2 (x UInt64) ENGINE = Merge(currentDatabase(), '^merge\\d$'); -SELECT * FROM merge1; -- { serverError 306 } -SELECT * FROM merge2; -- { serverError 306 } +SELECT * FROM merge1; -- { 
serverError TOO_DEEP_RECURSION } +SELECT * FROM merge2; -- { serverError TOO_DEEP_RECURSION } DROP TABLE merge1; DROP TABLE merge2; diff --git a/tests/queries/0_stateless/01018_Distributed__shard_num.reference b/tests/queries/0_stateless/01018_Distributed__shard_num.reference index b2c8b77554b..46963a006ec 100644 --- a/tests/queries/0_stateless/01018_Distributed__shard_num.reference +++ b/tests/queries/0_stateless/01018_Distributed__shard_num.reference @@ -1,34 +1,94 @@ +-- { echoOn } + +-- remote(system.one) +SELECT 'remote(system.one)'; remote(system.one) +SELECT * FROM remote('127.0.0.1', system.one); +0 +SELECT * FROM remote('127.0.0.{1,2}', system.one); 0 0 -0 +SELECT _shard_num, * FROM remote('127.0.0.1', system.one); 1 0 +SELECT _shard_num, * FROM remote('127.0.0.{1,2}', system.one) order by _shard_num; 1 0 2 0 +SELECT _shard_num, * FROM remote('127.0.0.{1,2}', system.one) WHERE _shard_num = 1; 1 0 +-- dist_1 using test_shard_localhost +SELECT 'dist_1'; dist_1 -1 -1 10 -10 +SELECT _shard_num FROM dist_1 order by _shard_num; 1 1 +SELECT _shard_num FROM dist_1 order by _shard_num; +1 +1 +SELECT _shard_num, key FROM dist_1 order by _shard_num; 1 10 1 20 +SELECT key FROM dist_1; 10 20 +SELECT _shard_num FROM dist_1 order by _shard_num; +1 +1 +SELECT _shard_num, key FROM dist_1 order by _shard_num, key; +1 10 +1 20 +SELECT key FROM dist_1; +10 +20 +-- dist_2 using test_cluster_two_shards_localhost +SELECT 'dist_2'; dist_2 +SELECT _shard_num FROM dist_2 order by _shard_num; 1 2 +SELECT _shard_num FROM dist_2 order by _shard_num; +1 +2 +SELECT _shard_num, key FROM dist_2 order by _shard_num, key; 1 100 2 100 +SELECT key FROM dist_2; 100 100 +-- multiple _shard_num +SELECT 'remote(Distributed)'; remote(Distributed) +SELECT _shard_num, key FROM remote('127.0.0.1', currentDatabase(), dist_2) order by _shard_num, key; 1 100 -1 100 +2 100 +-- JOIN system.clusters +SELECT 'JOIN system.clusters'; JOIN system.clusters +SELECT a._shard_num, a.key, b.host_name, b.host_address IN ('::1', '127.0.0.1'), b.port +FROM (SELECT *, _shard_num FROM dist_1) a +JOIN system.clusters b +ON a._shard_num = b.shard_num +WHERE b.cluster = 'test_cluster_two_shards_localhost'; 1 10 localhost 1 9000 1 20 localhost 1 9000 +SELECT _shard_num, key, b.host_name, b.host_address IN ('::1', '127.0.0.1'), b.port +FROM dist_1 a +JOIN system.clusters b +ON _shard_num = b.shard_num +WHERE b.cluster = 'test_cluster_two_shards_localhost'; -- { serverError 403 } +SELECT 'Rewrite with alias'; +Rewrite with alias +SELECT a._shard_num, key FROM dist_1 a; +1 10 +1 20 +-- the same with JOIN, just in case +SELECT a._shard_num, a.key, b.host_name, b.host_address IN ('::1', '127.0.0.1'), b.port +FROM dist_1 a +JOIN system.clusters b +ON a._shard_num = b.shard_num +WHERE b.cluster = 'test_cluster_two_shards_localhost'; -- { serverError 47; } +SELECT 'dist_3'; dist_3 +SELECT * FROM dist_3; 100 foo +SELECT _shard_num, * FROM dist_3 order by _shard_num; foo 100 foo diff --git a/tests/queries/0_stateless/01018_Distributed__shard_num.sql b/tests/queries/0_stateless/01018_Distributed__shard_num.sql index be2df8b664f..d3f4e1ac527 100644 --- a/tests/queries/0_stateless/01018_Distributed__shard_num.sql +++ b/tests/queries/0_stateless/01018_Distributed__shard_num.sql @@ -3,6 +3,28 @@ -- make the order static SET max_threads = 1; +DROP TABLE IF EXISTS mem1; +DROP TABLE IF EXISTS mem2; +DROP TABLE IF EXISTS mem3; +DROP TABLE IF EXISTS dist_1; +DROP TABLE IF EXISTS dist_2; +DROP TABLE IF EXISTS dist_3; + +CREATE TABLE mem1 (key Int) Engine=Memory(); 
+INSERT INTO mem1 VALUES (10); +CREATE TABLE dist_1 AS mem1 Engine=Distributed(test_shard_localhost, currentDatabase(), mem1); +INSERT INTO dist_1 VALUES (20); + +CREATE TABLE mem2 (key Int) Engine=Memory(); +INSERT INTO mem2 VALUES (100); +CREATE TABLE dist_2 AS mem2 Engine=Distributed(test_cluster_two_shards_localhost, currentDatabase(), mem2); + +CREATE TABLE mem3 (key Int, _shard_num String) Engine=Memory(); +INSERT INTO mem3 VALUES (100, 'foo'); +CREATE TABLE dist_3 AS mem3 Engine=Distributed(test_shard_localhost, currentDatabase(), mem3); + +-- { echoOn } + -- remote(system.one) SELECT 'remote(system.one)'; SELECT * FROM remote('127.0.0.1', system.one); @@ -13,27 +35,20 @@ SELECT _shard_num, * FROM remote('127.0.0.{1,2}', system.one) WHERE _shard_num = -- dist_1 using test_shard_localhost SELECT 'dist_1'; -CREATE TABLE mem1 (key Int) Engine=Memory(); -CREATE TABLE dist_1 AS mem1 Engine=Distributed(test_shard_localhost, currentDatabase(), mem1); SELECT _shard_num FROM dist_1 order by _shard_num; -INSERT INTO mem1 VALUES (10); SELECT _shard_num FROM dist_1 order by _shard_num; SELECT _shard_num, key FROM dist_1 order by _shard_num; SELECT key FROM dist_1; -INSERT INTO dist_1 VALUES (20); SELECT _shard_num FROM dist_1 order by _shard_num; SELECT _shard_num, key FROM dist_1 order by _shard_num, key; SELECT key FROM dist_1; -- dist_2 using test_cluster_two_shards_localhost SELECT 'dist_2'; -CREATE TABLE mem2 (key Int) Engine=Memory(); -CREATE TABLE dist_2 AS mem2 Engine=Distributed(test_cluster_two_shards_localhost, currentDatabase(), mem2); SELECT _shard_num FROM dist_2 order by _shard_num; -INSERT INTO mem2 VALUES (100); SELECT _shard_num FROM dist_2 order by _shard_num; SELECT _shard_num, key FROM dist_2 order by _shard_num, key; SELECT key FROM dist_2; @@ -57,8 +72,8 @@ JOIN system.clusters b ON _shard_num = b.shard_num WHERE b.cluster = 'test_cluster_two_shards_localhost'; -- { serverError 403 } --- rewrite does not work with aliases, hence Missing columns (47) -SELECT a._shard_num, key FROM dist_1 a; -- { serverError 47; } +SELECT 'Rewrite with alias'; +SELECT a._shard_num, key FROM dist_1 a; -- the same with JOIN, just in case SELECT a._shard_num, a.key, b.host_name, b.host_address IN ('::1', '127.0.0.1'), b.port FROM dist_1 a @@ -67,8 +82,5 @@ ON a._shard_num = b.shard_num WHERE b.cluster = 'test_cluster_two_shards_localhost'; -- { serverError 47; } SELECT 'dist_3'; -CREATE TABLE mem3 (key Int, _shard_num String) Engine=Memory(); -CREATE TABLE dist_3 AS mem3 Engine=Distributed(test_shard_localhost, currentDatabase(), mem3); -INSERT INTO mem3 VALUES (100, 'foo'); SELECT * FROM dist_3; SELECT _shard_num, * FROM dist_3 order by _shard_num; diff --git a/tests/queries/0_stateless/01042_h3_k_ring.sql b/tests/queries/0_stateless/01042_h3_k_ring.sql index 50d69609ad8..d450954ab7a 100644 --- a/tests/queries/0_stateless/01042_h3_k_ring.sql +++ b/tests/queries/0_stateless/01042_h3_k_ring.sql @@ -1,16 +1,17 @@ -- Tags: no-fasttest -SELECT arraySort(h3kRing(581276613233082367, 1)); -SELECT h3kRing(581276613233082367, 0); -SELECT h3kRing(581276613233082367, -1); -- { serverError 12 } +SELECT arraySort(h3kRing(581276613233082367, toUInt16(1))); +SELECT h3kRing(581276613233082367, toUInt16(0)); +SELECT h3kRing(581276613233082367, -1); -- { serverError 43 } +SELECT h3kRing(581276613233082367, toUInt16(-1)); -- { serverError 12 } DROP TABLE IF EXISTS h3_indexes; -CREATE TABLE h3_indexes (h3_index UInt64, res UInt8) ENGINE = Memory; +-- Test h3 indices and k selected from original test fixture: 
https://github.com/uber/h3/blob/master/src/apps/testapps + +CREATE TABLE h3_indexes (h3_index UInt64, k UInt16) ENGINE = Memory; --- Random geo coordinates were generated using the H3 tool: https://github.com/ClickHouse-Extras/h3/blob/master/src/apps/testapps/mkRandGeo.c at various resolutions from 0 to 15. --- Corresponding H3 index values were in turn generated with those geo coordinates using `geoToH3(lon, lat, res)` ClickHouse function for the following test. INSERT INTO h3_indexes VALUES (579205133326352383,1); INSERT INTO h3_indexes VALUES (581263419093549055,2); @@ -30,6 +31,6 @@ INSERT INTO h3_indexes VALUES (639763125756281263,15); INSERT INTO h3_indexes VALUES (644178757620501158,16); -SELECT arraySort(h3kRing(h3_index, res)) FROM h3_indexes ORDER BY h3_index; +SELECT arraySort(h3kRing(h3_index, k)) FROM h3_indexes ORDER BY h3_index; DROP TABLE h3_indexes; diff --git a/tests/queries/0_stateless/01059_storage_file_compression.sh b/tests/queries/0_stateless/01059_storage_file_compression.sh index 8942113ab12..fbee5070d8d 100755 --- a/tests/queries/0_stateless/01059_storage_file_compression.sh +++ b/tests/queries/0_stateless/01059_storage_file_compression.sh @@ -12,7 +12,6 @@ do ${CLICKHOUSE_CLIENT} --query "CREATE TABLE file (x UInt64) ENGINE = File(TSV, '${CLICKHOUSE_DATABASE}/${m}.tsv.${m}')" ${CLICKHOUSE_CLIENT} --query "TRUNCATE TABLE file" ${CLICKHOUSE_CLIENT} --query "INSERT INTO file SELECT * FROM numbers(1000000)" - sleep 1 ${CLICKHOUSE_CLIENT} --query "SELECT count(), max(x) FROM file" ${CLICKHOUSE_CLIENT} --query "DROP TABLE file" done diff --git a/tests/queries/0_stateless/01271_show_privileges.reference b/tests/queries/0_stateless/01271_show_privileges.reference index cc237a40a3f..86ba859fb0e 100644 --- a/tests/queries/0_stateless/01271_show_privileges.reference +++ b/tests/queries/0_stateless/01271_show_privileges.reference @@ -99,14 +99,14 @@ SYSTEM RELOAD FUNCTION ['SYSTEM RELOAD FUNCTIONS','RELOAD FUNCTION','RELOAD FUNC SYSTEM RELOAD EMBEDDED DICTIONARIES ['RELOAD EMBEDDED DICTIONARIES'] GLOBAL SYSTEM RELOAD SYSTEM RELOAD [] \N SYSTEM SYSTEM RESTART DISK ['SYSTEM RESTART DISK'] GLOBAL SYSTEM -SYSTEM MERGES ['SYSTEM STOP MERGES','SYSTEM START MERGES','STOP_MERGES','START MERGES'] TABLE SYSTEM +SYSTEM MERGES ['SYSTEM STOP MERGES','SYSTEM START MERGES','STOP MERGES','START MERGES'] TABLE SYSTEM SYSTEM TTL MERGES ['SYSTEM STOP TTL MERGES','SYSTEM START TTL MERGES','STOP TTL MERGES','START TTL MERGES'] TABLE SYSTEM SYSTEM FETCHES ['SYSTEM STOP FETCHES','SYSTEM START FETCHES','STOP FETCHES','START FETCHES'] TABLE SYSTEM SYSTEM MOVES ['SYSTEM STOP MOVES','SYSTEM START MOVES','STOP MOVES','START MOVES'] TABLE SYSTEM SYSTEM DISTRIBUTED SENDS ['SYSTEM STOP DISTRIBUTED SENDS','SYSTEM START DISTRIBUTED SENDS','STOP DISTRIBUTED SENDS','START DISTRIBUTED SENDS'] TABLE SYSTEM SENDS -SYSTEM REPLICATED SENDS ['SYSTEM STOP REPLICATED SENDS','SYSTEM START REPLICATED SENDS','STOP_REPLICATED_SENDS','START REPLICATED SENDS'] TABLE SYSTEM SENDS +SYSTEM REPLICATED SENDS ['SYSTEM STOP REPLICATED SENDS','SYSTEM START REPLICATED SENDS','STOP REPLICATED SENDS','START REPLICATED SENDS'] TABLE SYSTEM SENDS SYSTEM SENDS ['SYSTEM STOP SENDS','SYSTEM START SENDS','STOP SENDS','START SENDS'] \N SYSTEM -SYSTEM REPLICATION QUEUES ['SYSTEM STOP REPLICATION QUEUES','SYSTEM START REPLICATION QUEUES','STOP_REPLICATION_QUEUES','START REPLICATION QUEUES'] TABLE SYSTEM +SYSTEM REPLICATION QUEUES ['SYSTEM STOP REPLICATION QUEUES','SYSTEM START REPLICATION QUEUES','STOP REPLICATION QUEUES','START REPLICATION 
QUEUES'] TABLE SYSTEM SYSTEM DROP REPLICA ['DROP REPLICA'] TABLE SYSTEM SYSTEM SYNC REPLICA ['SYNC REPLICA'] TABLE SYSTEM SYSTEM RESTART REPLICA ['RESTART REPLICA'] TABLE SYSTEM diff --git a/tests/queries/0_stateless/01277_fromUnixTimestamp64.reference b/tests/queries/0_stateless/01277_fromUnixTimestamp64.reference index 610041de31e..8c951058ea6 100644 --- a/tests/queries/0_stateless/01277_fromUnixTimestamp64.reference +++ b/tests/queries/0_stateless/01277_fromUnixTimestamp64.reference @@ -3,3 +3,7 @@ UTC 1234567891011 2009-02-13 23:31:31.011 1970-01-15 06:56:07.891011 1970-01-01 Asia/Makassar 1234567891011 2009-02-14 07:31:31.011 1970-01-15 14:56:07.891011 1970-01-01 08:20:34.567891011 DateTime64(9, \'Asia/Makassar\') non-const column 1234567891011 2009-02-13 23:31:31.011 1970-01-15 06:56:07.891011 1970-01-01 00:20:34.567891011 +upper range bound +9904447342 2283-11-10 19:22:22.123 2283-11-10 19:22:22.123456 1925-01-01 00:00:00.586094827 +lower range bound +-1420066799 1925-01-01 01:00:01.123 1925-01-01 01:00:01.123456 1925-01-01 01:00:01.123456789 diff --git a/tests/queries/0_stateless/01277_fromUnixTimestamp64.sql b/tests/queries/0_stateless/01277_fromUnixTimestamp64.sql index 4f1497763e1..e76a4db7a27 100644 --- a/tests/queries/0_stateless/01277_fromUnixTimestamp64.sql +++ b/tests/queries/0_stateless/01277_fromUnixTimestamp64.sql @@ -42,4 +42,30 @@ SELECT i64, fromUnixTimestamp64Milli(i64, tz), fromUnixTimestamp64Micro(i64, tz), - fromUnixTimestamp64Nano(i64, tz) as dt64; \ No newline at end of file + fromUnixTimestamp64Nano(i64, tz) as dt64; + +SELECT 'upper range bound'; +WITH + 9904447342 AS timestamp, + CAST(9904447342123 AS Int64) AS milli, + CAST(9904447342123456 AS Int64) AS micro, + CAST(9904447342123456789 AS Int64) AS nano, + 'UTC' AS tz +SELECT + timestamp, + fromUnixTimestamp64Milli(milli, tz), + fromUnixTimestamp64Micro(micro, tz), + fromUnixTimestamp64Nano(nano, tz); + +SELECT 'lower range bound'; +WITH + -1420066799 AS timestamp, + CAST(-1420066799123 AS Int64) AS milli, + CAST(-1420066799123456 AS Int64) AS micro, + CAST(-1420066799123456789 AS Int64) AS nano, + 'UTC' AS tz +SELECT + timestamp, + fromUnixTimestamp64Milli(milli, tz), + fromUnixTimestamp64Micro(micro, tz), + fromUnixTimestamp64Nano(nano, tz); \ No newline at end of file diff --git a/tests/queries/0_stateless/01442_h3kring_range_check.sql b/tests/queries/0_stateless/01442_h3kring_range_check.sql index 14a9fdabde8..ab8f69f345e 100644 --- a/tests/queries/0_stateless/01442_h3kring_range_check.sql +++ b/tests/queries/0_stateless/01442_h3kring_range_check.sql @@ -1,6 +1,6 @@ -- Tags: no-fasttest SELECT h3kRing(581276613233082367, 65535); -- { serverError 12 } -SELECT h3kRing(581276613233082367, -1); -- { serverError 12 } +SELECT h3kRing(581276613233082367, -1); -- { serverError 43 } SELECT length(h3kRing(111111111111, 1000)); SELECT h3kRing(581276613233082367, nan); -- { serverError 43 } diff --git a/tests/queries/0_stateless/01570_aggregator_combinator_simple_state.reference b/tests/queries/0_stateless/01570_aggregator_combinator_simple_state.reference index 1c7908bf830..351c70637c0 100644 --- a/tests/queries/0_stateless/01570_aggregator_combinator_simple_state.reference +++ b/tests/queries/0_stateless/01570_aggregator_combinator_simple_state.reference @@ -1,14 +1,31 @@ +-- { echo } +with anySimpleState(number) as c select toTypeName(c), c from numbers(1); SimpleAggregateFunction(any, UInt64) 0 +with anyLastSimpleState(number) as c select toTypeName(c), c from numbers(1); SimpleAggregateFunction(anyLast, 
UInt64) 0 +with minSimpleState(number) as c select toTypeName(c), c from numbers(1); SimpleAggregateFunction(min, UInt64) 0 +with maxSimpleState(number) as c select toTypeName(c), c from numbers(1); SimpleAggregateFunction(max, UInt64) 0 +with sumSimpleState(number) as c select toTypeName(c), c from numbers(1); SimpleAggregateFunction(sum, UInt64) 0 +with sumWithOverflowSimpleState(number) as c select toTypeName(c), c from numbers(1); SimpleAggregateFunction(sumWithOverflow, UInt64) 0 +with groupBitAndSimpleState(number) as c select toTypeName(c), c from numbers(1); SimpleAggregateFunction(groupBitAnd, UInt64) 0 +with groupBitOrSimpleState(number) as c select toTypeName(c), c from numbers(1); SimpleAggregateFunction(groupBitOr, UInt64) 0 +with groupBitXorSimpleState(number) as c select toTypeName(c), c from numbers(1); SimpleAggregateFunction(groupBitXor, UInt64) 0 +with sumMapSimpleState(([number], [number])) as c select toTypeName(c), c from numbers(1); SimpleAggregateFunction(sumMap, Tuple(Array(UInt64), Array(UInt64))) ([],[]) +with minMapSimpleState(([number], [number])) as c select toTypeName(c), c from numbers(1); SimpleAggregateFunction(minMap, Tuple(Array(UInt64), Array(UInt64))) ([0],[0]) +with maxMapSimpleState(([number], [number])) as c select toTypeName(c), c from numbers(1); SimpleAggregateFunction(maxMap, Tuple(Array(UInt64), Array(UInt64))) ([0],[0]) +with groupArrayArraySimpleState([number]) as c select toTypeName(c), c from numbers(1); SimpleAggregateFunction(groupArrayArray, Array(UInt64)) [0] +with groupUniqArrayArraySimpleState([number]) as c select toTypeName(c), c from numbers(1); SimpleAggregateFunction(groupUniqArrayArray, Array(UInt64)) [0] +-- non-SimpleAggregateFunction +with countSimpleState(number) as c select toTypeName(c), c from numbers(1); -- { serverError 36 } diff --git a/tests/queries/0_stateless/01570_aggregator_combinator_simple_state.sql b/tests/queries/0_stateless/01570_aggregator_combinator_simple_state.sql index 00a12a69d16..94f0589670f 100644 --- a/tests/queries/0_stateless/01570_aggregator_combinator_simple_state.sql +++ b/tests/queries/0_stateless/01570_aggregator_combinator_simple_state.sql @@ -1,3 +1,4 @@ +-- { echo } with anySimpleState(number) as c select toTypeName(c), c from numbers(1); with anyLastSimpleState(number) as c select toTypeName(c), c from numbers(1); with minSimpleState(number) as c select toTypeName(c), c from numbers(1); diff --git a/tests/queries/0_stateless/01659_h3_buffer_overflow.sql b/tests/queries/0_stateless/01659_h3_buffer_overflow.sql index 0e40b9ef73f..afd5ae1097d 100644 --- a/tests/queries/0_stateless/01659_h3_buffer_overflow.sql +++ b/tests/queries/0_stateless/01659_h3_buffer_overflow.sql @@ -8,5 +8,5 @@ SELECT h3kRing(0xFFFFFFFFF, 1000) FORMAT Null; SELECT h3kRing(0xFFFFFFFFFFFFFF, 1000) FORMAT Null; SELECT h3GetBaseCell(0xFFFFFFFFFFFFFF) FORMAT Null; SELECT h3GetResolution(0xFFFFFFFFFFFFFF) FORMAT Null; -SELECT h3kRing(0xFFFFFFFFFFFFFF, 10) FORMAT Null; +SELECT h3kRing(0xFFFFFFFFFFFFFF, toUInt16(10)) FORMAT Null; SELECT h3ToGeo(0xFFFFFFFFFFFFFF) FORMAT Null; diff --git a/tests/queries/0_stateless/01674_unicode_asan.sql b/tests/queries/0_stateless/01674_unicode_asan.sql index 2ba34b46f93..85c210235c4 100644 --- a/tests/queries/0_stateless/01674_unicode_asan.sql +++ b/tests/queries/0_stateless/01674_unicode_asan.sql @@ -1,3 +1,2 @@ SELECT positionCaseInsensitiveUTF8('иголка.ру', 'иголка.р\0') AS res; -SELECT positionCaseInsensitiveUTF8('иголка.ру', randomString(rand() % 100)) FROM system.numbers; -- { 
serverError 2 } SELECT sum(ignore(positionCaseInsensitiveUTF8('иголка.ру', randomString(rand() % 2)))) FROM numbers(1000000); diff --git a/tests/queries/0_stateless/01810_max_part_removal_threads_long.sh b/tests/queries/0_stateless/01810_max_part_removal_threads_long.sh index c5aaa794ac9..f5ab71d8d34 100755 --- a/tests/queries/0_stateless/01810_max_part_removal_threads_long.sh +++ b/tests/queries/0_stateless/01810_max_part_removal_threads_long.sh @@ -16,22 +16,54 @@ $CLICKHOUSE_CLIENT -nm -q "create database ordinary_$CLICKHOUSE_DATABASE engine= $CLICKHOUSE_CLIENT -nm -q """ use ordinary_$CLICKHOUSE_DATABASE; drop table if exists data_01810; - create table data_01810 (key Int) Engine=MergeTree() order by key partition by key settings max_part_removal_threads=10, concurrent_part_removal_threshold=49; - insert into data_01810 select * from numbers(50); + + create table data_01810 (key Int) + Engine=MergeTree() + order by key + partition by key%100 + settings max_part_removal_threads=10, concurrent_part_removal_threshold=99, min_bytes_for_wide_part=0; + + insert into data_01810 select * from numbers(100); drop table data_01810 settings log_queries=1; system flush logs; - select throwIf(length(thread_ids)<50) from system.query_log where event_date >= yesterday() and current_database = currentDatabase() and query = 'drop table data_01810 settings log_queries=1;' and type = 'QueryFinish' format Null; + + -- sometimes the same thread can be used to remove part, due to ThreadPool, + -- hence we cannot compare strictly. + select throwIf(not(length(thread_ids) between 6 and 11)) + from system.query_log + where + event_date >= yesterday() and + current_database = currentDatabase() and + query = 'drop table data_01810 settings log_queries=1;' and + type = 'QueryFinish' + format Null; """ # ReplicatedMergeTree $CLICKHOUSE_CLIENT -nm -q """ use ordinary_$CLICKHOUSE_DATABASE; drop table if exists rep_data_01810; - create table rep_data_01810 (key Int) Engine=ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/rep_data_01810', '1') order by key partition by key settings max_part_removal_threads=10, concurrent_part_removal_threshold=49; - insert into rep_data_01810 select * from numbers(50); + + create table rep_data_01810 (key Int) + Engine=ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/rep_data_01810', '1') + order by key + partition by key%100 + settings max_part_removal_threads=10, concurrent_part_removal_threshold=99, min_bytes_for_wide_part=0; + + insert into rep_data_01810 select * from numbers(100); drop table rep_data_01810 settings log_queries=1; system flush logs; - select throwIf(length(thread_ids)<50) from system.query_log where event_date >= yesterday() and current_database = currentDatabase() and query = 'drop table rep_data_01810 settings log_queries=1;' and type = 'QueryFinish' format Null; + + -- sometimes the same thread can be used to remove part, due to ThreadPool, + -- hence we cannot compare strictly. 
+ select throwIf(not(length(thread_ids) between 6 and 11)) + from system.query_log + where + event_date >= yesterday() and + current_database = currentDatabase() and + query = 'drop table rep_data_01810 settings log_queries=1;' and + type = 'QueryFinish' + format Null; """ $CLICKHOUSE_CLIENT -nm -q "drop database ordinary_$CLICKHOUSE_DATABASE" diff --git a/tests/queries/0_stateless/01947_multiple_pipe_read.sh b/tests/queries/0_stateless/01947_multiple_pipe_read.sh index de9ca47f8cf..51709eb574e 100755 --- a/tests/queries/0_stateless/01947_multiple_pipe_read.sh +++ b/tests/queries/0_stateless/01947_multiple_pipe_read.sh @@ -3,7 +3,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh -SAMPLE_FILE=$(mktemp 01947_multiple_pipe_read_sample_data_XXXXXX.csv) +SAMPLE_FILE=$(mktemp 01947_multiple_pipe_read_sample_data_XXXXXX.tsv) echo 'File generated:' ${CLICKHOUSE_LOCAL} -q "SELECT number AS x, if(number in (4,6), 'AAA', 'BBB') AS s from numbers(7)" > "$SAMPLE_FILE" diff --git a/tests/queries/0_stateless/02124_insert_deduplication_token.reference b/tests/queries/0_stateless/02124_insert_deduplication_token.reference new file mode 100644 index 00000000000..a29d74fdcd1 --- /dev/null +++ b/tests/queries/0_stateless/02124_insert_deduplication_token.reference @@ -0,0 +1,15 @@ +create and check deduplication +two inserts with exact data, one inserted, one deduplicated by data digest +0 1000 +two inserts with the same dedup token, one inserted, one deduplicated by the token +0 1000 +1 1001 +update dedup token, two inserts with the same dedup token, one inserted, one deduplicated by the token +0 1000 +1 1001 +1 1001 +reset deduplication token and insert new row +0 1000 +1 1001 +1 1001 +2 1002 diff --git a/tests/queries/0_stateless/02124_insert_deduplication_token.sql b/tests/queries/0_stateless/02124_insert_deduplication_token.sql new file mode 100644 index 00000000000..4581ef995fd --- /dev/null +++ b/tests/queries/0_stateless/02124_insert_deduplication_token.sql @@ -0,0 +1,33 @@ +-- insert data duplicates by providing deduplication token on insert + +DROP TABLE IF EXISTS insert_dedup_token SYNC; + +select 'create and check deduplication'; +CREATE TABLE insert_dedup_token ( + id Int32, val UInt32 +) ENGINE=MergeTree() ORDER BY id +SETTINGS non_replicated_deduplication_window=0xFFFFFFFF; + +select 'two inserts with exact data, one inserted, one deduplicated by data digest'; +INSERT INTO insert_dedup_token VALUES(0, 1000); +INSERT INTO insert_dedup_token VALUES(0, 1000); +SELECT * FROM insert_dedup_token ORDER BY id; + +select 'two inserts with the same dedup token, one inserted, one deduplicated by the token'; +set insert_deduplication_token = '\x61\x00\x62'; +INSERT INTO insert_dedup_token VALUES(1, 1001); +INSERT INTO insert_dedup_token VALUES(2, 1002); +SELECT * FROM insert_dedup_token ORDER BY id; + +select 'update dedup token, two inserts with the same dedup token, one inserted, one deduplicated by the token'; +set insert_deduplication_token = '\x61\x00\x63'; +INSERT INTO insert_dedup_token VALUES(1, 1001); +INSERT INTO insert_dedup_token VALUES(2, 1002); +SELECT * FROM insert_dedup_token ORDER BY id; + +select 'reset deduplication token and insert new row'; +set insert_deduplication_token = ''; +INSERT INTO insert_dedup_token VALUES(2, 1002); +SELECT * FROM insert_dedup_token ORDER BY id; + +DROP TABLE insert_dedup_token SYNC; diff --git a/tests/queries/0_stateless/02124_insert_deduplication_token_multiple_blocks.reference 
b/tests/queries/0_stateless/02124_insert_deduplication_token_multiple_blocks.reference new file mode 100644 index 00000000000..5cf6230fd85 --- /dev/null +++ b/tests/queries/0_stateless/02124_insert_deduplication_token_multiple_blocks.reference @@ -0,0 +1,34 @@ +insert 2 blocks with dedup token, 1 row per block +2 +1 +2 +insert deduplicated by token +2 +1 +2 +insert the same data by providing different dedup token +4 +1 +1 +2 +2 +insert 4 blocks, 2 deduplicated, 2 inserted +6 +1 +1 +2 +2 +3 +4 +disable token based deduplication, insert the same data as with token +10 +1 +1 +1 +2 +2 +2 +3 +3 +4 +4 diff --git a/tests/queries/0_stateless/02124_insert_deduplication_token_multiple_blocks.sh b/tests/queries/0_stateless/02124_insert_deduplication_token_multiple_blocks.sh new file mode 100755 index 00000000000..b5f44794c60 --- /dev/null +++ b/tests/queries/0_stateless/02124_insert_deduplication_token_multiple_blocks.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +QUERY_COUNT_ORIGIN_BLOCKS="SELECT COUNT(*) FROM system.parts WHERE database = currentDatabase() AND table = 'block_dedup_token' AND min_block_number == max_block_number;" +QUERY_SELECT_FROM_TABLE_ORDERED="SELECT * FROM block_dedup_token ORDER BY id;" + +$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS block_dedup_token SYNC" +$CLICKHOUSE_CLIENT --query="CREATE TABLE block_dedup_token (id Int32) ENGINE=MergeTree() ORDER BY id SETTINGS non_replicated_deduplication_window=0xFFFFFFFF;" + +$CLICKHOUSE_CLIENT --query="SELECT 'insert 2 blocks with dedup token, 1 row per block'" +DEDUP_TOKEN='dedup1' +echo 'INSERT INTO block_dedup_token VALUES (1), (2)' | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&max_insert_block_size=1&min_insert_block_size_rows=0&min_insert_block_size_bytes=0&insert_deduplication_token='$DEDUP_TOKEN'&query=" --data-binary @- +$CLICKHOUSE_CLIENT --multiquery --query "$QUERY_COUNT_ORIGIN_BLOCKS;$QUERY_SELECT_FROM_TABLE_ORDERED" + +$CLICKHOUSE_CLIENT --query="SELECT 'insert deduplicated by token'" +echo 'INSERT INTO block_dedup_token VALUES (1), (2)' | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&max_insert_block_size=1&min_insert_block_size_rows=0&min_insert_block_size_bytes=0&insert_deduplication_token='$DEDUP_TOKEN'&query=" --data-binary @- +$CLICKHOUSE_CLIENT --multiquery --query "$QUERY_COUNT_ORIGIN_BLOCKS;$QUERY_SELECT_FROM_TABLE_ORDERED" + +$CLICKHOUSE_CLIENT --query="SELECT 'insert the same data by providing different dedup token'" +DEDUP_TOKEN='dedup2' +echo 'INSERT INTO block_dedup_token VALUES (1), (2)' | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&max_insert_block_size=1&min_insert_block_size_rows=0&min_insert_block_size_bytes=0&insert_deduplication_token='$DEDUP_TOKEN'&query=" --data-binary @- +$CLICKHOUSE_CLIENT --multiquery --query "$QUERY_COUNT_ORIGIN_BLOCKS;$QUERY_SELECT_FROM_TABLE_ORDERED" + +$CLICKHOUSE_CLIENT --query="SELECT 'insert 4 blocks, 2 deduplicated, 2 inserted'" +echo 'INSERT INTO block_dedup_token VALUES (1), (2), (3), (4)' | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&max_insert_block_size=1&min_insert_block_size_rows=0&min_insert_block_size_bytes=0&insert_deduplication_token='$DEDUP_TOKEN'&query=" --data-binary @- +$CLICKHOUSE_CLIENT --multiquery --query "$QUERY_COUNT_ORIGIN_BLOCKS;$QUERY_SELECT_FROM_TABLE_ORDERED" + +$CLICKHOUSE_CLIENT --query="SELECT 'disable token based deduplication, insert the same data as with token'" +DEDUP_TOKEN='' +echo 'INSERT INTO block_dedup_token VALUES (1), 
(2), (3), (4)' | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&max_insert_block_size=1&min_insert_block_size_rows=0&min_insert_block_size_bytes=0&insert_deduplication_token='$DEDUP_TOKEN'&query=" --data-binary @- +$CLICKHOUSE_CLIENT --multiquery --query "$QUERY_COUNT_ORIGIN_BLOCKS;$QUERY_SELECT_FROM_TABLE_ORDERED" + +$CLICKHOUSE_CLIENT --query="DROP TABLE block_dedup_token SYNC" diff --git a/tests/queries/0_stateless/02124_insert_deduplication_token_multiple_blocks_replica.reference b/tests/queries/0_stateless/02124_insert_deduplication_token_multiple_blocks_replica.reference new file mode 100644 index 00000000000..5cf6230fd85 --- /dev/null +++ b/tests/queries/0_stateless/02124_insert_deduplication_token_multiple_blocks_replica.reference @@ -0,0 +1,34 @@ +insert 2 blocks with dedup token, 1 row per block +2 +1 +2 +insert deduplicated by token +2 +1 +2 +insert the same data by providing different dedup token +4 +1 +1 +2 +2 +insert 4 blocks, 2 deduplicated, 2 inserted +6 +1 +1 +2 +2 +3 +4 +disable token based deduplication, insert the same data as with token +10 +1 +1 +1 +2 +2 +2 +3 +3 +4 +4 diff --git a/tests/queries/0_stateless/02124_insert_deduplication_token_multiple_blocks_replica.sh b/tests/queries/0_stateless/02124_insert_deduplication_token_multiple_blocks_replica.sh new file mode 100755 index 00000000000..928defd329f --- /dev/null +++ b/tests/queries/0_stateless/02124_insert_deduplication_token_multiple_blocks_replica.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +QUERY_COUNT_ORIGIN_BLOCKS="SELECT COUNT(*) FROM system.parts WHERE database = currentDatabase() AND table = 'block_dedup_token_replica' AND min_block_number == max_block_number;" +QUERY_SELECT_FROM_TABLE_ORDERED="SELECT * FROM block_dedup_token_replica ORDER BY id;" + +$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS block_dedup_token_replica SYNC" +$CLICKHOUSE_CLIENT --query="CREATE TABLE block_dedup_token_replica (id Int32) ENGINE=ReplicatedMergeTree('/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/{table}', '{replica}') ORDER BY id" + +$CLICKHOUSE_CLIENT --query="SELECT 'insert 2 blocks with dedup token, 1 row per block'" +DEDUP_TOKEN='dedup1' +echo 'INSERT INTO block_dedup_token_replica VALUES (1), (2)' | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&max_insert_block_size=1&min_insert_block_size_rows=0&min_insert_block_size_bytes=0&insert_deduplication_token='$DEDUP_TOKEN'&query=" --data-binary @- +$CLICKHOUSE_CLIENT --multiquery --query "$QUERY_COUNT_ORIGIN_BLOCKS;$QUERY_SELECT_FROM_TABLE_ORDERED" + +$CLICKHOUSE_CLIENT --query="SELECT 'insert deduplicated by token'" +echo 'INSERT INTO block_dedup_token_replica VALUES (1), (2)' | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&max_insert_block_size=1&min_insert_block_size_rows=0&min_insert_block_size_bytes=0&insert_deduplication_token='$DEDUP_TOKEN'&query=" --data-binary @- +$CLICKHOUSE_CLIENT --multiquery --query "$QUERY_COUNT_ORIGIN_BLOCKS;$QUERY_SELECT_FROM_TABLE_ORDERED" + +$CLICKHOUSE_CLIENT --query="SELECT 'insert the same data by providing different dedup token'" +DEDUP_TOKEN='dedup2' +echo 'INSERT INTO block_dedup_token_replica VALUES (1), (2)' | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&max_insert_block_size=1&min_insert_block_size_rows=0&min_insert_block_size_bytes=0&insert_deduplication_token='$DEDUP_TOKEN'&query=" --data-binary @- +$CLICKHOUSE_CLIENT --multiquery --query "$QUERY_COUNT_ORIGIN_BLOCKS;$QUERY_SELECT_FROM_TABLE_ORDERED" + +$CLICKHOUSE_CLIENT 
--query="SELECT 'insert 4 blocks, 2 deduplicated, 2 inserted'" +echo 'INSERT INTO block_dedup_token_replica VALUES (1), (2), (3), (4)' | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&max_insert_block_size=1&min_insert_block_size_rows=0&min_insert_block_size_bytes=0&insert_deduplication_token='$DEDUP_TOKEN'&query=" --data-binary @- +$CLICKHOUSE_CLIENT --multiquery --query "$QUERY_COUNT_ORIGIN_BLOCKS;$QUERY_SELECT_FROM_TABLE_ORDERED" + +$CLICKHOUSE_CLIENT --query="SELECT 'disable token based deduplication, insert the same data as with token'" +DEDUP_TOKEN='' +echo 'INSERT INTO block_dedup_token_replica VALUES (1), (2), (3), (4)' | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&max_insert_block_size=1&min_insert_block_size_rows=0&min_insert_block_size_bytes=0&insert_deduplication_token='$DEDUP_TOKEN'&query=" --data-binary @- +$CLICKHOUSE_CLIENT --multiquery --query "$QUERY_COUNT_ORIGIN_BLOCKS;$QUERY_SELECT_FROM_TABLE_ORDERED" + +$CLICKHOUSE_CLIENT --query="DROP TABLE block_dedup_token_replica SYNC" diff --git a/tests/queries/0_stateless/02124_insert_deduplication_token_replica.reference b/tests/queries/0_stateless/02124_insert_deduplication_token_replica.reference new file mode 100644 index 00000000000..27691557d46 --- /dev/null +++ b/tests/queries/0_stateless/02124_insert_deduplication_token_replica.reference @@ -0,0 +1,24 @@ +create replica 1 and check deduplication +two inserts with exact data, one inserted, one deduplicated by data digest +1 1001 +two inserts with the same dedup token, one inserted, one deduplicated by the token +1 1001 +1 1001 +reset deduplication token and insert new row +1 1001 +1 1001 +2 1002 +create replica 2 and check deduplication +inserted value deduplicated by data digest, the same result as before +1 1001 +1 1001 +2 1002 +inserted value deduplicated by dedup token, the same result as before +1 1001 +1 1001 +2 1002 +new record inserted by providing new deduplication token +1 1001 +1 1001 +2 1002 +2 1002 diff --git a/tests/queries/0_stateless/02124_insert_deduplication_token_replica.sql b/tests/queries/0_stateless/02124_insert_deduplication_token_replica.sql new file mode 100644 index 00000000000..47f33a4a971 --- /dev/null +++ b/tests/queries/0_stateless/02124_insert_deduplication_token_replica.sql @@ -0,0 +1,49 @@ +-- insert data duplicates by providing deduplication token on insert + +DROP TABLE IF EXISTS insert_dedup_token1 SYNC; +DROP TABLE IF EXISTS insert_dedup_token2 SYNC; + +select 'create replica 1 and check deduplication'; +CREATE TABLE insert_dedup_token1 ( + id Int32, val UInt32 +) ENGINE=ReplicatedMergeTree('/clickhouse/tables/{database}/insert_dedup_token', 'r1') ORDER BY id; + +select 'two inserts with exact data, one inserted, one deduplicated by data digest'; +INSERT INTO insert_dedup_token1 VALUES(1, 1001); +INSERT INTO insert_dedup_token1 VALUES(1, 1001); +SELECT * FROM insert_dedup_token1 ORDER BY id; + +select 'two inserts with the same dedup token, one inserted, one deduplicated by the token'; +set insert_deduplication_token = '1'; +INSERT INTO insert_dedup_token1 VALUES(1, 1001); +INSERT INTO insert_dedup_token1 VALUES(2, 1002); +SELECT * FROM insert_dedup_token1 ORDER BY id; + +select 'reset deduplication token and insert new row'; +set insert_deduplication_token = ''; +INSERT INTO insert_dedup_token1 VALUES(2, 1002); +SELECT * FROM insert_dedup_token1 ORDER BY id; + +select 'create replica 2 and check deduplication'; +CREATE TABLE insert_dedup_token2 ( + id Int32, val UInt32 +) 
ENGINE=ReplicatedMergeTree('/clickhouse/tables/{database}/insert_dedup_token', 'r2') ORDER BY id; +SYSTEM SYNC REPLICA insert_dedup_token2; + +select 'inserted value deduplicated by data digest, the same result as before'; +set insert_deduplication_token = ''; +INSERT INTO insert_dedup_token2 VALUES(1, 1001); -- deduplicated by data digest +SELECT * FROM insert_dedup_token2 ORDER BY id; + +select 'inserted value deduplicated by dedup token, the same result as before'; +set insert_deduplication_token = '1'; +INSERT INTO insert_dedup_token2 VALUES(3, 1003); -- deduplicated by dedup token +SELECT * FROM insert_dedup_token2 ORDER BY id; + +select 'new record inserted by providing new deduplication token'; +set insert_deduplication_token = '2'; +INSERT INTO insert_dedup_token2 VALUES(2, 1002); -- inserted +SELECT * FROM insert_dedup_token2 ORDER BY id; + +DROP TABLE insert_dedup_token1 SYNC; +DROP TABLE insert_dedup_token2 SYNC; diff --git a/tests/queries/0_stateless/02125_lz4_compression_bug.reference b/tests/queries/0_stateless/02125_lz4_compression_bug.reference new file mode 100644 index 00000000000..713767e1d44 --- /dev/null +++ b/tests/queries/0_stateless/02125_lz4_compression_bug.reference @@ -0,0 +1,45 @@ +Native +9999 +99999 +999999 +2499999 +Values +9999 +99999 +999999 +2499999 +JSONCompactEachRow +9999 +99999 +999999 +2499999 +TSKV +9999 +99999 +999999 +2499999 +TSV +9999 +99999 +999999 +2499999 +CSV +9999 +99999 +999999 +2499999 +JSONEachRow +9999 +99999 +999999 +2499999 +JSONCompactEachRow +9999 +99999 +999999 +2499999 +JSONStringsEachRow +9999 +99999 +999999 +2499999 diff --git a/tests/queries/0_stateless/02125_lz4_compression_bug.sh b/tests/queries/0_stateless/02125_lz4_compression_bug.sh new file mode 100755 index 00000000000..4fddb46658e --- /dev/null +++ b/tests/queries/0_stateless/02125_lz4_compression_bug.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# Tags: no-parallel + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +for format in Native Values JSONCompactEachRow TSKV TSV CSV JSONEachRow JSONCompactEachRow JSONStringsEachRow +do + echo $format + ${CLICKHOUSE_CLIENT} --query "DROP TABLE IF EXISTS file" + ${CLICKHOUSE_CLIENT} --query "CREATE TABLE file (x UInt64) ENGINE = File($format, '${CLICKHOUSE_DATABASE}/data.$format.lz4')" + for size in 10000 100000 1000000 2500000 + do + ${CLICKHOUSE_CLIENT} --query "TRUNCATE TABLE file" + ${CLICKHOUSE_CLIENT} --query "INSERT INTO file SELECT * FROM numbers($size)" + ${CLICKHOUSE_CLIENT} --query "SELECT max(x) FROM file" + done +done + +${CLICKHOUSE_CLIENT} --query "DROP TABLE file" diff --git a/tests/queries/0_stateless/02128_hex_bin_on_uuid.reference b/tests/queries/0_stateless/02128_hex_bin_on_uuid.reference new file mode 100644 index 00000000000..32d25b95178 --- /dev/null +++ b/tests/queries/0_stateless/02128_hex_bin_on_uuid.reference @@ -0,0 +1,5 @@ +32 +1 +0000000080e746f800009d773a2fd319 +128 +00000000000000000000000000000000100000001110011101000110111110000000000000000000100111010111011100111010001011111101001100011001 diff --git a/tests/queries/0_stateless/02128_hex_bin_on_uuid.sql b/tests/queries/0_stateless/02128_hex_bin_on_uuid.sql new file mode 100644 index 00000000000..30c0c4b7629 --- /dev/null +++ b/tests/queries/0_stateless/02128_hex_bin_on_uuid.sql @@ -0,0 +1,16 @@ +-- length should be 32 +select length(hex(generateUUIDv4())); + +with generateUUIDv4() as uuid, + replace(toString(uuid), '-', '') as str1, + lower(hex(uuid)) as str2 +select str1 = str2; + +-- hex on UUID always generate 32 characters even there're leading zeros +select lower(hex(toUUID('00000000-80e7-46f8-0000-9d773a2fd319'))); + +-- length should be 128 +select length(bin(generateUUIDv4())); + +-- bin on UUID always generate 128 characters even there're leading zeros +select bin(toUUID('00000000-80e7-46f8-0000-9d773a2fd319')); \ No newline at end of file diff --git a/tests/queries/0_stateless/02133_classification.reference b/tests/queries/0_stateless/02133_classification.reference new file mode 100644 index 00000000000..92ec5661f56 --- /dev/null +++ b/tests/queries/0_stateless/02133_classification.reference @@ -0,0 +1,15 @@ +ru +en +fr +ja +zh +un +{'ja':0.62,'fr':0.36} +{'ko':0.98} +{} +ISO-8859-1 +en +0.465 +-0.28823888 +0.050505556 +C++ diff --git a/tests/queries/0_stateless/02133_classification.sql b/tests/queries/0_stateless/02133_classification.sql new file mode 100644 index 00000000000..7788ece286a --- /dev/null +++ b/tests/queries/0_stateless/02133_classification.sql @@ -0,0 +1,23 @@ +-- Tags: no-fasttest +-- Tag no-fasttest: depends on cld2 and nlp-data + +SET allow_experimental_nlp_functions = 1; + +SELECT detectLanguage('Они сошлись. 
Волна и камень, Стихи и проза, лед и пламень, Не столь различны меж собой.'); +SELECT detectLanguage('Sweet are the uses of adversity which, like the toad, ugly and venomous, wears yet a precious jewel in his head.'); +SELECT detectLanguage('A vaincre sans peril, on triomphe sans gloire.'); +SELECT detectLanguage('二兎を追う者は一兎をも得ず'); +SELECT detectLanguage('有情饮水饱,无情食饭饥。'); +SELECT detectLanguage('*****///// _____ ,,,,,,,, .....'); +SELECT detectLanguageMixed('二兎を追う者は一兎をも得ず二兎を追う者は一兎をも得ず A vaincre sans peril, on triomphe sans gloire.'); +SELECT detectLanguageMixed('어디든 가치가 있는 곳으로 가려면 지름길은 없다'); +SELECT detectLanguageMixed('*****///// _____ ,,,,,,,, .....'); + +SELECT detectCharset('Plain English'); +SELECT detectLanguageUnknown('Plain English'); + +SELECT detectTonality('милая кошка'); +SELECT detectTonality('ненависть к людям'); +SELECT detectTonality('обычная прогулка по ближайшему парку'); + +SELECT detectProgrammingLanguage('#include '); diff --git a/tests/queries/0_stateless/02155_multiple_inserts_for_formats_with_suffix.reference b/tests/queries/0_stateless/02155_multiple_inserts_for_formats_with_suffix.reference new file mode 100644 index 00000000000..beeb89a5947 --- /dev/null +++ b/tests/queries/0_stateless/02155_multiple_inserts_for_formats_with_suffix.reference @@ -0,0 +1,100 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 diff --git a/tests/queries/0_stateless/02155_multiple_inserts_for_formats_with_suffix.sql b/tests/queries/0_stateless/02155_multiple_inserts_for_formats_with_suffix.sql new file mode 100644 index 00000000000..7947536bc79 --- /dev/null +++ b/tests/queries/0_stateless/02155_multiple_inserts_for_formats_with_suffix.sql @@ -0,0 +1,39 @@ +-- Tags: no-fasttest, no-parallel + +drop table if exists test; +create table test (number UInt64) engine=File('Parquet'); +insert into test select * from numbers(10); +insert into test select * from numbers(10, 10); -- { serverError CANNOT_APPEND_TO_FILE } +insert into test select * from numbers(10, 10) settings engine_file_allow_create_multiple_files=1; +select * from test order by number; +truncate table test; +drop table test; + +create table test (number UInt64) engine=File('Parquet', 'test_02155/test1/data.Parquet'); +insert into test select * from numbers(10) settings engine_file_truncate_on_insert=1; +insert into test select * from numbers(10, 10); -- { serverError CANNOT_APPEND_TO_FILE } +insert into test select * from numbers(10, 10) settings engine_file_allow_create_multiple_files=1; +select * from test order by number; +drop table test; + + +insert into table function file(concat(currentDatabase(), '/test2/data.Parquet'), 'Parquet', 'number UInt64') select * from numbers(10) settings engine_file_truncate_on_insert=1; +insert into table function file(concat(currentDatabase(), '/test2/data.Parquet'), 'Parquet', 'number UInt64') select * from numbers(10, 10); -- { serverError CANNOT_APPEND_TO_FILE } +insert into table function file(concat(currentDatabase(), '/test2/data.Parquet'), 'Parquet', 'number UInt64') select * from numbers(10, 10) settings engine_file_allow_create_multiple_files=1; +select * from file(concat(currentDatabase(), '/test2/data.Parquet'), 'Parquet', 'number UInt64'); +select * from 
file(concat(currentDatabase(), '/test2/data.1.Parquet'), 'Parquet', 'number UInt64'); + +create table test (number UInt64) engine=File('Parquet', 'test_02155/test3/data.Parquet.gz'); +insert into test select * from numbers(10) settings engine_file_truncate_on_insert=1; +; +insert into test select * from numbers(10, 10); -- { serverError CANNOT_APPEND_TO_FILE } +insert into test select * from numbers(10, 10) settings engine_file_allow_create_multiple_files=1; +select * from test order by number; +drop table test; + +insert into table function file(concat(currentDatabase(), '/test4/data.Parquet.gz'), 'Parquet', 'number UInt64') select * from numbers(10) settings engine_file_truncate_on_insert=1; +insert into table function file(concat(currentDatabase(), '/test4/data.Parquet.gz'), 'Parquet', 'number UInt64') select * from numbers(10, 10); -- { serverError CANNOT_APPEND_TO_FILE } +insert into table function file(concat(currentDatabase(), '/test4/data.Parquet.gz'), 'Parquet', 'number UInt64') select * from numbers(10, 10) settings engine_file_allow_create_multiple_files=1; +select * from file(concat(currentDatabase(), '/test4/data.Parquet.gz'), 'Parquet', 'number UInt64'); +select * from file(concat(currentDatabase(), '/test4/data.1.Parquet.gz'), 'Parquet', 'number UInt64'); + diff --git a/tests/queries/0_stateless/02160_client_autocomplete_parse_query.expect b/tests/queries/0_stateless/02160_client_autocomplete_parse_query.expect new file mode 100755 index 00000000000..244e48226e5 --- /dev/null +++ b/tests/queries/0_stateless/02160_client_autocomplete_parse_query.expect @@ -0,0 +1,86 @@ +#!/usr/bin/expect -f + +log_user 0 +set timeout 60 +set uuid "" +match_max 100000 +expect_after { + # Do not ignore eof from read. + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } +} + +set basedir [file dirname $argv0] +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT" +expect ":) " + +# Make a query +send -- "set max_distributed" +expect "set max_distributed" + +# Wait for suggestions to load, they are loaded in background +set is_done 0 +set timeout 1 +while {$is_done == 0} { + send -- "\t" + expect { + "_" { + set is_done 1 + } + default { + # Reset the expect_after + } + } +} +set timeout 60 +# Ctrl-C +send -- "\3" +expect ":) " + +# Generate UUID to avoid matching old database/tables/columns from previous test runs.
+send -- "select 'begin-' || replace(toString(generateUUIDv4()), '-', '') || '-end' format TSV\r" +expect -re TSV.*TSV.*begin-(.*)-end.* +set uuid $expect_out(1,string) +expect ":) " + +# Create +send -- "create database new_${uuid}_database\r" +expect ":) " +send -- "create table new_${uuid}_table (new_${uuid}_column Int) engine=Null()\r" +expect ":) " + +# Check completion +send -- "new_${uuid}_data" +expect "new_${uuid}_data" +send -- "\t" +expect "base" +# Ctrl-C +send -- "\3" +expect ":) " + +send -- "new_${uuid}_ta" +expect "new_${uuid}_ta" +send -- "\t" +expect "ble" +# Ctrl-C +send -- "\3" +expect ":) " + +send -- "new_${uuid}_col" +expect "new_${uuid}_col" +send -- "\t" +expect "umn" +# Ctrl-C +send -- "\3" +expect ":) " + +# Cleanup +send -- "drop database new_${uuid}_database\r" +expect ":) " +send -- "drop table new_${uuid}_table\r" +expect ":) " + +# Ctrl-D +send -- "\4" +expect eof diff --git a/tests/queries/0_stateless/02160_client_autocomplete_parse_query.reference b/tests/queries/0_stateless/02160_client_autocomplete_parse_query.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02163_shard_num.reference b/tests/queries/0_stateless/02163_shard_num.reference new file mode 100644 index 00000000000..a109d5d2b6b --- /dev/null +++ b/tests/queries/0_stateless/02163_shard_num.reference @@ -0,0 +1,17 @@ +-- { echo } +SELECT shardNum() AS shard_num, sum(1) as rows FROM remote('127.{1,2}', system, one) GROUP BY _shard_num; +2 1 +1 1 +SELECT shardNum() AS shard_num, sum(1) as rows FROM remote('127.{1,2}', system, one) GROUP BY shard_num; +2 1 +1 1 +SELECT _shard_num AS shard_num, sum(1) as rows FROM remote('127.{1,2}', system, one) GROUP BY _shard_num; +2 1 +1 1 +SELECT _shard_num AS shard_num, sum(1) as rows FROM remote('127.{1,2}', system, one) GROUP BY shard_num; +2 1 +1 1 +SELECT a._shard_num AS shard_num, sum(1) as rows FROM remote('127.{1,2}', system, one) a GROUP BY shard_num; +2 1 +1 1 +SELECT _shard_num FROM remote('127.1', system.one) AS a INNER JOIN (SELECT _shard_num FROM system.one) AS b USING (dummy); -- { serverError UNKNOWN_IDENTIFIER } diff --git a/tests/queries/0_stateless/02163_shard_num.sql b/tests/queries/0_stateless/02163_shard_num.sql new file mode 100644 index 00000000000..27d40b3c976 --- /dev/null +++ b/tests/queries/0_stateless/02163_shard_num.sql @@ -0,0 +1,7 @@ +-- { echo } +SELECT shardNum() AS shard_num, sum(1) as rows FROM remote('127.{1,2}', system, one) GROUP BY _shard_num; +SELECT shardNum() AS shard_num, sum(1) as rows FROM remote('127.{1,2}', system, one) GROUP BY shard_num; +SELECT _shard_num AS shard_num, sum(1) as rows FROM remote('127.{1,2}', system, one) GROUP BY _shard_num; +SELECT _shard_num AS shard_num, sum(1) as rows FROM remote('127.{1,2}', system, one) GROUP BY shard_num; +SELECT a._shard_num AS shard_num, sum(1) as rows FROM remote('127.{1,2}', system, one) a GROUP BY shard_num; +SELECT _shard_num FROM remote('127.1', system.one) AS a INNER JOIN (SELECT _shard_num FROM system.one) AS b USING (dummy); -- { serverError UNKNOWN_IDENTIFIER } diff --git a/tests/queries/0_stateless/02165_h3_edge_length_km.reference b/tests/queries/0_stateless/02165_h3_edge_length_km.reference new file mode 100644 index 00000000000..95380a2a80e --- /dev/null +++ b/tests/queries/0_stateless/02165_h3_edge_length_km.reference @@ -0,0 +1,16 @@ +1107.712591 +418.6760055 +158.2446558 +59.81085794 +22.6063794 +8.544408276 +3.229482772 +1.220629759 +0.461354684 +0.174375668 +0.065907807 +0.024910561 +0.009415526 
+0.003559893 +0.001348575 +0.000509713 diff --git a/tests/queries/0_stateless/02165_h3_edge_length_km.sql b/tests/queries/0_stateless/02165_h3_edge_length_km.sql new file mode 100644 index 00000000000..e67b691ef66 --- /dev/null +++ b/tests/queries/0_stateless/02165_h3_edge_length_km.sql @@ -0,0 +1,18 @@ +-- Tags: no-fasttest + +SELECT h3EdgeLengthKm(0); +SELECT h3EdgeLengthKm(1); +SELECT h3EdgeLengthKm(2); +SELECT h3EdgeLengthKm(3); +SELECT h3EdgeLengthKm(4); +SELECT h3EdgeLengthKm(5); +SELECT h3EdgeLengthKm(6); +SELECT h3EdgeLengthKm(7); +SELECT h3EdgeLengthKm(8); +SELECT h3EdgeLengthKm(9); +SELECT h3EdgeLengthKm(10); +SELECT h3EdgeLengthKm(11); +SELECT h3EdgeLengthKm(12); +SELECT h3EdgeLengthKm(13); +SELECT h3EdgeLengthKm(14); +SELECT h3EdgeLengthKm(15); diff --git a/tests/queries/0_stateless/02165_h3_exact_edge_length_Km.reference b/tests/queries/0_stateless/02165_h3_exact_edge_length_Km.reference new file mode 100644 index 00000000000..1e44981f9eb --- /dev/null +++ b/tests/queries/0_stateless/02165_h3_exact_edge_length_Km.reference @@ -0,0 +1,16 @@ +489.55559989912314 +192.39078306095627 +66.91913220366439 +1263.6096633631134 +480.7440319163875 +195.44963163407317 +1263.6096633631118 +461.80697194406935 +190.08769842412468 +1263.6096633631123 +465.41972260404145 +64.81970466298482 +1263.6096633631116 +69.63641611246636 +195.6274718146093 +67.66085681290775 diff --git a/tests/queries/0_stateless/02165_h3_exact_edge_length_Km.sql b/tests/queries/0_stateless/02165_h3_exact_edge_length_Km.sql new file mode 100644 index 00000000000..26607227484 --- /dev/null +++ b/tests/queries/0_stateless/02165_h3_exact_edge_length_Km.sql @@ -0,0 +1,29 @@ +-- Tags: no-fasttest + +DROP TABLE IF EXISTS h3_indexes; + +CREATE TABLE h3_indexes (h3_index UInt64) ENGINE = Memory; + +-- Test h3 indices selected from original test fixture: https://github.com/uber/h3/blob/master/src/apps/testapps/testH3CellAreaExhaustive.c + +INSERT INTO h3_indexes VALUES (1298057039473278975); +INSERT INTO h3_indexes VALUES (1370114633511206911); +INSERT INTO h3_indexes VALUES (1442172227549134847); +INSERT INTO h3_indexes VALUES (1514229821587062783); +INSERT INTO h3_indexes VALUES (1232301846085763071); +INSERT INTO h3_indexes VALUES (1304359440123691007); +INSERT INTO h3_indexes VALUES (1376417034161618943); +INSERT INTO h3_indexes VALUES (1448474628199546879); +INSERT INTO h3_indexes VALUES (1598506838100279295); +INSERT INTO h3_indexes VALUES (1238219417666453503); +INSERT INTO h3_indexes VALUES (1310277011704381439); +INSERT INTO h3_indexes VALUES (1382334605742309375); +INSERT INTO h3_indexes VALUES (1458182628678041599); +INSERT INTO h3_indexes VALUES (1530240222715969535); +INSERT INTO h3_indexes VALUES (1602297816753897471); +INSERT INTO h3_indexes VALUES (1242009915283734527); + +SELECT h3ExactEdgeLengthKm(h3_index) FROM h3_indexes ORDER BY h3_index; + +DROP TABLE h3_indexes; + diff --git a/tests/queries/0_stateless/02165_h3_exact_edge_length_m.reference b/tests/queries/0_stateless/02165_h3_exact_edge_length_m.reference new file mode 100644 index 00000000000..52dcaaf8548 --- /dev/null +++ b/tests/queries/0_stateless/02165_h3_exact_edge_length_m.reference @@ -0,0 +1,16 @@ +489555.59989912313 +192390.78306095628 +66919.13220366438 +1263609.6633631135 +480744.0319163875 +195449.63163407316 +1263609.663363112 +461806.9719440694 +190087.69842412468 +1263609.6633631124 +465419.72260404145 +64819.70466298482 +1263609.6633631117 +69636.41611246637 +195627.4718146093 +67660.85681290775 diff --git 
a/tests/queries/0_stateless/02165_h3_exact_edge_length_m.sql b/tests/queries/0_stateless/02165_h3_exact_edge_length_m.sql new file mode 100644 index 00000000000..093ab1dd2d2 --- /dev/null +++ b/tests/queries/0_stateless/02165_h3_exact_edge_length_m.sql @@ -0,0 +1,29 @@ +-- Tags: no-fasttest + +DROP TABLE IF EXISTS h3_indexes; + +CREATE TABLE h3_indexes (h3_index UInt64) ENGINE = Memory; + +-- Test h3 indices selected from original test fixture: https://github.com/uber/h3/blob/master/src/apps/testapps/testH3CellAreaExhaustive.c + +INSERT INTO h3_indexes VALUES (1298057039473278975); +INSERT INTO h3_indexes VALUES (1370114633511206911); +INSERT INTO h3_indexes VALUES (1442172227549134847); +INSERT INTO h3_indexes VALUES (1514229821587062783); +INSERT INTO h3_indexes VALUES (1232301846085763071); +INSERT INTO h3_indexes VALUES (1304359440123691007); +INSERT INTO h3_indexes VALUES (1376417034161618943); +INSERT INTO h3_indexes VALUES (1448474628199546879); +INSERT INTO h3_indexes VALUES (1598506838100279295); +INSERT INTO h3_indexes VALUES (1238219417666453503); +INSERT INTO h3_indexes VALUES (1310277011704381439); +INSERT INTO h3_indexes VALUES (1382334605742309375); +INSERT INTO h3_indexes VALUES (1458182628678041599); +INSERT INTO h3_indexes VALUES (1530240222715969535); +INSERT INTO h3_indexes VALUES (1602297816753897471); +INSERT INTO h3_indexes VALUES (1242009915283734527); + +SELECT h3ExactEdgeLengthM(h3_index) FROM h3_indexes ORDER BY h3_index; + +DROP TABLE h3_indexes; + diff --git a/tests/queries/0_stateless/02165_h3_exact_edge_length_rads.reference b/tests/queries/0_stateless/02165_h3_exact_edge_length_rads.reference new file mode 100644 index 00000000000..cec63f72b07 --- /dev/null +++ b/tests/queries/0_stateless/02165_h3_exact_edge_length_rads.reference @@ -0,0 +1,16 @@ +0.07684116278590451 +0.03019786002394998 +0.010503697500779932 +0.19833750417794152 +0.07545808979092708 +0.030677980118976447 +0.19833750417794127 +0.0724857089044268 +0.029836365432681984 +0.19833750417794133 +0.07305277005463119 +0.010174169141909536 +0.19833750417794122 +0.010930205246202099 +0.030705894101096694 +0.010620119376973209 diff --git a/tests/queries/0_stateless/02165_h3_exact_edge_length_rads.sql b/tests/queries/0_stateless/02165_h3_exact_edge_length_rads.sql new file mode 100644 index 00000000000..d618e69f032 --- /dev/null +++ b/tests/queries/0_stateless/02165_h3_exact_edge_length_rads.sql @@ -0,0 +1,29 @@ +-- Tags: no-fasttest + +DROP TABLE IF EXISTS h3_indexes; + +CREATE TABLE h3_indexes (h3_index UInt64) ENGINE = Memory; + +-- Test h3 indices selected from original test fixture: https://github.com/uber/h3/blob/master/src/apps/testapps/testH3CellAreaExhaustive.c + +INSERT INTO h3_indexes VALUES (1298057039473278975); +INSERT INTO h3_indexes VALUES (1370114633511206911); +INSERT INTO h3_indexes VALUES (1442172227549134847); +INSERT INTO h3_indexes VALUES (1514229821587062783); +INSERT INTO h3_indexes VALUES (1232301846085763071); +INSERT INTO h3_indexes VALUES (1304359440123691007); +INSERT INTO h3_indexes VALUES (1376417034161618943); +INSERT INTO h3_indexes VALUES (1448474628199546879); +INSERT INTO h3_indexes VALUES (1598506838100279295); +INSERT INTO h3_indexes VALUES (1238219417666453503); +INSERT INTO h3_indexes VALUES (1310277011704381439); +INSERT INTO h3_indexes VALUES (1382334605742309375); +INSERT INTO h3_indexes VALUES (1458182628678041599); +INSERT INTO h3_indexes VALUES (1530240222715969535); +INSERT INTO h3_indexes VALUES (1602297816753897471); +INSERT INTO h3_indexes VALUES 
(1242009915283734527); + +SELECT h3ExactEdgeLengthRads(h3_index) FROM h3_indexes ORDER BY h3_index; + +DROP TABLE h3_indexes; + diff --git a/tests/queries/0_stateless/02165_h3_num_hexagons.reference b/tests/queries/0_stateless/02165_h3_num_hexagons.reference new file mode 100644 index 00000000000..b6cfe94c218 --- /dev/null +++ b/tests/queries/0_stateless/02165_h3_num_hexagons.reference @@ -0,0 +1,16 @@ +122 +842 +5882 +41162 +288122 +2016842 +14117882 +98825162 +691776122 +4842432842 +33897029882 +237279209162 +1660954464122 +11626681248842 +81386768741882 +569707381193162 diff --git a/tests/queries/0_stateless/02165_h3_num_hexagons.sql b/tests/queries/0_stateless/02165_h3_num_hexagons.sql new file mode 100644 index 00000000000..7ab48b3738b --- /dev/null +++ b/tests/queries/0_stateless/02165_h3_num_hexagons.sql @@ -0,0 +1,19 @@ +-- Tags: no-fasttest + +SELECT h3NumHexagons(0); +SELECT h3NumHexagons(1); +SELECT h3NumHexagons(2); +SELECT h3NumHexagons(3); +SELECT h3NumHexagons(4); +SELECT h3NumHexagons(5); +SELECT h3NumHexagons(6); +SELECT h3NumHexagons(7); +SELECT h3NumHexagons(8); +SELECT h3NumHexagons(9); +SELECT h3NumHexagons(10); +SELECT h3NumHexagons(11); +SELECT h3NumHexagons(12); +SELECT h3NumHexagons(13); +SELECT h3NumHexagons(14); +SELECT h3NumHexagons(15); +SELECT h3NumHexagons(16); -- { serverError 69 } diff --git a/tests/queries/0_stateless/02168_avro_bug.sql b/tests/queries/0_stateless/02168_avro_bug.sql index 78eedf3258e..ac98119845f 100644 --- a/tests/queries/0_stateless/02168_avro_bug.sql +++ b/tests/queries/0_stateless/02168_avro_bug.sql @@ -1,5 +1,5 @@ --- Tags: no-fasttest -insert into table function file('data.avro', 'Avro', 'x UInt64') select * from numbers(10); -insert into table function file('data.avro', 'Avro', 'x UInt64') select * from numbers(10); -insert into table function file('data.avro', 'Avro', 'x UInt64') select * from numbers(10); +-- Tags: no-fasttest, no-parallel +insert into table function file('data.avro', 'Parquet', 'x UInt64') select * from numbers(10); +insert into table function file('data.avro', 'Parquet', 'x UInt64') select * from numbers(10); -- { serverError CANNOT_APPEND_TO_FILE } +insert into table function file('data.avro', 'Parquet', 'x UInt64') select * from numbers(10); -- { serverError CANNOT_APPEND_TO_FILE } select 'OK'; diff --git a/tests/queries/0_stateless/02176_optimize_aggregation_in_order_empty.reference b/tests/queries/0_stateless/02176_optimize_aggregation_in_order_empty.reference new file mode 100644 index 00000000000..645cec31b47 --- /dev/null +++ b/tests/queries/0_stateless/02176_optimize_aggregation_in_order_empty.reference @@ -0,0 +1,8 @@ +-- { echoOn } + +-- regression for optimize_aggregation_in_order with empty result set +-- that cause at first +-- "Chunk should have AggregatedChunkInfo in GroupingAggregatedTransform" +-- at first and after +-- "Chunk should have AggregatedChunkInfo in GroupingAggregatedTransform" +select count() from remote('127.{1,2}', currentDatabase(), data_02176) where key = 0 group by key settings optimize_aggregation_in_order=1; diff --git a/tests/queries/0_stateless/02176_optimize_aggregation_in_order_empty.sql b/tests/queries/0_stateless/02176_optimize_aggregation_in_order_empty.sql new file mode 100644 index 00000000000..a86fd4357c8 --- /dev/null +++ b/tests/queries/0_stateless/02176_optimize_aggregation_in_order_empty.sql @@ -0,0 +1,14 @@ +drop table if exists data_02176; +create table data_02176 (key Int) Engine=MergeTree() order by key; + +-- { echoOn } + +-- regression for 
optimize_aggregation_in_order with empty result set +-- that cause at first +-- "Chunk should have AggregatedChunkInfo in GroupingAggregatedTransform" +-- at first and after +-- "Chunk should have AggregatedChunkInfo in GroupingAggregatedTransform" +select count() from remote('127.{1,2}', currentDatabase(), data_02176) where key = 0 group by key settings optimize_aggregation_in_order=1; + +-- { echoOff } +drop table data_02176; diff --git a/tests/queries/0_stateless/02177_merge_optimize_aggregation_in_order.reference b/tests/queries/0_stateless/02177_merge_optimize_aggregation_in_order.reference new file mode 100644 index 00000000000..00e893213c0 --- /dev/null +++ b/tests/queries/0_stateless/02177_merge_optimize_aggregation_in_order.reference @@ -0,0 +1,6 @@ +-- { echoOn } + +-- regression for optimize_aggregation_in_order +-- that cause "Chunk should have AggregatedChunkInfo in GroupingAggregatedTransform" error +select count() from remote('127.{1,2}', currentDatabase(), data_02177) group by key settings optimize_aggregation_in_order=1; +2 diff --git a/tests/queries/0_stateless/02177_merge_optimize_aggregation_in_order.sql b/tests/queries/0_stateless/02177_merge_optimize_aggregation_in_order.sql new file mode 100644 index 00000000000..17c4a1dba29 --- /dev/null +++ b/tests/queries/0_stateless/02177_merge_optimize_aggregation_in_order.sql @@ -0,0 +1,12 @@ +drop table if exists data_02177; +create table data_02177 (key Int) Engine=MergeTree() order by key; +insert into data_02177 values (1); + +-- { echoOn } + +-- regression for optimize_aggregation_in_order +-- that cause "Chunk should have AggregatedChunkInfo in GroupingAggregatedTransform" error +select count() from remote('127.{1,2}', currentDatabase(), data_02177) group by key settings optimize_aggregation_in_order=1; + +-- { echoOff } +drop table data_02177; diff --git a/tests/queries/0_stateless/02177_sum_if_not_found.reference b/tests/queries/0_stateless/02177_sum_if_not_found.reference new file mode 100644 index 00000000000..bb0b1cf658d --- /dev/null +++ b/tests/queries/0_stateless/02177_sum_if_not_found.reference @@ -0,0 +1,3 @@ +0 +0 +0 diff --git a/tests/queries/0_stateless/02177_sum_if_not_found.sql b/tests/queries/0_stateless/02177_sum_if_not_found.sql new file mode 100644 index 00000000000..c888f8b39aa --- /dev/null +++ b/tests/queries/0_stateless/02177_sum_if_not_found.sql @@ -0,0 +1,35 @@ +SELECT sumIf(1, 0); +SELECT SumIf(1, 0); +SELECT sUmIf(1, 0); +SELECT sumIF(1, 0); -- { serverError 46 } + +DROP TABLE IF EXISTS data; +DROP TABLE IF EXISTS agg; + +CREATE TABLE data +( + `n` UInt32, + `t` DateTime +) +ENGINE = Null; + +CREATE TABLE agg +ENGINE = AggregatingMergeTree +ORDER BY tuple() AS +SELECT + t, + sumIF(n, 0) +FROM data +GROUP BY t; -- { serverError 46} + +CREATE TABLE agg +ENGINE = AggregatingMergeTree +ORDER BY tuple() AS +SELECT + t, + sumIf(n, 0) +FROM data +GROUP BY t; + +DROP TABLE data; +DROP TABLE agg; diff --git a/tests/queries/0_stateless/02179_map_cast_to_array.reference b/tests/queries/0_stateless/02179_map_cast_to_array.reference new file mode 100644 index 00000000000..c1870e78bb7 --- /dev/null +++ b/tests/queries/0_stateless/02179_map_cast_to_array.reference @@ -0,0 +1,8 @@ +{1:'Test'} [(1,'Test')] [(1,'Test')] +{1:'1234'} [(1,1234)] [(1,1234)] +{1:[1,2,3]} [(1,['1','2','3'])] [(1,['1','2','3'])] +{1:['1','2','3']} [(1,[1,2,3])] [(1,[1,2,3])] +{1:{1:'1234'}} [(1,{1:'1234'})] [(1,{1:'1234'})] +{1:{1:'1234'}} [(1,{1:1234})] [(1,{1:1234})] +{1:{1:'1234'}} [(1,[(1,'1234')])] [(1,[(1,'1234')])] 
+{1:{1:'1234'}} [(1,[(1,1234)])] [(1,[(1,1234)])] diff --git a/tests/queries/0_stateless/02179_map_cast_to_array.sql b/tests/queries/0_stateless/02179_map_cast_to_array.sql new file mode 100644 index 00000000000..b1320d7a43c --- /dev/null +++ b/tests/queries/0_stateless/02179_map_cast_to_array.sql @@ -0,0 +1,26 @@ +WITH map(1, 'Test') AS value, 'Array(Tuple(UInt64, String))' AS type +SELECT value, cast(value, type), cast(materialize(value), type); + +WITH map(1, 'Test') AS value, 'Array(Tuple(UInt64, UInt64))' AS type +SELECT value, cast(value, type), cast(materialize(value), type); --{serverError 6} + +WITH map(1, '1234') AS value, 'Array(Tuple(UInt64, UInt64))' AS type +SELECT value, cast(value, type), cast(materialize(value), type); + +WITH map(1, [1, 2, 3]) AS value, 'Array(Tuple(UInt64, Array(String)))' AS type +SELECT value, cast(value, type), cast(materialize(value), type); + +WITH map(1, ['1', '2', '3']) AS value, 'Array(Tuple(UInt64, Array(UInt64)))' AS type +SELECT value, cast(value, type), cast(materialize(value), type); + +WITH map(1, map(1, '1234')) AS value, 'Array(Tuple(UInt64, Map(UInt64, String)))' AS type +SELECT value, cast(value, type), cast(materialize(value), type); + +WITH map(1, map(1, '1234')) AS value, 'Array(Tuple(UInt64, Map(UInt64, UInt64)))' AS type +SELECT value, cast(value, type), cast(materialize(value), type); + +WITH map(1, map(1, '1234')) AS value, 'Array(Tuple(UInt64, Array(Tuple(UInt64, String))))' AS type +SELECT value, cast(value, type), cast(materialize(value), type); + +WITH map(1, map(1, '1234')) as value, 'Array(Tuple(UInt64, Array(Tuple(UInt64, UInt64))))' AS type +SELECT value, cast(value, type), cast(materialize(value), type); diff --git a/tests/queries/0_stateless/02179_range_hashed_dictionary_invalid_interval.reference b/tests/queries/0_stateless/02179_range_hashed_dictionary_invalid_interval.reference index d7753418087..c1ffcc3bb0f 100644 --- a/tests/queries/0_stateless/02179_range_hashed_dictionary_invalid_interval.reference +++ b/tests/queries/0_stateless/02179_range_hashed_dictionary_invalid_interval.reference @@ -3,3 +3,5 @@ DefaultValue 1 0 0 15 20 Value +0 10 0 Value +0 15 10 Value diff --git a/tests/queries/0_stateless/02179_sparse_columns_detach.reference b/tests/queries/0_stateless/02179_sparse_columns_detach.reference new file mode 100644 index 00000000000..2f9714f7a97 --- /dev/null +++ b/tests/queries/0_stateless/02179_sparse_columns_detach.reference @@ -0,0 +1,12 @@ +1000 +id Default +s Sparse +1000 +id Default +s Sparse +1000 +id Default +s Sparse +1000 +id Default +s Sparse diff --git a/tests/queries/0_stateless/02179_sparse_columns_detach.sql b/tests/queries/0_stateless/02179_sparse_columns_detach.sql new file mode 100644 index 00000000000..4720e6720ba --- /dev/null +++ b/tests/queries/0_stateless/02179_sparse_columns_detach.sql @@ -0,0 +1,53 @@ +DROP TABLE IF EXISTS t_sparse_detach; + +CREATE TABLE t_sparse_detach(id UInt64, s String) +ENGINE = MergeTree ORDER BY id +SETTINGS ratio_of_defaults_for_sparse_serialization = 0.9; + +INSERT INTO t_sparse_detach SELECT number, number % 20 = 0 ? toString(number) : '' FROM numbers(10000); +INSERT INTO t_sparse_detach SELECT number, number % 20 = 0 ? 
toString(number) : '' FROM numbers(10000); + +OPTIMIZE TABLE t_sparse_detach FINAL; + +SELECT count() FROM t_sparse_detach WHERE s != ''; + +SELECT column, serialization_kind FROM system.parts_columns +WHERE table = 't_sparse_detach' AND database = currentDatabase() AND active +ORDER BY column; + +DETACH TABLE t_sparse_detach; +ATTACH TABLE t_sparse_detach; + +SELECT count() FROM t_sparse_detach WHERE s != ''; + +SELECT column, serialization_kind FROM system.parts_columns +WHERE table = 't_sparse_detach' AND database = currentDatabase() AND active +ORDER BY column; + +TRUNCATE TABLE t_sparse_detach; + +ALTER TABLE t_sparse_detach + MODIFY SETTING vertical_merge_algorithm_min_rows_to_activate = 1, + vertical_merge_algorithm_min_columns_to_activate = 1; + +INSERT INTO t_sparse_detach SELECT number, number % 20 = 0 ? toString(number) : '' FROM numbers(10000); +INSERT INTO t_sparse_detach SELECT number, number % 20 = 0 ? toString(number) : '' FROM numbers(10000); + +OPTIMIZE TABLE t_sparse_detach FINAL; + +SELECT count() FROM t_sparse_detach WHERE s != ''; + +SELECT column, serialization_kind FROM system.parts_columns +WHERE table = 't_sparse_detach' AND database = currentDatabase() AND active +ORDER BY column; + +DETACH TABLE t_sparse_detach; +ATTACH TABLE t_sparse_detach; + +SELECT count() FROM t_sparse_detach WHERE s != ''; + +SELECT column, serialization_kind FROM system.parts_columns +WHERE table = 't_sparse_detach' AND database = currentDatabase() AND active +ORDER BY column; + +DROP TABLE t_sparse_detach; diff --git a/tests/queries/0_stateless/02181_dictionary_attach_detach.reference b/tests/queries/0_stateless/02181_dictionary_attach_detach.reference new file mode 100644 index 00000000000..6a9fb68a92e --- /dev/null +++ b/tests/queries/0_stateless/02181_dictionary_attach_detach.reference @@ -0,0 +1,2 @@ +0 Value +0 Value diff --git a/tests/queries/0_stateless/02181_dictionary_attach_detach.sql b/tests/queries/0_stateless/02181_dictionary_attach_detach.sql new file mode 100644 index 00000000000..fb7a2aa71fb --- /dev/null +++ b/tests/queries/0_stateless/02181_dictionary_attach_detach.sql @@ -0,0 +1,39 @@ +DROP TABLE IF EXISTS 02181_test_table; +CREATE TABLE 02181_test_table +( + id UInt64, + value String +) +ENGINE = TinyLog; + +INSERT INTO 02181_test_table VALUES (0, 'Value'); + +DROP DICTIONARY IF EXISTS 02181_test_dictionary; +CREATE DICTIONARY 02181_test_dictionary +( + id UInt64, + value String +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(TABLE '02181_test_table')) +LAYOUT(HASHED()) +LIFETIME(0); + +DETACH TABLE 02181_test_dictionary; --{serverError 520} +ATTACH TABLE 02181_test_dictionary; --{serverError 80} + +DETACH DICTIONARY 02181_test_dictionary; +ATTACH DICTIONARY 02181_test_dictionary; + +SELECT * FROM 02181_test_dictionary; + +DETACH DICTIONARY 02181_test_dictionary; +ATTACH DICTIONARY 02181_test_dictionary; + +SELECT * FROM 02181_test_dictionary; + +DETACH DICTIONARY 02181_test_dictionary; +ATTACH DICTIONARY 02181_test_dictionary; + +DROP DICTIONARY 02181_test_dictionary; +DROP TABLE 02181_test_table; diff --git a/tests/queries/0_stateless/02181_format_from_file_extension_local.reference b/tests/queries/0_stateless/02181_format_from_file_extension_local.reference new file mode 100644 index 00000000000..8b1acc12b63 --- /dev/null +++ b/tests/queries/0_stateless/02181_format_from_file_extension_local.reference @@ -0,0 +1,10 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 diff --git a/tests/queries/0_stateless/02181_format_from_file_extension_local.sh 
b/tests/queries/0_stateless/02181_format_from_file_extension_local.sh new file mode 100755 index 00000000000..418640f5a70 --- /dev/null +++ b/tests/queries/0_stateless/02181_format_from_file_extension_local.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + + +$CLICKHOUSE_LOCAL -q "select * from numbers(10) format Parquet" > $CLICKHOUSE_TMP/data.parquet +$CLICKHOUSE_LOCAL -q "select * from table" --file $CLICKHOUSE_TMP/data.parquet + diff --git a/tests/queries/0_stateless/02181_sql_user_defined_functions_invalid_lambda.reference b/tests/queries/0_stateless/02181_sql_user_defined_functions_invalid_lambda.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02181_sql_user_defined_functions_invalid_lambda.sql b/tests/queries/0_stateless/02181_sql_user_defined_functions_invalid_lambda.sql new file mode 100644 index 00000000000..c436394ab99 --- /dev/null +++ b/tests/queries/0_stateless/02181_sql_user_defined_functions_invalid_lambda.sql @@ -0,0 +1,4 @@ +CREATE FUNCTION 02181_invalid_lambda AS lambda(((x * 2) AS x_doubled) + x_doubled); --{serverError 1} +CREATE FUNCTION 02181_invalid_lambda AS lambda(x); --{serverError 1} +CREATE FUNCTION 02181_invalid_lambda AS lambda(); --{serverError 1} +CREATE FUNCTION 02181_invalid_lambda AS lambda(tuple(x)) --{serverError 1} diff --git a/tests/queries/0_stateless/02182_format_and_schema_from_stdin.reference b/tests/queries/0_stateless/02182_format_and_schema_from_stdin.reference new file mode 100644 index 00000000000..8b1acc12b63 --- /dev/null +++ b/tests/queries/0_stateless/02182_format_and_schema_from_stdin.reference @@ -0,0 +1,10 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 diff --git a/tests/queries/0_stateless/02182_format_and_schema_from_stdin.sh b/tests/queries/0_stateless/02182_format_and_schema_from_stdin.sh new file mode 100755 index 00000000000..555649718f8 --- /dev/null +++ b/tests/queries/0_stateless/02182_format_and_schema_from_stdin.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + + +$CLICKHOUSE_CLIENT -q "select * from numbers(10) format Parquet" > $CLICKHOUSE_TMP/data.parquet +$CLICKHOUSE_LOCAL -q "select * from table" --file="-" < $CLICKHOUSE_TMP/data.parquet + diff --git a/tests/queries/0_stateless/02182_json_each_row_schema_inference.reference b/tests/queries/0_stateless/02182_json_each_row_schema_inference.reference new file mode 100644 index 00000000000..df8d5948241 --- /dev/null +++ b/tests/queries/0_stateless/02182_json_each_row_schema_inference.reference @@ -0,0 +1,6 @@ +0 +1 +2 +0 +1 +2 diff --git a/tests/queries/0_stateless/02182_json_each_row_schema_inference.sh b/tests/queries/0_stateless/02182_json_each_row_schema_inference.sh new file mode 100755 index 00000000000..59c0d56a086 --- /dev/null +++ b/tests/queries/0_stateless/02182_json_each_row_schema_inference.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + + +echo '[{"number":"0"} ,{"number":"1"} , {"number":"2"}]' > $CLICKHOUSE_TMP/02182_data +$CLICKHOUSE_LOCAL -q "SELECT * FROM table" --file $CLICKHOUSE_TMP/02182_data --input-format JSONEachRow + +echo '["0"] ,["1"] ; ["2"]' > $CLICKHOUSE_TMP/02182_data +$CLICKHOUSE_LOCAL -q "SELECT * FROM table" --file $CLICKHOUSE_TMP/02182_data --input-format JSONCompactEachRow + diff --git a/tests/queries/0_stateless/02183_array_tuple_literals_remote.reference b/tests/queries/0_stateless/02183_array_tuple_literals_remote.reference new file mode 100644 index 00000000000..1444ec4d2d9 --- /dev/null +++ b/tests/queries/0_stateless/02183_array_tuple_literals_remote.reference @@ -0,0 +1,11 @@ +[0] +(0,1) +[[0,1],[2,3]] +[(0,1),(2,3)] +[(0,1),(2,3)] +([0,1],(2,3),[4],(5,'a'),6,'b') +[0,1] +(0,1) +[[0,1],[2,3]] +[[0,1],[0,0]] +[[[0]],[[1],[2,3]]] diff --git a/tests/queries/0_stateless/02183_array_tuple_literals_remote.sql b/tests/queries/0_stateless/02183_array_tuple_literals_remote.sql new file mode 100644 index 00000000000..25c7e7d7348 --- /dev/null +++ b/tests/queries/0_stateless/02183_array_tuple_literals_remote.sql @@ -0,0 +1,11 @@ +SELECT any(array(0)) AS k FROM remote('127.0.0.{1,2}', numbers(10)); +SELECT any(tuple(0, 1)) AS k FROM remote('127.0.0.{1,2}', numbers(10)); +SELECT any(array(array(0, 1), array(2, 3))) AS k FROM remote('127.0.0.{1,2}', numbers(10)); +SELECT any(array(tuple(0, 1), tuple(2, 3))) AS k FROM remote('127.0.0.{1,2}', numbers(10)); +SELECT any(array((0, 1), (2, 3))) AS k FROM remote('127.0.0.{1,2}', numbers(10)); +SELECT any(tuple(array(0, 1), tuple(2, 3), [4], (5, 'a'), 6, 'b')) AS k FROM remote('127.0.0.{1,2}', numbers(10)); +SELECT any(array(number, 1)) AS k FROM remote('127.0.0.{1,2}', numbers(10)); +SELECT any(tuple(number, 1)) AS k FROM remote('127.0.0.{1,2}', numbers(10)); +SELECT any(array(array(0, 1), [2, 3])) AS k FROM remote('127.0.0.{1,2}', numbers(10)); +SELECT any(array(array(0, 1), [number, number])) AS k FROM remote('127.0.0.{1,2}', numbers(10)); +SELECT any([[[number]],[[number + 1], [number + 2, number + 3]]]) AS k FROM remote('127.0.0.{1,2}', numbers(10)); diff --git a/tests/queries/0_stateless/02183_combinator_if.reference b/tests/queries/0_stateless/02183_combinator_if.reference new file mode 100644 index 00000000000..05bda3751b5 --- /dev/null +++ b/tests/queries/0_stateless/02183_combinator_if.reference @@ -0,0 +1,40 @@ +\N +\N +Hello +Hello +Hello + +Hello +Hello +0 \N +1 \N +0 Hello +1 Hello +0 +1 +0 Hello +1 Hello +0 \N +1 \N +0 Hello +1 Hello +\N +\N +Hello +Hello +Hello + +Hello +Hello +0 \N +1 \N +0 Hello +1 Hello +0 +1 +0 Hello +1 Hello +0 \N +1 \N +0 Hello +1 Hello diff --git a/tests/queries/0_stateless/02183_combinator_if.sql b/tests/queries/0_stateless/02183_combinator_if.sql new file mode 100644 index 00000000000..ec716407939 --- /dev/null +++ b/tests/queries/0_stateless/02183_combinator_if.sql @@ -0,0 +1,40 @@ +SELECT anyIf(toNullable('Hello'), arrayJoin([1, NULL]) = 0); + +SELECT anyIf(toNullable('Hello'), arrayJoin([1, 1]) = 0); +SELECT anyIf(toNullable('Hello'), arrayJoin([1, 0]) = 0); +SELECT anyIf(toNullable('Hello'), arrayJoin([0, 1]) = 0); +SELECT anyIf(toNullable('Hello'), arrayJoin([0, 0]) = 0); + +SELECT anyIf('Hello', arrayJoin([1, NULL]) = 0); +SELECT anyIf('Hello', arrayJoin([1, NULL]) = 1); +SELECT anyIf('Hello', arrayJoin([1, NULL]) IS NULL); + +SELECT number, anyIf(toNullable('Hello'), arrayJoin([1, NULL]) = 0) FROM numbers(2) GROUP BY number ORDER BY number; +SELECT number, anyIf(toNullable('Hello'), 
arrayJoin([1, NULL, 0]) = 0) FROM numbers(2) GROUP BY number ORDER BY number; + +SELECT number, anyIf('Hello', arrayJoin([1, NULL]) = 0) FROM numbers(2) GROUP BY number ORDER BY number; +SELECT number, anyIf('Hello', arrayJoin([1, NULL, 0]) = 0) FROM numbers(2) GROUP BY number ORDER BY number; + +SELECT number, anyIf(toNullable('Hello'), arrayJoin([1, 1]) = 0) FROM numbers(2) GROUP BY number ORDER BY number; +SELECT number, anyIf(toNullable('Hello'), arrayJoin([1, 0]) = 0) FROM numbers(2) GROUP BY number ORDER BY number; + + +SELECT anyIf(toNullable('Hello'), arrayJoin([1, NULL]) = 0) FROM remote('127.0.0.{1,2}', system.one); + +SELECT anyIf(toNullable('Hello'), arrayJoin([1, 1]) = 0) FROM remote('127.0.0.{1,2}', system.one); +SELECT anyIf(toNullable('Hello'), arrayJoin([1, 0]) = 0) FROM remote('127.0.0.{1,2}', system.one); +SELECT anyIf(toNullable('Hello'), arrayJoin([0, 1]) = 0) FROM remote('127.0.0.{1,2}', system.one); +SELECT anyIf(toNullable('Hello'), arrayJoin([0, 0]) = 0) FROM remote('127.0.0.{1,2}', system.one); + +SELECT anyIf('Hello', arrayJoin([1, NULL]) = 0) FROM remote('127.0.0.{1,2}', system.one); +SELECT anyIf('Hello', arrayJoin([1, NULL]) = 1) FROM remote('127.0.0.{1,2}', system.one); +SELECT anyIf('Hello', arrayJoin([1, NULL]) IS NULL) FROM remote('127.0.0.{1,2}', system.one); + +SELECT number, anyIf(toNullable('Hello'), arrayJoin([1, NULL]) = 0) FROM remote('127.0.0.{1,2}', numbers(2)) GROUP BY number ORDER BY number; +SELECT number, anyIf(toNullable('Hello'), arrayJoin([1, NULL, 0]) = 0) FROM remote('127.0.0.{1,2}', numbers(2)) GROUP BY number ORDER BY number; + +SELECT number, anyIf('Hello', arrayJoin([1, NULL]) = 0) FROM remote('127.0.0.{1,2}', numbers(2)) GROUP BY number ORDER BY number; +SELECT number, anyIf('Hello', arrayJoin([1, NULL, 0]) = 0) FROM remote('127.0.0.{1,2}', numbers(2)) GROUP BY number ORDER BY number; + +SELECT number, anyIf(toNullable('Hello'), arrayJoin([1, 1]) = 0) FROM remote('127.0.0.{1,2}', numbers(2)) GROUP BY number ORDER BY number; +SELECT number, anyIf(toNullable('Hello'), arrayJoin([1, 0]) = 0) FROM remote('127.0.0.{1,2}', numbers(2)) GROUP BY number ORDER BY number; diff --git a/tests/queries/0_stateless/02183_dictionary_date_types.reference b/tests/queries/0_stateless/02183_dictionary_date_types.reference new file mode 100644 index 00000000000..656217c9e74 --- /dev/null +++ b/tests/queries/0_stateless/02183_dictionary_date_types.reference @@ -0,0 +1,20 @@ +0 2019-05-05 2019-05-05 2019-05-05 00:00:00 2019-05-05 00:00:00.000 +Flat dictionary +0 2019-05-05 2019-05-05 2019-05-05 00:00:00 2019-05-05 00:00:00.000 +Hashed dictionary +0 2019-05-05 2019-05-05 2019-05-05 00:00:00 2019-05-05 00:00:00.000 +Hashed array dictionary +0 2019-05-05 2019-05-05 2019-05-05 00:00:00 2019-05-05 00:00:00.000 +Cache dictionary +2019-05-05 +0 2019-05-05 2019-05-05 2019-05-05 00:00:00 2019-05-05 00:00:00.000 +Direct dictionary +0 2019-05-05 2019-05-05 2019-05-05 00:00:00 2019-05-05 00:00:00.000 +127.0.0.1 2019-05-05 2019-05-05 2019-05-05 00:00:00 2019-05-05 00:00:00.000 +IPTrie dictionary +127.0.0.1/32 2019-05-05 2019-05-05 2019-05-05 00:00:00 2019-05-05 00:00:00.000 +Polygon dictionary +[[[(0,0),(0,1),(1,1),(1,0)]]] 2019-05-05 2019-05-05 2019-05-05 00:00:00 2019-05-05 00:00:00.000 +0 0 1 2019-05-05 2019-05-05 2019-05-05 00:00:00 2019-05-05 00:00:00.000 +Range dictionary +0 0 1 2019-05-05 2019-05-05 2019-05-05 00:00:00 2019-05-05 00:00:00.000 diff --git a/tests/queries/0_stateless/02183_dictionary_date_types.sql 
b/tests/queries/0_stateless/02183_dictionary_date_types.sql new file mode 100644 index 00000000000..e06863d5e53 --- /dev/null +++ b/tests/queries/0_stateless/02183_dictionary_date_types.sql @@ -0,0 +1,211 @@ +DROP TABLE IF EXISTS 02183_dictionary_source_table; +CREATE TABLE 02183_dictionary_source_table +( + id UInt64, + value_date Date, + value_date_32 Date32, + value_date_time DateTime, + value_date_time_64 DateTime64 +) ENGINE=TinyLog; + +INSERT INTO 02183_dictionary_source_table VALUES (0, '2019-05-05', '2019-05-05', '2019-05-05', '2019-05-05'); + +SELECT * FROM 02183_dictionary_source_table; + +DROP DICTIONARY IF EXISTS 02183_flat_dictionary; +CREATE DICTIONARY 02183_flat_dictionary +( + id UInt64, + value_date Date, + value_date_32 Date32, + value_date_time DateTime, + value_date_time_64 DateTime64 +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(TABLE '02183_dictionary_source_table')) +LIFETIME(0) +LAYOUT(FLAT()); + +SELECT 'Flat dictionary'; +SELECT * FROM 02183_flat_dictionary; + +DROP DICTIONARY 02183_flat_dictionary; + +DROP DICTIONARY IF EXISTS 02183_hashed_dictionary; +CREATE DICTIONARY 02183_hashed_dictionary +( + id UInt64, + value_date Date, + value_date_32 Date32, + value_date_time DateTime, + value_date_time_64 DateTime64 +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(TABLE '02183_dictionary_source_table')) +LIFETIME(0) +LAYOUT(HASHED()); + +SELECT 'Hashed dictionary'; +SELECT * FROM 02183_hashed_dictionary; + +DROP DICTIONARY 02183_hashed_dictionary; + +DROP DICTIONARY IF EXISTS 02183_hashed_array_dictionary; +CREATE DICTIONARY 02183_hashed_array_dictionary +( + id UInt64, + value_date Date, + value_date_32 Date32, + value_date_time DateTime, + value_date_time_64 DateTime64 +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(TABLE '02183_dictionary_source_table')) +LIFETIME(0) +LAYOUT(HASHED_ARRAY()); + +SELECT 'Hashed array dictionary'; +SELECT * FROM 02183_hashed_array_dictionary; + +DROP DICTIONARY 02183_hashed_array_dictionary; + +DROP DICTIONARY IF EXISTS 02183_cache_dictionary; +CREATE DICTIONARY 02183_cache_dictionary +( + id UInt64, + value_date Date, + value_date_32 Date32, + value_date_time DateTime, + value_date_time_64 DateTime64 +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(TABLE '02183_dictionary_source_table')) +LIFETIME(0) +LAYOUT(CACHE(SIZE_IN_CELLS 10)); + +SELECT 'Cache dictionary'; +SELECT dictGet('02183_cache_dictionary', 'value_date', 0); +SELECT * FROM 02183_cache_dictionary; + +DROP DICTIONARY 02183_cache_dictionary; + +DROP DICTIONARY IF EXISTS 02183_direct_dictionary; +CREATE DICTIONARY 02183_direct_dictionary +( + id UInt64, + value_date Date, + value_date_32 Date32, + value_date_time DateTime, + value_date_time_64 DateTime64 +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(TABLE '02183_dictionary_source_table')) +LAYOUT(DIRECT()); + +SELECT 'Direct dictionary'; +SELECT * FROM 02183_direct_dictionary; + +DROP DICTIONARY 02183_direct_dictionary; +DROP TABLE 02183_dictionary_source_table; + +DROP TABLE IF EXISTS 02183_ip_trie_dictionary_source_table; +CREATE TABLE 02183_ip_trie_dictionary_source_table +( + prefix String, + value_date Date, + value_date_32 Date32, + value_date_time DateTime, + value_date_time_64 DateTime64 +) ENGINE=TinyLog; + +INSERT INTO 02183_ip_trie_dictionary_source_table VALUES ('127.0.0.1', '2019-05-05', '2019-05-05', '2019-05-05', '2019-05-05'); +SELECT * FROM 02183_ip_trie_dictionary_source_table; + +DROP DICTIONARY IF EXISTS 02183_ip_trie_dictionary; +CREATE DICTIONARY 02183_ip_trie_dictionary +( + prefix String, + value_date Date, + value_date_32 Date32, + value_date_time 
DateTime, + value_date_time_64 DateTime64 +) +PRIMARY KEY prefix +SOURCE(CLICKHOUSE(TABLE '02183_ip_trie_dictionary_source_table')) +LAYOUT(IP_TRIE(access_to_key_from_attributes 1)) +LIFETIME(0); + +SELECT 'IPTrie dictionary'; +SELECT * FROM 02183_ip_trie_dictionary; + +DROP DICTIONARY 02183_ip_trie_dictionary; +DROP TABLE 02183_ip_trie_dictionary_source_table; + +DROP TABLE IF EXISTS 02183_polygon_dictionary_source_table; +CREATE TABLE 02183_polygon_dictionary_source_table +( + key Array(Array(Array(Tuple(Float64, Float64)))), + value_date Date, + value_date_32 Date32, + value_date_time DateTime, + value_date_time_64 DateTime64 +) ENGINE = TinyLog; + +INSERT INTO 02183_polygon_dictionary_source_table VALUES ([[[(0, 0), (0, 1), (1, 1), (1, 0)]]], '2019-05-05', '2019-05-05', '2019-05-05', '2019-05-05'); + +DROP DICTIONARY IF EXISTS 02183_polygon_dictionary; +CREATE DICTIONARY 02183_polygon_dictionary +( + key Array(Array(Array(Tuple(Float64, Float64)))), + value_date Date, + value_date_32 Date32, + value_date_time DateTime, + value_date_time_64 DateTime64 +) +PRIMARY KEY key +SOURCE(CLICKHOUSE(TABLE '02183_polygon_dictionary_source_table')) +LAYOUT(POLYGON(store_polygon_key_column 1)) +LIFETIME(0); + +SELECT 'Polygon dictionary'; +SELECT * FROM 02183_polygon_dictionary; + +DROP TABLE 02183_polygon_dictionary_source_table; +DROP DICTIONARY 02183_polygon_dictionary; + +DROP TABLE IF EXISTS 02183_range_dictionary_source_table; +CREATE TABLE 02183_range_dictionary_source_table +( + key UInt64, + start UInt64, + end UInt64, + value_date Date, + value_date_32 Date32, + value_date_time DateTime, + value_date_time_64 DateTime64 +) ENGINE = TinyLog; + +INSERT INTO 02183_range_dictionary_source_table VALUES(0, 0, 1, '2019-05-05', '2019-05-05', '2019-05-05', '2019-05-05'); +SELECT * FROM 02183_range_dictionary_source_table; + +CREATE DICTIONARY 02183_range_dictionary +( + key UInt64, + start UInt64, + end UInt64, + value_date Date, + value_date_32 Date32, + value_date_time DateTime, + value_date_time_64 DateTime64 +) +PRIMARY KEY key +SOURCE(CLICKHOUSE(TABLE '02183_range_dictionary_source_table')) +LAYOUT(RANGE_HASHED()) +RANGE(MIN start MAX end) +LIFETIME(0); + +SELECT 'Range dictionary'; +SELECT * FROM 02183_range_dictionary; + +DROP DICTIONARY 02183_range_dictionary; +DROP TABLE 02183_range_dictionary_source_table; diff --git a/tests/queries/0_stateless/02183_dictionary_no_attributes.reference b/tests/queries/0_stateless/02183_dictionary_no_attributes.reference new file mode 100644 index 00000000000..cbf3af697cd --- /dev/null +++ b/tests/queries/0_stateless/02183_dictionary_no_attributes.reference @@ -0,0 +1,44 @@ +0 +1 +FlatDictionary +1 +1 +0 +0 +1 +HashedDictionary +1 +1 +0 +0 +1 +HashedArrayDictionary +1 +1 +0 +0 +1 +CacheDictionary +1 +1 +0 +0 +1 +DirectDictionary +1 +1 +0 +0 +1 +IPTrieDictionary +1 +0 +127.0.0.0/32 +PolygonDictionary +1 +0 +[[[(0,0),(0,1),(1,1),(1,0)]]] +RangeHashedDictionary +0 0 1 +1 +0 diff --git a/tests/queries/0_stateless/02183_dictionary_no_attributes.sql b/tests/queries/0_stateless/02183_dictionary_no_attributes.sql new file mode 100644 index 00000000000..bd3d73594f8 --- /dev/null +++ b/tests/queries/0_stateless/02183_dictionary_no_attributes.sql @@ -0,0 +1,194 @@ +DROP TABLE IF EXISTS 02183_dictionary_test_table; +CREATE TABLE 02183_dictionary_test_table (id UInt64) ENGINE=TinyLog; +INSERT INTO 02183_dictionary_test_table VALUES (0), (1); + +SELECT * FROM 02183_dictionary_test_table; + +DROP DICTIONARY IF EXISTS 02183_flat_dictionary; +CREATE DICTIONARY 
02183_flat_dictionary +( + id UInt64 +) +PRIMARY KEY id +LAYOUT(FLAT()) +SOURCE(CLICKHOUSE(TABLE '02183_dictionary_test_table')) +LIFETIME(0); + +SELECT 'FlatDictionary'; + +SELECT dictGet('02183_flat_dictionary', 'value', 0); -- {serverError 36} +SELECT dictHas('02183_flat_dictionary', 0); +SELECT dictHas('02183_flat_dictionary', 1); +SELECT dictHas('02183_flat_dictionary', 2); + +SELECT * FROM 02183_flat_dictionary; + +DROP DICTIONARY 02183_flat_dictionary; + +DROP DICTIONARY IF EXISTS 02183_hashed_dictionary; +CREATE DICTIONARY 02183_hashed_dictionary +( + id UInt64 +) +PRIMARY KEY id +LAYOUT(HASHED()) +SOURCE(CLICKHOUSE(TABLE '02183_dictionary_test_table')) +LIFETIME(0); + +SELECT 'HashedDictionary'; + +SELECT dictHas('02183_hashed_dictionary', 0); +SELECT dictHas('02183_hashed_dictionary', 1); +SELECT dictHas('02183_hashed_dictionary', 2); + +SELECT * FROM 02183_hashed_dictionary; + +DROP DICTIONARY 02183_hashed_dictionary; + +DROP DICTIONARY IF EXISTS 02183_hashed_array_dictionary; +CREATE DICTIONARY 02183_hashed_array_dictionary +( + id UInt64 +) +PRIMARY KEY id +LAYOUT(HASHED_ARRAY()) +SOURCE(CLICKHOUSE(TABLE '02183_dictionary_test_table')) +LIFETIME(0); + +SELECT 'HashedArrayDictionary'; + +SELECT dictHas('02183_hashed_array_dictionary', 0); +SELECT dictHas('02183_hashed_array_dictionary', 1); +SELECT dictHas('02183_hashed_array_dictionary', 2); + +SELECT * FROM 02183_hashed_array_dictionary; + +DROP DICTIONARY 02183_hashed_array_dictionary; + +DROP DICTIONARY IF EXISTS 02183_cache_dictionary; +CREATE DICTIONARY 02183_cache_dictionary +( + id UInt64 +) +PRIMARY KEY id +LAYOUT(CACHE(SIZE_IN_CELLS 10)) +SOURCE(CLICKHOUSE(TABLE '02183_dictionary_test_table')) +LIFETIME(0); + +SELECT 'CacheDictionary'; + +SELECT dictHas('02183_cache_dictionary', 0); +SELECT dictHas('02183_cache_dictionary', 1); +SELECT dictHas('02183_cache_dictionary', 2); + +SELECT * FROM 02183_cache_dictionary; + +DROP DICTIONARY 02183_cache_dictionary; + +DROP DICTIONARY IF EXISTS 02183_direct_dictionary; +CREATE DICTIONARY 02183_direct_dictionary +( + id UInt64 +) +PRIMARY KEY id +LAYOUT(HASHED()) +SOURCE(CLICKHOUSE(TABLE '02183_dictionary_test_table')) +LIFETIME(0); + +SELECT 'DirectDictionary'; + +SELECT dictHas('02183_direct_dictionary', 0); +SELECT dictHas('02183_direct_dictionary', 1); +SELECT dictHas('02183_direct_dictionary', 2); + +SELECT * FROM 02183_direct_dictionary; + +DROP DICTIONARY 02183_direct_dictionary; + +DROP TABLE 02183_dictionary_test_table; + +DROP TABLE IF EXISTS ip_trie_dictionary_source_table; +CREATE TABLE ip_trie_dictionary_source_table +( + prefix String +) ENGINE = TinyLog; + +INSERT INTO ip_trie_dictionary_source_table VALUES ('127.0.0.0'); + +DROP DICTIONARY IF EXISTS 02183_ip_trie_dictionary; +CREATE DICTIONARY 02183_ip_trie_dictionary +( + prefix String +) +PRIMARY KEY prefix +SOURCE(CLICKHOUSE(TABLE 'ip_trie_dictionary_source_table')) +LAYOUT(IP_TRIE()) +LIFETIME(0); + +SELECT 'IPTrieDictionary'; + +SELECT dictHas('02183_ip_trie_dictionary', tuple(IPv4StringToNum('127.0.0.0'))); +SELECT dictHas('02183_ip_trie_dictionary', tuple(IPv4StringToNum('127.0.0.1'))); +SELECT * FROM 02183_ip_trie_dictionary; + +DROP DICTIONARY 02183_ip_trie_dictionary; +DROP TABLE ip_trie_dictionary_source_table; + +DROP TABLE IF EXISTS 02183_polygon_dictionary_source_table; +CREATE TABLE 02183_polygon_dictionary_source_table +( + key Array(Array(Array(Tuple(Float64, Float64)))) +) ENGINE = TinyLog; + +INSERT INTO 02183_polygon_dictionary_source_table VALUES ([[[(0, 0), (0, 1), (1, 1), (1, 0)]]]); + +DROP 
DICTIONARY IF EXISTS 02183_polygon_dictionary; +CREATE DICTIONARY 02183_polygon_dictionary +( + key Array(Array(Array(Tuple(Float64, Float64)))) +) +PRIMARY KEY key +SOURCE(CLICKHOUSE(TABLE '02183_polygon_dictionary_source_table')) +LAYOUT(POLYGON(store_polygon_key_column 1)) +LIFETIME(0); + +SELECT 'PolygonDictionary'; + +SELECT dictHas('02183_polygon_dictionary', tuple(0.5, 0.5)); +SELECT dictHas('02183_polygon_dictionary', tuple(1.5, 1.5)); +SELECT * FROM 02183_polygon_dictionary; + +DROP DICTIONARY 02183_polygon_dictionary; +DROP TABLE 02183_polygon_dictionary_source_table; + +DROP TABLE IF EXISTS 02183_range_dictionary_source_table; +CREATE TABLE 02183_range_dictionary_source_table +( + key UInt64, + start UInt64, + end UInt64 +) +ENGINE = TinyLog; + +INSERT INTO 02183_range_dictionary_source_table VALUES(0, 0, 1); + +DROP DICTIONARY IF EXISTS 02183_range_dictionary; +CREATE DICTIONARY 02183_range_dictionary +( + key UInt64, + start UInt64, + end UInt64 +) +PRIMARY KEY key +SOURCE(CLICKHOUSE(TABLE '02183_range_dictionary_source_table')) +LAYOUT(RANGE_HASHED()) +RANGE(MIN start MAX end) +LIFETIME(0); + +SELECT 'RangeHashedDictionary'; +SELECT * FROM 02183_range_dictionary; +SELECT dictHas('02183_range_dictionary', 0, 0); +SELECT dictHas('02183_range_dictionary', 0, 2); + +DROP DICTIONARY 02183_range_dictionary; +DROP TABLE 02183_range_dictionary_source_table; diff --git a/tests/queries/0_stateless/02184_ipv6_parsing.reference b/tests/queries/0_stateless/02184_ipv6_parsing.reference new file mode 100644 index 00000000000..fd647e0b2a5 --- /dev/null +++ b/tests/queries/0_stateless/02184_ipv6_parsing.reference @@ -0,0 +1,2 @@ +2001:db9:85a3::8a2e:370:7334 +2001:db8:85a3::8a2e:370:7334 diff --git a/tests/queries/0_stateless/02184_ipv6_parsing.sh b/tests/queries/0_stateless/02184_ipv6_parsing.sh new file mode 100755 index 00000000000..986c1283002 --- /dev/null +++ b/tests/queries/0_stateless/02184_ipv6_parsing.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + + +$CLICKHOUSE_CLIENT -q "select toString(toIPv6('2001:db9:85a3::8a2e:370:7334'))" +$CLICKHOUSE_CLIENT --param_var 2001:db8:85a3::8a2e:370:7334 -q "select {var:IPv6}" + diff --git a/tests/queries/0_stateless/02184_nested_tuple.reference b/tests/queries/0_stateless/02184_nested_tuple.reference new file mode 100644 index 00000000000..b435e2f28a6 --- /dev/null +++ b/tests/queries/0_stateless/02184_nested_tuple.reference @@ -0,0 +1,7 @@ +{"endUserIDs":{"_experience":{"aaid":{"id":"id_1","namespace":{"code":"code_1"},"primary":1},"mcid":{"id":"id_2","namespace":{"code":"code_2"},"primary":2}}}} +{"endUserIDs._experience":{"aaid":{"id":"id_1","namespace":{"code":"code_1"},"primary":1},"mcid":{"id":"id_2","namespace":{"code":"code_2"},"primary":2}}} +{"endUserIDs._experience.aaid":{"id":"id_1","namespace":{"code":"code_1"},"primary":1}} +{"endUserIDs._experience.aaid.id":"id_1"} +{"endUserIDs._experience.aaid.namespace":{"code":"code_1"}} +{"endUserIDs._experience.aaid.namespace.code":"code_1"} +{"endUserIDs._experience.aaid.primary":1} diff --git a/tests/queries/0_stateless/02184_nested_tuple.sql b/tests/queries/0_stateless/02184_nested_tuple.sql new file mode 100644 index 00000000000..67a20e3dce1 --- /dev/null +++ b/tests/queries/0_stateless/02184_nested_tuple.sql @@ -0,0 +1,38 @@ +DROP TABLE IF EXISTS t_nested_tuple; + +CREATE TABLE t_nested_tuple +( + endUserIDs Tuple( + _experience Tuple( + aaid Tuple( + id Nullable(String), + namespace Tuple( + code LowCardinality(Nullable(String)) + ), + primary LowCardinality(Nullable(UInt8)) + ), + mcid Tuple( + id Nullable(String), + namespace Tuple( + code LowCardinality(Nullable(String)) + ), + primary LowCardinality(Nullable(UInt8)) + ) + ) + ) +) +ENGINE = MergeTree ORDER BY tuple(); + +SET output_format_json_named_tuples_as_objects = 1; + +INSERT INTO t_nested_tuple FORMAT JSONEachRow {"endUserIDs":{"_experience":{"aaid":{"id":"id_1","namespace":{"code":"code_1"},"primary":1},"mcid":{"id":"id_2","namespace":{"code":"code_2"},"primary":2}}}}; + +SELECT * FROM t_nested_tuple FORMAT JSONEachRow; +SELECT endUserIDs._experience FROM t_nested_tuple FORMAT JSONEachRow; +SELECT endUserIDs._experience.aaid FROM t_nested_tuple FORMAT JSONEachRow; +SELECT endUserIDs._experience.aaid.id FROM t_nested_tuple FORMAT JSONEachRow; +SELECT endUserIDs._experience.aaid.namespace FROM t_nested_tuple FORMAT JSONEachRow; +SELECT endUserIDs._experience.aaid.namespace.code FROM t_nested_tuple FORMAT JSONEachRow; +SELECT endUserIDs._experience.aaid.primary FROM t_nested_tuple FORMAT JSONEachRow; + +DROP TABLE t_nested_tuple; diff --git a/tests/queries/0_stateless/02184_range_hashed_dictionary_outside_range_values.reference b/tests/queries/0_stateless/02184_range_hashed_dictionary_outside_range_values.reference new file mode 100644 index 00000000000..9b43f375e11 --- /dev/null +++ b/tests/queries/0_stateless/02184_range_hashed_dictionary_outside_range_values.reference @@ -0,0 +1,3 @@ +1 0 18446744073709551615 value0 value1 value2 +('value0','value1','value2') +1 diff --git a/tests/queries/0_stateless/02184_range_hashed_dictionary_outside_range_values.sql b/tests/queries/0_stateless/02184_range_hashed_dictionary_outside_range_values.sql new file mode 100644 index 00000000000..6e892d9d246 --- /dev/null +++ b/tests/queries/0_stateless/02184_range_hashed_dictionary_outside_range_values.sql @@ -0,0 +1,36 @@ +DROP TABLE IF EXISTS 02184_range_dictionary_source_table; +CREATE TABLE 02184_range_dictionary_source_table +( + id UInt64, + start UInt64, + 
end UInt64, + value_0 String, + value_1 String, + value_2 String +) +ENGINE = TinyLog; + +INSERT INTO 02184_range_dictionary_source_table VALUES (1, 0, 18446744073709551615, 'value0', 'value1', 'value2'); + +DROP DICTIONARY IF EXISTS 02184_range_dictionary; +CREATE DICTIONARY 02184_range_dictionary +( + id UInt64, + start UInt64, + end UInt64, + value_0 String, + value_1 String, + value_2 String +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(TABLE '02184_range_dictionary_source_table')) +LAYOUT(RANGE_HASHED()) +RANGE(MIN start MAX end) +LIFETIME(0); + +SELECT * FROM 02184_range_dictionary; +SELECT dictGet('02184_range_dictionary', ('value_0', 'value_1', 'value_2'), 1, 18446744073709551615); +SELECT dictHas('02184_range_dictionary', 1, 18446744073709551615); + +DROP DICTIONARY 02184_range_dictionary; +DROP TABLE 02184_range_dictionary_source_table; diff --git a/tests/queries/0_stateless/02185_orc_corrupted_file.reference b/tests/queries/0_stateless/02185_orc_corrupted_file.reference new file mode 100644 index 00000000000..d86bac9de59 --- /dev/null +++ b/tests/queries/0_stateless/02185_orc_corrupted_file.reference @@ -0,0 +1 @@ +OK diff --git a/tests/queries/0_stateless/02185_orc_corrupted_file.sh b/tests/queries/0_stateless/02185_orc_corrupted_file.sh new file mode 100755 index 00000000000..7d7a714cccc --- /dev/null +++ b/tests/queries/0_stateless/02185_orc_corrupted_file.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +cp $CUR_DIR/data_orc/corrupted.orc $USER_FILES_PATH/ + +${CLICKHOUSE_CLIENT} --query="select * from file('corrupted.orc')" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL' + diff --git a/tests/queries/0_stateless/02185_range_hashed_dictionary_open_ranges.reference b/tests/queries/0_stateless/02185_range_hashed_dictionary_open_ranges.reference new file mode 100644 index 00000000000..f8dc47039e9 --- /dev/null +++ b/tests/queries/0_stateless/02185_range_hashed_dictionary_open_ranges.reference @@ -0,0 +1,22 @@ +Source table +0 \N 5000 Value0 +0 5001 10000 Value1 +0 10001 \N Value2 +Dictionary convert_null_range_bound_to_open = 1 +0 5001 10000 Value1 +0 0 5000 Value0 +0 10001 18446744073709551615 Value2 +Value0 +Value1 +Value2 +1 +1 +1 +Dictionary convert_null_range_bound_to_open = 0 +0 5001 10000 Value1 +DefaultValue +Value1 +DefaultValue +0 +1 +0 diff --git a/tests/queries/0_stateless/02185_range_hashed_dictionary_open_ranges.sql b/tests/queries/0_stateless/02185_range_hashed_dictionary_open_ranges.sql new file mode 100644 index 00000000000..e6edee2ea18 --- /dev/null +++ b/tests/queries/0_stateless/02185_range_hashed_dictionary_open_ranges.sql @@ -0,0 +1,63 @@ +DROP TABLE IF EXISTS 02185_range_dictionary_source_table; +CREATE TABLE 02185_range_dictionary_source_table +( + id UInt64, + start Nullable(UInt64), + end Nullable(UInt64), + value String +) +ENGINE = TinyLog; + +INSERT INTO 02185_range_dictionary_source_table VALUES (0, NULL, 5000, 'Value0'), (0, 5001, 10000, 'Value1'), (0, 10001, NULL, 'Value2'); + +SELECT 'Source table'; +SELECT * FROM 02185_range_dictionary_source_table; + +DROP DICTIONARY IF EXISTS 02185_range_dictionary; +CREATE DICTIONARY 02185_range_dictionary +( + id UInt64, + start UInt64, + end UInt64, + value 
String DEFAULT 'DefaultValue' +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(TABLE '02185_range_dictionary_source_table')) +LAYOUT(RANGE_HASHED(convert_null_range_bound_to_open 1)) +RANGE(MIN start MAX end) +LIFETIME(0); + +SELECT 'Dictionary convert_null_range_bound_to_open = 1'; +SELECT * FROM 02185_range_dictionary; +SELECT dictGet('02185_range_dictionary', 'value', 0, 0); +SELECT dictGet('02185_range_dictionary', 'value', 0, 5001); +SELECT dictGet('02185_range_dictionary', 'value', 0, 10001); +SELECT dictHas('02185_range_dictionary', 0, 0); +SELECT dictHas('02185_range_dictionary', 0, 5001); +SELECT dictHas('02185_range_dictionary', 0, 10001); + +DROP DICTIONARY 02185_range_dictionary; + +CREATE DICTIONARY 02185_range_dictionary +( + id UInt64, + start UInt64, + end UInt64, + value String DEFAULT 'DefaultValue' +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(TABLE '02185_range_dictionary_source_table')) +LAYOUT(RANGE_HASHED(convert_null_range_bound_to_open 0)) +RANGE(MIN start MAX end) +LIFETIME(0); + +SELECT 'Dictionary convert_null_range_bound_to_open = 0'; +SELECT * FROM 02185_range_dictionary; +SELECT dictGet('02185_range_dictionary', 'value', 0, 0); +SELECT dictGet('02185_range_dictionary', 'value', 0, 5001); +SELECT dictGet('02185_range_dictionary', 'value', 0, 10001); +SELECT dictHas('02185_range_dictionary', 0, 0); +SELECT dictHas('02185_range_dictionary', 0, 5001); +SELECT dictHas('02185_range_dictionary', 0, 10001); + +DROP TABLE 02185_range_dictionary_source_table; diff --git a/tests/queries/0_stateless/02186_range_hashed_dictionary_intersecting_intervals.reference b/tests/queries/0_stateless/02186_range_hashed_dictionary_intersecting_intervals.reference new file mode 100644 index 00000000000..64994150f59 --- /dev/null +++ b/tests/queries/0_stateless/02186_range_hashed_dictionary_intersecting_intervals.reference @@ -0,0 +1,18 @@ +Source table +1 2020-01-01 2100-01-01 Value0 +1 2020-01-02 2100-01-01 Value1 +1 2020-01-03 2100-01-01 Value2 +Dictionary .range_lookup_strategy = min +1 2020-01-01 2100-01-01 Value0 +1 2020-01-02 2100-01-01 Value1 +1 2020-01-03 2100-01-01 Value2 +Value0 +Value0 +Value0 +Dictionary .range_lookup_strategy = max +1 2020-01-01 2100-01-01 Value0 +1 2020-01-02 2100-01-01 Value1 +1 2020-01-03 2100-01-01 Value2 +Value0 +Value1 +Value2 diff --git a/tests/queries/0_stateless/02186_range_hashed_dictionary_intersecting_intervals.sql b/tests/queries/0_stateless/02186_range_hashed_dictionary_intersecting_intervals.sql new file mode 100644 index 00000000000..caafc5e76a2 --- /dev/null +++ b/tests/queries/0_stateless/02186_range_hashed_dictionary_intersecting_intervals.sql @@ -0,0 +1,64 @@ +DROP TABLE IF EXISTS 02186_range_dictionary_source_table; +CREATE TABLE 02186_range_dictionary_source_table +( + id UInt64, + start Date, + end Date, + value String +) +Engine = TinyLog; + +INSERT INTO 02186_range_dictionary_source_table VALUES (1, '2020-01-01', '2100-01-01', 'Value0'); +INSERT INTO 02186_range_dictionary_source_table VALUES (1, '2020-01-02', '2100-01-01', 'Value1'); +INSERT INTO 02186_range_dictionary_source_table VALUES (1, '2020-01-03', '2100-01-01', 'Value2'); + +SELECT 'Source table'; +SELECT * FROM 02186_range_dictionary_source_table; + +DROP DICTIONARY IF EXISTS 02186_range_dictionary; +CREATE DICTIONARY 02186_range_dictionary +( + id UInt64, + start Date, + end Date, + value String +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(TABLE '02186_range_dictionary_source_table')) +LAYOUT(RANGE_HASHED(range_lookup_strategy 'min')) +RANGE(MIN start MAX end) +LIFETIME(0); + +SELECT 
'Dictionary .range_lookup_strategy = min'; + +SELECT * FROM 02186_range_dictionary; + +select dictGet('02186_range_dictionary', 'value', toUInt64(1), toDate('2020-01-01')); +select dictGet('02186_range_dictionary', 'value', toUInt64(1), toDate('2020-01-02')); +select dictGet('02186_range_dictionary', 'value', toUInt64(1), toDate('2020-01-03')); + +DROP DICTIONARY 02186_range_dictionary; + +CREATE DICTIONARY 02186_range_dictionary +( + id UInt64, + start Date, + end Date, + value String +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(TABLE '02186_range_dictionary_source_table')) +LAYOUT(RANGE_HASHED(range_lookup_strategy 'max')) +RANGE(MIN start MAX end) +LIFETIME(0); + +SELECT 'Dictionary .range_lookup_strategy = max'; + +SELECT * FROM 02186_range_dictionary; + +select dictGet('02186_range_dictionary', 'value', toUInt64(1), toDate('2020-01-01')); +select dictGet('02186_range_dictionary', 'value', toUInt64(1), toDate('2020-01-02')); +select dictGet('02186_range_dictionary', 'value', toUInt64(1), toDate('2020-01-03')); + +DROP DICTIONARY 02186_range_dictionary; +DROP TABLE 02186_range_dictionary_source_table; diff --git a/tests/queries/0_stateless/02187_async_inserts_all_formats.python b/tests/queries/0_stateless/02187_async_inserts_all_formats.python new file mode 100644 index 00000000000..0a909451259 --- /dev/null +++ b/tests/queries/0_stateless/02187_async_inserts_all_formats.python @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 +import os +import sys + +CURDIR = os.path.dirname(os.path.realpath(__file__)) +sys.path.insert(0, os.path.join(CURDIR, 'helpers')) + +CLICKHOUSE_URL = os.environ.get('CLICKHOUSE_URL') +CLICKHOUSE_TMP = os.environ.get('CLICKHOUSE_TMP') + +from pure_http_client import ClickHouseClient + +client = ClickHouseClient() + +def run_test(data_format, gen_data_template, settings): + print(data_format) + client.query("TRUNCATE TABLE t_async_insert") + + expected = client.query(gen_data_template.format("TSV")).strip() + data = client.query(gen_data_template.format(data_format), settings=settings,binary_result=True) + + insert_query = "INSERT INTO t_async_insert FORMAT {}".format(data_format) + client.query_with_data(insert_query, data, settings=settings) + + result = client.query("SELECT * FROM t_async_insert FORMAT TSV").strip() + if result != expected: + print("Failed for format {}.\nExpected:\n{}\nGot:\n{}\n".format(data_format, expected, result)) + exit(1) + +formats = client.query("SELECT name FROM system.formats WHERE is_input AND is_output \ + AND name NOT IN ('CapnProto', 'RawBLOB', 'Template', 'ProtobufSingle', 'LineAsString', 'Protobuf') ORDER BY name").strip().split('\n') + +# Generic formats +client.query("DROP TABLE IF EXISTS t_async_insert") +client.query("CREATE TABLE t_async_insert (id UInt64, s String, arr Array(UInt64)) ENGINE = Memory") +gen_data_query = "SELECT number AS id, toString(number) AS s, range(number) AS arr FROM numbers(10) FORMAT {}" + +for data_format in formats: + run_test(data_format, gen_data_query, settings={"async_insert": 1, "wait_for_async_insert": 1}) + +# LineAsString +client.query("DROP TABLE IF EXISTS t_async_insert") +client.query("CREATE TABLE t_async_insert (s String) ENGINE = Memory") +gen_data_query = "SELECT toString(number) AS s FROM numbers(10) FORMAT {}" + +run_test('LineAsString', gen_data_query, settings={"async_insert": 1, "wait_for_async_insert": 1}) + +# TODO: add CapnProto and Protobuf + +print("OK") diff --git a/tests/queries/0_stateless/02187_async_inserts_all_formats.reference 
b/tests/queries/0_stateless/02187_async_inserts_all_formats.reference new file mode 100644 index 00000000000..b4a5b6c3a42 --- /dev/null +++ b/tests/queries/0_stateless/02187_async_inserts_all_formats.reference @@ -0,0 +1,40 @@ +Arrow +ArrowStream +Avro +CSV +CSVWithNames +CSVWithNamesAndTypes +CustomSeparated +CustomSeparatedWithNames +CustomSeparatedWithNamesAndTypes +JSONCompactEachRow +JSONCompactEachRowWithNames +JSONCompactEachRowWithNamesAndTypes +JSONCompactStringsEachRow +JSONCompactStringsEachRowWithNames +JSONCompactStringsEachRowWithNamesAndTypes +JSONEachRow +JSONStringsEachRow +MsgPack +Native +ORC +Parquet +RowBinary +RowBinaryWithNames +RowBinaryWithNamesAndTypes +TSKV +TSV +TSVRaw +TSVRawWithNames +TSVRawWithNamesAndTypes +TSVWithNames +TSVWithNamesAndTypes +TabSeparated +TabSeparatedRaw +TabSeparatedRawWithNames +TabSeparatedRawWithNamesAndTypes +TabSeparatedWithNames +TabSeparatedWithNamesAndTypes +Values +LineAsString +OK diff --git a/tests/queries/0_stateless/02187_async_inserts_all_formats.sh b/tests/queries/0_stateless/02187_async_inserts_all_formats.sh new file mode 100755 index 00000000000..4b0b8d84c58 --- /dev/null +++ b/tests/queries/0_stateless/02187_async_inserts_all_formats.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +# Tags: no-fasttest, long + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +# We should have correct env vars from shell_config.sh to run this test +python3 "$CURDIR"/02187_async_inserts_all_formats.python diff --git a/tests/queries/0_stateless/02187_test_final_and_limit_modifier.reference b/tests/queries/0_stateless/02187_test_final_and_limit_modifier.reference new file mode 100644 index 00000000000..56bbfcf090c --- /dev/null +++ b/tests/queries/0_stateless/02187_test_final_and_limit_modifier.reference @@ -0,0 +1,2 @@ +something 1 +something 1 diff --git a/tests/queries/0_stateless/02187_test_final_and_limit_modifier.sql b/tests/queries/0_stateless/02187_test_final_and_limit_modifier.sql new file mode 100644 index 00000000000..7c4ae936865 --- /dev/null +++ b/tests/queries/0_stateless/02187_test_final_and_limit_modifier.sql @@ -0,0 +1,15 @@ +DROP TABLE IF EXISTS test_02187; +CREATE TABLE test_02187 ( + info String, + id Int32 +) +ENGINE = ReplacingMergeTree(id) +ORDER BY id; + +INSERT INTO TABLE test_02187 VALUES ('nothing', 1); +INSERT INTO TABLE test_02187 VALUES ('something', 1); + +SELECT * FROM test_02187 FINAL; +SELECT * FROM test_02187 FINAL LIMIT 1; + + diff --git a/tests/queries/0_stateless/02188_parser_dictionary_primary_key.reference b/tests/queries/0_stateless/02188_parser_dictionary_primary_key.reference new file mode 100644 index 00000000000..0e4e614d264 --- /dev/null +++ b/tests/queries/0_stateless/02188_parser_dictionary_primary_key.reference @@ -0,0 +1,8 @@ +Dictionary output +0 Value +Dictionary output +0 Value +Dictionary output +0 Value +Dictionary output +0 Value diff --git a/tests/queries/0_stateless/02188_parser_dictionary_primary_key.sql b/tests/queries/0_stateless/02188_parser_dictionary_primary_key.sql new file mode 100644 index 00000000000..a939c30b57b --- /dev/null +++ b/tests/queries/0_stateless/02188_parser_dictionary_primary_key.sql @@ -0,0 +1,65 @@ +DROP TABLE IF EXISTS 02188_test_dictionary_source; +CREATE TABLE 02188_test_dictionary_source +( + id UInt64, + value String +) +ENGINE=TinyLog; + +INSERT INTO 02188_test_dictionary_source VALUES (0, 'Value'); + +DROP DICTIONARY IF EXISTS 02188_test_dictionary_simple_primary_key; +CREATE 
DICTIONARY 02188_test_dictionary_simple_primary_key +( + id UInt64, + value String +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(TABLE '02188_test_dictionary_source')) +LAYOUT(DIRECT()); + +SELECT 'Dictionary output'; +SELECT * FROM 02188_test_dictionary_simple_primary_key; +DROP DICTIONARY 02188_test_dictionary_simple_primary_key; + +CREATE DICTIONARY 02188_test_dictionary_simple_primary_key +( + id UInt64, + value String +) +PRIMARY KEY (id) +SOURCE(CLICKHOUSE(TABLE '02188_test_dictionary_source')) +LAYOUT(DIRECT()); + +SELECT 'Dictionary output'; +SELECT * FROM 02188_test_dictionary_simple_primary_key; +DROP DICTIONARY 02188_test_dictionary_simple_primary_key; + +DROP DICTIONARY IF EXISTS 02188_test_dictionary_complex_primary_key; +CREATE DICTIONARY 02188_test_dictionary_complex_primary_key +( + id UInt64, + value String +) +PRIMARY KEY id, value +SOURCE(CLICKHOUSE(TABLE '02188_test_dictionary_source')) +LAYOUT(COMPLEX_KEY_DIRECT()); + +SELECT 'Dictionary output'; +SELECT * FROM 02188_test_dictionary_complex_primary_key; +DROP DICTIONARY 02188_test_dictionary_complex_primary_key; + +CREATE DICTIONARY 02188_test_dictionary_complex_primary_key +( + id UInt64, + value String +) +PRIMARY KEY (id, value) +SOURCE(CLICKHOUSE(TABLE '02188_test_dictionary_source')) +LAYOUT(COMPLEX_KEY_DIRECT()); + +SELECT 'Dictionary output'; +SELECT * FROM 02188_test_dictionary_complex_primary_key; +DROP DICTIONARY 02188_test_dictionary_complex_primary_key; + +DROP TABLE 02188_test_dictionary_source; diff --git a/tests/queries/0_stateless/data_orc/corrupted.orc b/tests/queries/0_stateless/data_orc/corrupted.orc new file mode 100644 index 00000000000..08f7ab951f9 Binary files /dev/null and b/tests/queries/0_stateless/data_orc/corrupted.orc differ diff --git a/tests/queries/0_stateless/helpers/pure_http_client.py b/tests/queries/0_stateless/helpers/pure_http_client.py index 9f79c4ac529..3335f141bb5 100644 --- a/tests/queries/0_stateless/helpers/pure_http_client.py +++ b/tests/queries/0_stateless/helpers/pure_http_client.py @@ -14,22 +14,23 @@ class ClickHouseClient: def __init__(self, host = CLICKHOUSE_SERVER_URL_STR): self.host = host - def query(self, query, connection_timeout = 1500): + def query(self, query, connection_timeout=1500, settings=dict(), binary_result=False): NUMBER_OF_TRIES = 30 DELAY = 10 + params = { + 'timeout_before_checking_execution_speed': 120, + 'max_execution_time': 6000, + 'database': CLICKHOUSE_DATABASE, + } + + # Add extra settings to params + params = {**params, **settings} + for i in range(NUMBER_OF_TRIES): - r = requests.post( - self.host, - params = { - 'timeout_before_checking_execution_speed': 120, - 'max_execution_time': 6000, - 'database': CLICKHOUSE_DATABASE - }, - timeout = connection_timeout, - data = query) + r = requests.post(self.host, params=params, timeout=connection_timeout, data=query) if r.status_code == 200: - return r.text + return r.content if binary_result else r.text else: print('ATTENTION: try #%d failed' % i) if i != (NUMBER_OF_TRIES-1): @@ -44,9 +45,22 @@ class ClickHouseClient: df = pd.read_csv(io.StringIO(data), sep = '\t') return df - def query_with_data(self, query, content): - content = content.encode('utf-8') - r = requests.post(self.host, data=content) + def query_with_data(self, query, data, connection_timeout=1500, settings=dict()): + params = { + 'query': query, + 'timeout_before_checking_execution_speed': 120, + 'max_execution_time': 6000, + 'database': CLICKHOUSE_DATABASE, + } + + headers = { + "Content-Type": "application/binary" + } + + # Add extra 
settings to params + params = {**params, **settings} + + r = requests.post(self.host, params=params, timeout=connection_timeout, data=data, headers=headers) result = r.text if r.status_code == 200: return result diff --git a/tests/testflows/kerberos/__init__.py b/tests/testflows/kerberos/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/testflows/kerberos/kerberos_env/clickhouse-service.yml b/tests/testflows/kerberos/kerberos_env/clickhouse-service.yml index 9f30ca3039a..45b975db00d 100644 --- a/tests/testflows/kerberos/kerberos_env/clickhouse-service.yml +++ b/tests/testflows/kerberos/kerberos_env/clickhouse-service.yml @@ -2,7 +2,7 @@ version: '2.3' services: clickhouse: - image: clickhouse/integration-test:21454 + image: clickhouse/integration-test expose: - "9000" - "9009" diff --git a/tests/testflows/ldap/authentication/regression.py b/tests/testflows/ldap/authentication/regression.py index c0c4495a9b3..177f486e18a 100755 --- a/tests/testflows/ldap/authentication/regression.py +++ b/tests/testflows/ldap/authentication/regression.py @@ -40,7 +40,6 @@ xfails = { def regression(self, local, clickhouse_binary_path, stress=None, parallel=None): """ClickHouse integration with LDAP regression module. """ - top().terminating = False nodes = { "clickhouse": ("clickhouse1", "clickhouse2", "clickhouse3"), } diff --git a/tests/testflows/ldap/authentication/tests/authentications.py b/tests/testflows/ldap/authentication/tests/authentications.py index 13bd225058a..1902e0bc2cb 100644 --- a/tests/testflows/ldap/authentication/tests/authentications.py +++ b/tests/testflows/ldap/authentication/tests/authentications.py @@ -2,7 +2,7 @@ import random import time -from helpers.common import Pool, join +from helpers.common import Pool from testflows.core import * from testflows.asserts import error from ldap.authentication.tests.common import * @@ -107,13 +107,14 @@ def parallel_login(self, server, user_count=10, timeout=300, rbac=False): with Pool(4) as pool: try: for i in range(5): - tasks.append(pool.apply_async(login_with_valid_username_and_password, (users, i, 50,))) - tasks.append(pool.apply_async(login_with_valid_username_and_invalid_password, (users, i, 50,))) - tasks.append(pool.apply_async(login_with_invalid_username_and_valid_password, (users, i, 50,))) + tasks.append(pool.submit(login_with_valid_username_and_password, (users, i, 50,))) + tasks.append(pool.submit(login_with_valid_username_and_invalid_password, (users, i, 50,))) + tasks.append(pool.submit(login_with_invalid_username_and_valid_password, (users, i, 50,))) finally: with Then("it should work"): - join(tasks, timeout=timeout) - + for task in tasks: + task.result(timeout=timeout) + @TestScenario @Requirements( RQ_SRS_007_LDAP_Authentication_Invalid("1.0"), diff --git a/tests/testflows/ldap/authentication/tests/server_config.py b/tests/testflows/ldap/authentication/tests/server_config.py index 87835ddc7d3..5e0e145d035 100644 --- a/tests/testflows/ldap/authentication/tests/server_config.py +++ b/tests/testflows/ldap/authentication/tests/server_config.py @@ -234,9 +234,7 @@ def invalid_verification_cooldown_value(self, invalid_value, timeout=300): verification cooldown parameter is invalid. """ - error_message = (" Access(user directories): Could not parse LDAP server" - " \\`openldap1\\`: Poco::Exception. 
Code: 1000, e.code() = 0," - f" e.displayText() = Syntax error: Not a valid unsigned integer{': ' + invalid_value if invalid_value else invalid_value}") + error_message = f" Syntax error: Not a valid unsigned integer{': ' + invalid_value if invalid_value else invalid_value}" with Given("LDAP server configuration that uses a negative integer for the verification_cooldown parameter"): servers = {"openldap1": {"host": "openldap1", "port": "389", "enable_tls": "no", diff --git a/tests/testflows/ldap/external_user_directory/regression.py b/tests/testflows/ldap/external_user_directory/regression.py index aea3eeb2f57..bae019ceae7 100755 --- a/tests/testflows/ldap/external_user_directory/regression.py +++ b/tests/testflows/ldap/external_user_directory/regression.py @@ -40,7 +40,6 @@ xfails = { def regression(self, local, clickhouse_binary_path, stress=None, parallel=None): """ClickHouse LDAP external user directory regression module. """ - top().terminating = False nodes = { "clickhouse": ("clickhouse1", "clickhouse2", "clickhouse3"), } diff --git a/tests/testflows/ldap/external_user_directory/tests/authentications.py b/tests/testflows/ldap/external_user_directory/tests/authentications.py index 34e5d11835c..830fe01501b 100644 --- a/tests/testflows/ldap/external_user_directory/tests/authentications.py +++ b/tests/testflows/ldap/external_user_directory/tests/authentications.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import random -from helpers.common import Pool, join +from helpers.common import Pool from testflows.core import * from testflows.asserts import error @@ -102,13 +102,14 @@ def parallel_login(self, server, user_count=10, timeout=300): * with valid username and invalid password """): for i in range(10): - tasks.append(pool.apply_async(login_with_valid_username_and_password, (users, i, 50,))) - tasks.append(pool.apply_async(login_with_valid_username_and_invalid_password, (users, i, 50,))) - tasks.append(pool.apply_async(login_with_invalid_username_and_valid_password, (users, i, 50,))) + tasks.append(pool.submit(login_with_valid_username_and_password, (users, i, 50,))) + tasks.append(pool.submit(login_with_valid_username_and_invalid_password, (users, i, 50,))) + tasks.append(pool.submit(login_with_invalid_username_and_valid_password, (users, i, 50,))) finally: with Then("it should work"): - join(tasks, timeout) + for task in tasks: + task.result(timeout=timeout) @TestScenario @Requirements( @@ -135,12 +136,13 @@ def parallel_login_with_the_same_user(self, server, timeout=300): * with valid username and invalid password """): for i in range(10): - tasks.append(pool.apply_async(login_with_valid_username_and_password, (users, i, 50,))) - tasks.append(pool.apply_async(login_with_valid_username_and_invalid_password, (users, i, 50,))) - tasks.append(pool.apply_async(login_with_invalid_username_and_valid_password, (users, i, 50,))) + tasks.append(pool.submit(login_with_valid_username_and_password, (users, i, 50,))) + tasks.append(pool.submit(login_with_valid_username_and_invalid_password, (users, i, 50,))) + tasks.append(pool.submit(login_with_invalid_username_and_valid_password, (users, i, 50,))) finally: with Then("it should work"): - join(tasks, timeout) + for task in tasks: + task.result(timeout=timeout) @TestScenario @Tags("custom config") @@ -192,12 +194,13 @@ def parallel_login_with_the_same_user_multiple_servers(self, server, timeout=300 * with valid username and invalid password """): for i in range(10): - tasks.append(pool.apply_async(login_with_valid_username_and_password, (users, i, 
50,))) - tasks.append(pool.apply_async(login_with_valid_username_and_invalid_password, (users, i, 50,))) - tasks.append(pool.apply_async(login_with_invalid_username_and_valid_password, (users, i, 50,))) + tasks.append(pool.submit(login_with_valid_username_and_password, (users, i, 50,))) + tasks.append(pool.submit(login_with_valid_username_and_invalid_password, (users, i, 50,))) + tasks.append(pool.submit(login_with_invalid_username_and_valid_password, (users, i, 50,))) finally: with Then("it should work"): - join(tasks, timeout) + for task in tasks: + task.result(timeout=timeout) @TestScenario @Tags("custom config") @@ -245,10 +248,11 @@ def parallel_login_with_multiple_servers(self, server, user_count=10, timeout=30 for i in range(10): for users in user_groups.values(): for check in checks: - tasks.append(pool.apply_async(check, (users, i, 50,))) + tasks.append(pool.submit(check, (users, i, 50,))) finally: with Then("it should work"): - join(tasks, timeout) + for task in tasks: + task.result(timeout=timeout) @TestScenario @Tags("custom config") @@ -299,10 +303,11 @@ def parallel_login_with_rbac_and_multiple_servers(self, server, user_count=10, t for i in range(10): for users in user_groups.values(): for check in checks: - tasks.append(pool.apply_async(check, (users, i, 50,))) + tasks.append(pool.submit(check, (users, i, 50,))) finally: with Then("it should work"): - join(tasks, timeout) + for task in tasks: + task.result(timeout=timeout) @TestScenario @Requirements( @@ -323,12 +328,13 @@ def parallel_login_with_rbac_users(self, server, user_count=10, timeout=300): try: with When("I login in parallel"): for i in range(10): - tasks.append(pool.apply_async(login_with_valid_username_and_password, (users, i, 50,))) - tasks.append(pool.apply_async(login_with_valid_username_and_invalid_password, (users, i, 50,))) - tasks.append(pool.apply_async(login_with_invalid_username_and_valid_password, (users, i, 50,))) + tasks.append(pool.submit(login_with_valid_username_and_password, (users, i, 50,))) + tasks.append(pool.submit(login_with_valid_username_and_invalid_password, (users, i, 50,))) + tasks.append(pool.submit(login_with_invalid_username_and_valid_password, (users, i, 50,))) finally: with Then("it should work"): - join(tasks, timeout) + for task in tasks: + task.result(timeout=timeout) @TestScenario @Requirements( diff --git a/tests/testflows/ldap/external_user_directory/tests/restart.py b/tests/testflows/ldap/external_user_directory/tests/restart.py index 23b0911bb29..cfcf63d932f 100644 --- a/tests/testflows/ldap/external_user_directory/tests/restart.py +++ b/tests/testflows/ldap/external_user_directory/tests/restart.py @@ -1,6 +1,6 @@ import random -from helpers.common import Pool, join +from helpers.common import Pool from testflows.core import * from testflows.asserts import error @@ -267,12 +267,13 @@ def parallel_login(self, server=None, user_count=10, timeout=300): with When("I restart the server during parallel login of users in each group"): for users in user_groups.values(): for check in checks: - tasks.append(pool.apply_async(check, (users, 0, 25, True))) + tasks.append(pool.submit(check, (users, 0, 25, True))) - tasks.append(pool.apply_async(restart)) + tasks.append(pool.submit(restart)) finally: with Then("logins during restart should work"): - join(tasks, timeout) + for task in tasks: + task.result(timeout=timeout) tasks = [] with Pool(4) as pool: @@ -280,10 +281,11 @@ def parallel_login(self, server=None, user_count=10, timeout=300): with When("I perform parallel login of users in 
each group after restart"): for users in user_groups.values(): for check in checks: - tasks.append(pool.apply_async(check, (users, 0, 10, False))) + tasks.append(pool.submit(check, (users, 0, 10, False))) finally: with Then("logins after restart should work"): - join(tasks, timeout) + for task in tasks: + task.result(timeout=timeout) @TestOutline(Feature) @Name("restart") diff --git a/tests/testflows/ldap/external_user_directory/tests/server_config.py b/tests/testflows/ldap/external_user_directory/tests/server_config.py index ac6cfa154a6..31e1c42da94 100644 --- a/tests/testflows/ldap/external_user_directory/tests/server_config.py +++ b/tests/testflows/ldap/external_user_directory/tests/server_config.py @@ -248,9 +248,7 @@ def invalid_verification_cooldown_value(self, invalid_value, timeout=300): verification cooldown parameter is invalid. """ - error_message = (" Access(user directories): Could not parse LDAP server" - " \\`openldap1\\`: Poco::Exception. Code: 1000, e.code() = 0," - f" e.displayText() = Syntax error: Not a valid unsigned integer{': ' + invalid_value if invalid_value else invalid_value}") + error_message = f" Syntax error: Not a valid unsigned integer{': ' + invalid_value if invalid_value else invalid_value}" with Given("LDAP server configuration that uses a negative integer for the verification_cooldown parameter"): servers = {"openldap1": {"host": "openldap1", "port": "389", "enable_tls": "no", diff --git a/tests/testflows/ldap/regression.py b/tests/testflows/ldap/regression.py index 1e6ddc47bd1..c35f34c971d 100755 --- a/tests/testflows/ldap/regression.py +++ b/tests/testflows/ldap/regression.py @@ -4,7 +4,7 @@ from testflows.core import * append_path(sys.path, "..") -from helpers.common import Pool, join, run_scenario +from helpers.common import Pool, join from helpers.argparser import argparser @TestModule @@ -13,22 +13,18 @@ from helpers.argparser import argparser def regression(self, local, clickhouse_binary_path, parallel=None, stress=None): """ClickHouse LDAP integration regression module. """ - top().terminating = False args = {"local": local, "clickhouse_binary_path": clickhouse_binary_path} if stress is not None: self.context.stress = stress - if parallel is not None: - self.context.parallel = parallel - tasks = [] with Pool(3) as pool: try: - run_scenario(pool, tasks, Feature(test=load("ldap.authentication.regression", "regression")), args) - run_scenario(pool, tasks, Feature(test=load("ldap.external_user_directory.regression", "regression")), args) - run_scenario(pool, tasks, Feature(test=load("ldap.role_mapping.regression", "regression")), args) + Feature(test=load("ldap.authentication.regression", "regression"), parallel=True, executor=pool)(**args) + Feature(test=load("ldap.external_user_directory.regression", "regression"), parallel=True, executor=pool)(**args) + Feature(test=load("ldap.role_mapping.regression", "regression"), parallel=True, executor=pool)(**args) finally: - join(tasks) + join() if main(): regression() diff --git a/tests/testflows/ldap/role_mapping/regression.py b/tests/testflows/ldap/role_mapping/regression.py index b84fe3e827a..a2c70d8bd41 100755 --- a/tests/testflows/ldap/role_mapping/regression.py +++ b/tests/testflows/ldap/role_mapping/regression.py @@ -30,7 +30,6 @@ xfails = { def regression(self, local, clickhouse_binary_path, stress=None, parallel=None): """ClickHouse LDAP role mapping regression module. 
""" - top().terminating = False nodes = { "clickhouse": ("clickhouse1", "clickhouse2", "clickhouse3"), } diff --git a/tests/testflows/ldap/role_mapping/tests/mapping.py b/tests/testflows/ldap/role_mapping/tests/mapping.py index ccdde9c06c8..4f018d05aff 100644 --- a/tests/testflows/ldap/role_mapping/tests/mapping.py +++ b/tests/testflows/ldap/role_mapping/tests/mapping.py @@ -2,7 +2,7 @@ from testflows.core import * from testflows.asserts import error -from helpers.common import Pool, join +from helpers.common import Pool from ldap.role_mapping.requirements import * from ldap.role_mapping.tests.common import * @@ -1053,12 +1053,13 @@ def group_removed_and_added_in_parallel(self, ldap_server, ldap_user, count=20, try: with When("user try to login while LDAP groups are added and removed in parallel"): for i in range(10): - tasks.append(pool.apply_async(login_with_valid_username_and_password, (users, i, 50,))) - tasks.append(pool.apply_async(remove_ldap_groups_in_parallel, (groups, i, 10,))) - tasks.append(pool.apply_async(add_ldap_groups_in_parallel,(ldap_user, role_names, i, 10,))) + tasks.append(pool.submit(login_with_valid_username_and_password, (users, i, 50,))) + tasks.append(pool.submit(remove_ldap_groups_in_parallel, (groups, i, 10,))) + tasks.append(pool.submit(add_ldap_groups_in_parallel,(ldap_user, role_names, i, 10,))) finally: with Finally("it should work", flags=TE): - join(tasks, timeout) + for task in tasks: + task.result(timeout=timeout) finally: with Finally("I clean up all LDAP groups"): for group in groups: @@ -1105,12 +1106,13 @@ def user_removed_and_added_in_ldap_groups_in_parallel(self, ldap_server, ldap_us try: with When("user try to login while user is added and removed from LDAP groups in parallel"): for i in range(10): - tasks.append(pool.apply_async(login_with_valid_username_and_password, (users, i, 50,))) - tasks.append(pool.apply_async(remove_user_from_ldap_groups_in_parallel, (ldap_user, groups, i, 1,))) - tasks.append(pool.apply_async(add_user_to_ldap_groups_in_parallel, (ldap_user, groups, i, 1,))) + tasks.append(pool.submit(login_with_valid_username_and_password, (users, i, 50,))) + tasks.append(pool.submit(remove_user_from_ldap_groups_in_parallel, (ldap_user, groups, i, 1,))) + tasks.append(pool.submit(add_user_to_ldap_groups_in_parallel, (ldap_user, groups, i, 1,))) finally: with Finally("it should work", flags=TE): - join(tasks, timeout) + for task in tasks: + task.result(timeout=timeout) @TestScenario @Requirements( @@ -1154,12 +1156,13 @@ def roles_removed_and_added_in_parallel(self, ldap_server, ldap_user, count=20, try: with When("user try to login while mapped roles are added and removed in parallel"): for i in range(10): - tasks.append(pool.apply_async(login_with_valid_username_and_password, (users, i, 50,))) - tasks.append(pool.apply_async(remove_roles_in_parallel, (role_names, i, 10,))) - tasks.append(pool.apply_async(add_roles_in_parallel, (role_names, i, 10,))) + tasks.append(pool.submit(login_with_valid_username_and_password, (users, i, 50,))) + tasks.append(pool.submit(remove_roles_in_parallel, (role_names, i, 10,))) + tasks.append(pool.submit(add_roles_in_parallel, (role_names, i, 10,))) finally: with Finally("it should work", flags=TE): - join(tasks, timeout) + for task in tasks: + task.result(timeout=timeout) with And("I clean up all the roles"): for role_name in role_names: @@ -1213,12 +1216,13 @@ def parallel_login(self, ldap_server, ldap_user, user_count=10, timeout=200, rol * with valid username and invalid password """): for i in 
range(10): - tasks.append(pool.apply_async(login_with_valid_username_and_password, (users, i, 50,))) - tasks.append(pool.apply_async(login_with_valid_username_and_invalid_password, (users, i, 50,))) - tasks.append(pool.apply_async(login_with_invalid_username_and_valid_password, (users, i, 50,))) + tasks.append(pool.submit(login_with_valid_username_and_password, (users, i, 50,))) + tasks.append(pool.submit(login_with_valid_username_and_invalid_password, (users, i, 50,))) + tasks.append(pool.submit(login_with_invalid_username_and_valid_password, (users, i, 50,))) finally: with Then("it should work"): - join(tasks, timeout) + for task in tasks: + task.result(timeout=timeout) @TestScenario @Requirements( @@ -1395,10 +1399,11 @@ def parallel_login_with_multiple_servers(self, ldap_server, ldap_user, user_coun for i in range(10): for users in user_groups.values(): for check in checks: - tasks.append(pool.apply_async(check, (users, i, 50,))) + tasks.append(pool.submit(check, (users, i, 50,))) finally: with Then("it should work"): - join(tasks, timeout) + for task in tasks: + task.result(timeout=timeout) @TestFeature @Name("mapping") diff --git a/tests/testflows/rbac/helper/common.py b/tests/testflows/rbac/helper/common.py index e47b2dfd4bf..b1d4da536dd 100755 --- a/tests/testflows/rbac/helper/common.py +++ b/tests/testflows/rbac/helper/common.py @@ -8,7 +8,7 @@ from testflows.core.name import basename, parentname from testflows._core.testtype import TestSubType from testflows.core import * -from helpers.common import Pool, join, run_scenario, instrument_clickhouse_server_log +from helpers.common import instrument_clickhouse_server_log from rbac.helper.tables import table_types def permutations(table_count=1): diff --git a/tests/testflows/rbac/regression.py b/tests/testflows/rbac/regression.py index 590f384288b..4c133bd232e 100755 --- a/tests/testflows/rbac/regression.py +++ b/tests/testflows/rbac/regression.py @@ -27,7 +27,6 @@ issue_17653 = "https://github.com/ClickHouse/ClickHouse/issues/17653" issue_17655 = "https://github.com/ClickHouse/ClickHouse/issues/17655" issue_17766 = "https://github.com/ClickHouse/ClickHouse/issues/17766" issue_18110 = "https://github.com/ClickHouse/ClickHouse/issues/18110" -issue_18206 = "https://github.com/ClickHouse/ClickHouse/issues/18206" issue_21083 = "https://github.com/ClickHouse/ClickHouse/issues/21083" issue_21084 = "https://github.com/ClickHouse/ClickHouse/issues/21084" issue_25413 = "https://github.com/ClickHouse/ClickHouse/issues/25413" @@ -122,20 +121,6 @@ xfails = { [(Fail, issue_17655)], "privileges/public tables/sensitive tables": [(Fail, issue_18110)], - "privileges/system merges/:/:/:/:/SYSTEM:": - [(Fail, issue_18206)], - "privileges/system ttl merges/:/:/:/:/SYSTEM:": - [(Fail, issue_18206)], - "privileges/system moves/:/:/:/:/SYSTEM:": - [(Fail, issue_18206)], - "privileges/system sends/:/:/:/:/SYSTEM:": - [(Fail, issue_18206)], - "privileges/system fetches/:/:/:/:/SYSTEM:": - [(Fail, issue_18206)], - "privileges/system restart replica/:/:/:/:/SYSTEM:": - [(Fail, issue_18206)], - "privileges/system replication queues/:/:/:/:/SYSTEM:": - [(Fail, issue_18206)], "privileges/: row policy/nested live:": [(Fail, issue_21083)], "privileges/: row policy/nested mat:": @@ -175,10 +160,9 @@ xflags = { @Specifications( SRS_006_ClickHouse_Role_Based_Access_Control ) -def regression(self, local, clickhouse_binary_path, stress=None, parallel=None): +def regression(self, local, clickhouse_binary_path, stress=None): """RBAC regression. 
""" - top().terminating = False nodes = { "clickhouse": ("clickhouse1", "clickhouse2", "clickhouse3") @@ -186,8 +170,6 @@ def regression(self, local, clickhouse_binary_path, stress=None, parallel=None): if stress is not None: self.context.stress = stress - if parallel is not None: - self.context.parallel = parallel with Cluster(local, clickhouse_binary_path, nodes=nodes, docker_compose_project_dir=os.path.join(current_dir(), "rbac_env")) as cluster: diff --git a/tests/testflows/rbac/tests/privileges/alter/alter_column.py b/tests/testflows/rbac/tests/privileges/alter/alter_column.py index 4e8bfb0b53d..2be20d4e667 100755 --- a/tests/testflows/rbac/tests/privileges/alter/alter_column.py +++ b/tests/testflows/rbac/tests/privileges/alter/alter_column.py @@ -699,14 +699,13 @@ def user_with_privileges_on_cluster(self, permutation, table_type, node=None): @TestSuite def scenario_parallelization(self, table_type, permutation): + args = {"table_type": table_type, "permutation": permutation} with Pool(7) as pool: - tasks = [] try: for scenario in loads(current_module(), Scenario): - run_scenario(pool, tasks, Scenario(test=scenario, setup=instrument_clickhouse_server_log), - {"table_type": table_type, "permutation": permutation}) + Scenario(test=scenario, setup=instrument_clickhouse_server_log, parallel=True, executor=pool)(**args) finally: - join(tasks) + join() @TestFeature @Requirements( @@ -719,13 +718,11 @@ def scenario_parallelization(self, table_type, permutation): (key,) for key in table_types.keys() ]) @Name("alter column") -def feature(self, node="clickhouse1", stress=None, parallel=None): +def feature(self, stress=None, node="clickhouse1"): """Runs test suites above which check correctness over scenarios and permutations. """ self.context.node = self.context.cluster.node(node) - if parallel is not None: - self.context.parallel = parallel if stress is not None: self.context.stress = stress @@ -737,12 +734,10 @@ def feature(self, node="clickhouse1", stress=None, parallel=None): with Example(str(example)): with Pool(10) as pool: - tasks = [] try: for permutation in permutations(table_type): privileges = alter_column_privileges(permutation) - - run_scenario(pool, tasks, Suite(test=scenario_parallelization, name=privileges), - {"table_type": table_type, "permutation": permutation}) + args = {"table_type": table_type, "permutation": permutation} + Suite(test=scenario_parallelization, name=privileges, parallel=True, executor=pool)(**args) finally: - join(tasks) + join() diff --git a/tests/testflows/rbac/tests/privileges/alter/alter_constraint.py b/tests/testflows/rbac/tests/privileges/alter/alter_constraint.py index 697cefdf056..c24109c8052 100755 --- a/tests/testflows/rbac/tests/privileges/alter/alter_constraint.py +++ b/tests/testflows/rbac/tests/privileges/alter/alter_constraint.py @@ -1,7 +1,6 @@ import json from testflows.core import * -from testflows.core import threading from testflows.asserts import error from rbac.requirements import * @@ -282,11 +281,9 @@ def user_with_privileges_on_cluster(self, table_type, node=None): (key,) for key in table_types.keys() ]) @Name("alter constraint") -def feature(self, node="clickhouse1", parallel=None, stress=None): +def feature(self, stress=None, node="clickhouse1"): self.context.node = self.context.cluster.node(node) - if parallel is not None: - self.context.parallel = parallel if stress is not None: self.context.stress = stress @@ -296,11 +293,12 @@ def feature(self, node="clickhouse1", parallel=None, stress=None): if table_type != "MergeTree" and 
not self.context.stress: continue + args = {"table_type" : table_type} + with Example(str(example)): with Pool(5) as pool: - tasks = [] try: for scenario in loads(current_module(), Scenario): - run_scenario(pool, tasks, Scenario(test=scenario, setup=instrument_clickhouse_server_log), {"table_type" : table_type}) + Scenario(test=scenario, setup=instrument_clickhouse_server_log, parallel=True, executor=pool)(**args) finally: - join(tasks) + join() diff --git a/tests/testflows/rbac/tests/privileges/alter/alter_index.py b/tests/testflows/rbac/tests/privileges/alter/alter_index.py index 78f7134a8b7..9bb1d72a004 100755 --- a/tests/testflows/rbac/tests/privileges/alter/alter_index.py +++ b/tests/testflows/rbac/tests/privileges/alter/alter_index.py @@ -446,7 +446,7 @@ def user_with_privileges_on_cluster(self, table_type, node=None): (key,) for key in table_types.keys() ]) @Name("alter index") -def feature(self, node="clickhouse1", stress=None, parallel=None): +def feature(self, stress=None, parallel=None, node="clickhouse1"): self.context.node = self.context.cluster.node(node) if parallel is not None: @@ -460,11 +460,12 @@ def feature(self, node="clickhouse1", stress=None, parallel=None): if table_type != "MergeTree" and not self.context.stress: continue + args = {"table_type" : table_type} + with Example(str(example)): with Pool(5) as pool: - tasks = [] try: for scenario in loads(current_module(), Scenario): - run_scenario(pool, tasks, Scenario(test=scenario, setup=instrument_clickhouse_server_log), {"table_type" : table_type}) + Scenario(test=scenario, setup=instrument_clickhouse_server_log, parallel=True, executor=pool)(**args) finally: - join(tasks) + join() diff --git a/tests/testflows/rbac/tests/privileges/alter/alter_settings.py b/tests/testflows/rbac/tests/privileges/alter/alter_settings.py index d5a8b73534e..a1a2b824a11 100755 --- a/tests/testflows/rbac/tests/privileges/alter/alter_settings.py +++ b/tests/testflows/rbac/tests/privileges/alter/alter_settings.py @@ -171,13 +171,14 @@ def user_with_privileges_on_cluster(self, privilege, table_type, node=None): def scenario_parallelization(self, table_type, privilege): """Runs all scenarios in parallel for a given privilege. 
""" + args = {"table_type": table_type, "privilege": privilege} + with Pool(4) as pool: - tasks = [] try: for scenario in loads(current_module(), Scenario): - run_scenario(pool, tasks, Scenario(test=scenario), {"table_type": table_type, "privilege": privilege}) + Scenario(test=scenario, parallel=True, executor=pool)(**args) finally: - join(tasks) + join() @TestFeature @Requirements( @@ -190,13 +191,11 @@ def scenario_parallelization(self, table_type, privilege): (key,) for key in table_types.keys() ]) @Name("alter settings") -def feature(self, node="clickhouse1", stress=None, parallel=None): +def feature(self, stress=None, node="clickhouse1"): """Runs test suites above which check correctness over scenarios and permutations """ self.context.node = self.context.cluster.node(node) - if parallel is not None: - self.context.parallel = parallel if stress is not None: self.context.stress = stress @@ -208,11 +207,9 @@ def feature(self, node="clickhouse1", stress=None, parallel=None): with Example(str(example)): with Pool(4) as pool: - tasks = [] try: for alias in aliases: - run_scenario(pool, tasks, Suite(test=scenario_parallelization, name=alias, - setup=instrument_clickhouse_server_log), - {"table_type": table_type, "privilege": alias}) + args = {"table_type": table_type, "privilege": alias} + Suite(test=scenario_parallelization, name=alias, setup=instrument_clickhouse_server_log, parallel=True, executor=pool)(**args) finally: - join(tasks) + join() diff --git a/tests/testflows/rbac/tests/privileges/alter/alter_ttl.py b/tests/testflows/rbac/tests/privileges/alter/alter_ttl.py index d6dbf13c586..419cf880f30 100755 --- a/tests/testflows/rbac/tests/privileges/alter/alter_ttl.py +++ b/tests/testflows/rbac/tests/privileges/alter/alter_ttl.py @@ -1,7 +1,6 @@ import json from testflows.core import * -from testflows.core import threading from testflows.asserts import error from rbac.requirements import * @@ -258,11 +257,9 @@ def user_with_privileges_on_cluster(self, table_type, node=None): (key,) for key in table_types.keys() ]) @Name("alter ttl") -def feature(self, node="clickhouse1", stress=None, parallel=None): +def feature(self, stress=None, node="clickhouse1"): self.context.node = self.context.cluster.node(node) - if parallel is not None: - self.context.parallel = parallel if stress is not None: self.context.stress = stress @@ -272,11 +269,12 @@ def feature(self, node="clickhouse1", stress=None, parallel=None): if table_type != "MergeTree" and not self.context.stress: continue + args = {"table_type" : table_type} + with Example(str(example)): with Pool(5) as pool: - tasks = [] try: for scenario in loads(current_module(), Scenario): - run_scenario(pool, tasks, Scenario(test=scenario, setup=instrument_clickhouse_server_log), {"table_type" : table_type}) + Scenario(test=scenario, setup=instrument_clickhouse_server_log, parallel=True, executor=pool)(**args) finally: - join(tasks) + join() diff --git a/tests/testflows/rbac/tests/privileges/create/create_table.py b/tests/testflows/rbac/tests/privileges/create/create_table.py index f0978a7960f..8f0a9f43771 100644 --- a/tests/testflows/rbac/tests/privileges/create/create_table.py +++ b/tests/testflows/rbac/tests/privileges/create/create_table.py @@ -829,20 +829,17 @@ def create_as_merge(self, node=None): RQ_SRS_006_RBAC_Privileges_CreateTable("1.0"), ) @Name("create table") -def feature(self, stress=None, parallel=None, node="clickhouse1"): +def feature(self, stress=None, node="clickhouse1"): """Check the RBAC functionality of CREATE TABLE. 
""" self.context.node = self.context.cluster.node(node) if stress is not None: self.context.stress = stress - if parallel is not None: - self.context.stress = parallel - tasks = [] with Pool(10) as pool: try: for scenario in loads(current_module(), Scenario): - run_scenario(pool, tasks, scenario) + Scenario(run=scenario, parallel=True, executor=pool) finally: - join(tasks) + join() diff --git a/tests/testflows/rbac/tests/privileges/dictGet.py b/tests/testflows/rbac/tests/privileges/dictGet.py index 269998703be..4bee598bb9b 100644 --- a/tests/testflows/rbac/tests/privileges/dictGet.py +++ b/tests/testflows/rbac/tests/privileges/dictGet.py @@ -21,12 +21,12 @@ def dict_setup(node, table_name, dict_name, type="UInt64"): yield finally: - with Finally("I drop the table", flags=TE): - node.query(f"DROP TABLE IF EXISTS {table_name}") - - with And("I drop the dictionary", flags=TE): + with Finally("I drop the dictionary", flags=TE): node.query(f"DROP DICTIONARY IF EXISTS {dict_name}") + with And("I drop the table", flags=TE): + node.query(f"DROP TABLE IF EXISTS {table_name}") + @TestSuite def dictGet_granted_directly(self, node=None): """Run dictGet checks with privileges granted directly. @@ -651,37 +651,34 @@ def dictGetType_check(self, privilege, on, grant_target_name, user_name, type, n RQ_SRS_006_RBAC_Privileges_None("1.0") ) @Name("dictGet") -def feature(self, node="clickhouse1", stress=None, parallel=None): +def feature(self, stress=None, node="clickhouse1"): """Check the RBAC functionality of dictGet. """ self.context.node = self.context.cluster.node(node) - if parallel is not None: - self.context.parallel = parallel if stress is not None: self.context.stress = stress with Pool(20) as pool: - tasks = [] try: - - run_scenario(pool, tasks, Suite(test=dictGet_granted_directly, setup=instrument_clickhouse_server_log)) - run_scenario(pool, tasks, Suite(test=dictGet_granted_via_role, setup=instrument_clickhouse_server_log)) - run_scenario(pool, tasks, Suite(test=dictGetOrDefault_granted_directly, setup=instrument_clickhouse_server_log)) - run_scenario(pool, tasks, Suite(test=dictGetOrDefault_granted_via_role, setup=instrument_clickhouse_server_log)) - run_scenario(pool, tasks, Suite(test=dictHas_granted_directly, setup=instrument_clickhouse_server_log)) - run_scenario(pool, tasks, Suite(test=dictHas_granted_via_role, setup=instrument_clickhouse_server_log)) - run_scenario(pool, tasks, Suite(test=dictGetHierarchy_granted_directly, setup=instrument_clickhouse_server_log)) - run_scenario(pool, tasks, Suite(test=dictGetHierarchy_granted_via_role, setup=instrument_clickhouse_server_log)) - run_scenario(pool, tasks, Suite(test=dictIsIn_granted_directly, setup=instrument_clickhouse_server_log)) - run_scenario(pool, tasks, Suite(test=dictIsIn_granted_via_role, setup=instrument_clickhouse_server_log)) + Suite(run=dictGet_granted_directly, setup=instrument_clickhouse_server_log, parallel=True, executor=pool) + Suite(run=dictGet_granted_via_role, setup=instrument_clickhouse_server_log, parallel=True, executor=pool) + Suite(run=dictGetOrDefault_granted_directly, setup=instrument_clickhouse_server_log, parallel=True, executor=pool) + Suite(run=dictGetOrDefault_granted_via_role, setup=instrument_clickhouse_server_log, parallel=True, executor=pool) + Suite(run=dictHas_granted_directly, setup=instrument_clickhouse_server_log, parallel=True, executor=pool) + Suite(run=dictHas_granted_via_role, setup=instrument_clickhouse_server_log, parallel=True, executor=pool) + Suite(run=dictGetHierarchy_granted_directly, 
setup=instrument_clickhouse_server_log, parallel=True, executor=pool) + Suite(run=dictGetHierarchy_granted_via_role, setup=instrument_clickhouse_server_log, parallel=True, executor=pool) + Suite(run=dictIsIn_granted_directly, setup=instrument_clickhouse_server_log, parallel=True, executor=pool) + Suite(run=dictIsIn_granted_via_role, setup=instrument_clickhouse_server_log, parallel=True, executor=pool) for example in dictGetType_granted_directly.examples: type, = example + args = {"type" : type} with Example(example): - run_scenario(pool, tasks, Suite(test=dictGetType_granted_directly, setup=instrument_clickhouse_server_log),{"type" : type}) - run_scenario(pool, tasks, Suite(test=dictGetType_granted_via_role, setup=instrument_clickhouse_server_log),{"type" : type}) + Suite(test=dictGetType_granted_directly, setup=instrument_clickhouse_server_log, parallel=True, executor=pool)(**args) + Suite(test=dictGetType_granted_via_role, setup=instrument_clickhouse_server_log, parallel=True, executor=pool)(**args) finally: - join(tasks) + join() diff --git a/tests/testflows/rbac/tests/privileges/distributed_table.py b/tests/testflows/rbac/tests/privileges/distributed_table.py index eab3b6d34d6..c99e6363b4d 100755 --- a/tests/testflows/rbac/tests/privileges/distributed_table.py +++ b/tests/testflows/rbac/tests/privileges/distributed_table.py @@ -1319,13 +1319,12 @@ def cluster_tests(self, cluster, node=None): """ self.context.cluster_name = cluster - tasks = [] with Pool(3) as pool: try: for suite in loads(current_module(), Suite): - run_scenario(pool, tasks, Suite(test=suite)) + Suite(test=suite, parallel=True, executor=pool) finally: - join(tasks) + join() @TestFeature @Requirements( @@ -1340,13 +1339,11 @@ def feature(self, node="clickhouse1"): self.context.node2 = self.context.cluster.node("clickhouse2") self.context.node3 = self.context.cluster.node("clickhouse3") - tasks = [] with Pool(3) as pool: try: - run_scenario(pool, tasks, Feature(test=cluster_tests)) - run_scenario(pool, tasks, Scenario(test=local_user)) - run_scenario(pool, tasks, Scenario(test=multiple_node_user)) + Feature(run=cluster_tests, parallel=True, executor=pool) + Scenario(run=local_user, parallel=True, executor=pool) + Scenario(run=multiple_node_user, parallel=True, executor=pool) finally: - join(tasks) - + join() diff --git a/tests/testflows/rbac/tests/privileges/feature.py b/tests/testflows/rbac/tests/privileges/feature.py index 555860d8ed1..e68d71675ab 100755 --- a/tests/testflows/rbac/tests/privileges/feature.py +++ b/tests/testflows/rbac/tests/privileges/feature.py @@ -5,94 +5,94 @@ from rbac.helper.common import * @TestFeature @Name("privileges") def feature(self): - - tasks = [] + """Check RBAC privileges. 
+ """ with Pool(10) as pool: try: - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.insert", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.select", "feature"), ), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.public_tables", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.distributed_table", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.grant_option", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.truncate", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.optimize", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.kill_query", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.kill_mutation", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.role_admin", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.dictGet", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.introspection", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.sources", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.admin_option", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.all_role", "feature")), {}) + Feature(run=load("rbac.tests.privileges.insert", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.select", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.public_tables", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.distributed_table", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.grant_option", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.truncate", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.optimize", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.kill_query", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.kill_mutation", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.role_admin", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.dictGet", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.introspection", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.sources", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.admin_option", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.all_role", "feature"), parallel=True, executor=pool) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.show.show_tables", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.show.show_dictionaries", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.show.show_databases", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.show.show_columns", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.show.show_users", "feature")), {}) - run_scenario(pool, tasks, 
Feature(test=load("rbac.tests.privileges.show.show_roles", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.show.show_quotas", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.show.show_settings_profiles", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.show.show_row_policies", "feature")), {}) + Feature(run=load("rbac.tests.privileges.show.show_tables", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.show.show_dictionaries", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.show.show_databases", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.show.show_columns", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.show.show_users", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.show.show_roles", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.show.show_quotas", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.show.show_settings_profiles", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.show.show_row_policies", "feature"), parallel=True, executor=pool) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.alter.alter_column", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.alter.alter_index", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.alter.alter_constraint", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.alter.alter_ttl", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.alter.alter_settings", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.alter.alter_update", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.alter.alter_delete", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.alter.alter_freeze", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.alter.alter_fetch", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.alter.alter_move", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.alter.alter_user", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.alter.alter_role", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.alter.alter_row_policy", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.alter.alter_quota", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.alter.alter_settings_profile", "feature")), {}) + Feature(run=load("rbac.tests.privileges.alter.alter_column", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.alter.alter_index", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.alter.alter_constraint", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.alter.alter_ttl", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.alter.alter_settings", "feature"), parallel=True, executor=pool) + 
Feature(run=load("rbac.tests.privileges.alter.alter_update", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.alter.alter_delete", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.alter.alter_freeze", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.alter.alter_fetch", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.alter.alter_move", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.alter.alter_user", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.alter.alter_role", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.alter.alter_row_policy", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.alter.alter_quota", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.alter.alter_settings_profile", "feature"), parallel=True, executor=pool) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.create.create_database", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.create.create_dictionary", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.create.create_temp_table", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.create.create_table", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.create.create_user", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.create.create_role", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.create.create_row_policy", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.create.create_quota", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.create.create_settings_profile", "feature")), {}) + Feature(run=load("rbac.tests.privileges.create.create_database", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.create.create_dictionary", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.create.create_temp_table", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.create.create_table", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.create.create_user", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.create.create_role", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.create.create_row_policy", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.create.create_quota", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.create.create_settings_profile", "feature"), parallel=True, executor=pool) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.attach.attach_database", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.attach.attach_dictionary", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.attach.attach_temp_table", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.attach.attach_table", "feature")), {}) + Feature(run=load("rbac.tests.privileges.attach.attach_database", 
"feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.attach.attach_dictionary", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.attach.attach_temp_table", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.attach.attach_table", "feature"), parallel=True, executor=pool) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.drop.drop_database", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.drop.drop_dictionary", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.drop.drop_table", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.drop.drop_user", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.drop.drop_role", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.drop.drop_row_policy", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.drop.drop_quota", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.drop.drop_settings_profile", "feature")), {}) + Feature(run=load("rbac.tests.privileges.drop.drop_database", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.drop.drop_dictionary", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.drop.drop_table", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.drop.drop_user", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.drop.drop_role", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.drop.drop_row_policy", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.drop.drop_quota", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.drop.drop_settings_profile", "feature"), parallel=True, executor=pool) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.detach.detach_database", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.detach.detach_dictionary", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.detach.detach_table", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.detach.detach_view", "feature")), {}) + Feature(run=load("rbac.tests.privileges.detach.detach_database", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.detach.detach_dictionary", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.detach.detach_table", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.detach.detach_view", "feature"), parallel=True, executor=pool) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.system.drop_cache", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.system.reload", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.system.flush", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.system.merges", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.system.moves", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.system.replication_queues", "feature")), 
{}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.system.ttl_merges", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.system.restart_replica", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.system.sends", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.system.sync_replica", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.privileges.system.fetches", "feature")), {}) + Feature(run=load("rbac.tests.privileges.system.drop_cache", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.system.reload", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.system.flush", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.system.merges", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.system.moves", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.system.replication_queues", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.system.ttl_merges", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.system.restart_replica", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.system.sends", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.system.sync_replica", "feature"), parallel=True, executor=pool) + Feature(run=load("rbac.tests.privileges.system.fetches", "feature"), parallel=True, executor=pool) finally: - join(tasks) + join() Feature(test=load("rbac.tests.privileges.system.shutdown", "feature")) diff --git a/tests/testflows/rbac/tests/privileges/grant_option.py b/tests/testflows/rbac/tests/privileges/grant_option.py index 22b0118b9a4..ea5ff0ba66a 100644 --- a/tests/testflows/rbac/tests/privileges/grant_option.py +++ b/tests/testflows/rbac/tests/privileges/grant_option.py @@ -114,21 +114,19 @@ def grant_option_check(grant_option_target, grant_target, user_name, table_type, ("SELECT",), ]) @Name("grant option") -def feature(self, node="clickhouse1", stress=None, parallel=None): +def feature(self, stress=None, node="clickhouse1"): """Check the RBAC functionality of privileges with GRANT OPTION. 
""" self.context.node = self.context.cluster.node(node) - if parallel is not None: - self.context.parallel = parallel if stress is not None: self.context.stress = stress with Pool(12) as pool: - tasks = [] try: for example in self.examples: privilege, = example - run_scenario(pool, tasks, Suite(test=grant_option, name=privilege, setup=instrument_clickhouse_server_log), {"table_type": "MergeTree", "privilege": privilege}) + args = {"table_type": "MergeTree", "privilege": privilege} + Suite(test=grant_option, name=privilege, setup=instrument_clickhouse_server_log, parallel=True, executor=pool)(**args) finally: - join(tasks) + join() diff --git a/tests/testflows/rbac/tests/privileges/insert.py b/tests/testflows/rbac/tests/privileges/insert.py index 08c696ffc78..650e65b2fb0 100755 --- a/tests/testflows/rbac/tests/privileges/insert.py +++ b/tests/testflows/rbac/tests/privileges/insert.py @@ -485,9 +485,11 @@ def role_with_privilege_on_cluster(self, table_type, node=None): (key,) for key in table_types.keys() ]) @Name("insert") -def feature(self, table_type, parallel=None, stress=None, node="clickhouse1"): +def feature(self, table_type, stress=None, node="clickhouse1"): """Check the RBAC functionality of INSERT. """ + args = {"table_type" : table_type} + self.context.node = self.context.cluster.node(node) self.context.node1 = self.context.cluster.node("clickhouse1") @@ -496,13 +498,10 @@ def feature(self, table_type, parallel=None, stress=None, node="clickhouse1"): if stress is not None: self.context.stress = stress - if parallel is not None: - self.context.stress = parallel - tasks = [] with Pool(10) as pool: try: for scenario in loads(current_module(), Scenario): - run_scenario(pool, tasks, Scenario(test=scenario, setup=instrument_clickhouse_server_log), {"table_type" : table_type}) + Scenario(test=scenario, setup=instrument_clickhouse_server_log, parallel=True, executor=pool)(**args) finally: - join(tasks) + join() diff --git a/tests/testflows/rbac/tests/privileges/select.py b/tests/testflows/rbac/tests/privileges/select.py index b84865998bc..b1a95b4be0b 100755 --- a/tests/testflows/rbac/tests/privileges/select.py +++ b/tests/testflows/rbac/tests/privileges/select.py @@ -408,20 +408,19 @@ def user_with_privilege_on_cluster(self, table_type, node=None): (key,) for key in table_types.keys() ]) @Name("select") -def feature(self, table_type, parallel=None, stress=None, node="clickhouse1"): +def feature(self, table_type, stress=None, node="clickhouse1"): """Check the RBAC functionality of SELECT. 
""" self.context.node = self.context.cluster.node(node) if stress is not None: self.context.stress = stress - if parallel is not None: - self.context.stress = parallel - tasks = [] + args = {"table_type" : table_type} + with Pool(10) as pool: try: for scenario in loads(current_module(), Scenario): - run_scenario(pool, tasks, Scenario(test=scenario, setup=instrument_clickhouse_server_log), {"table_type" : table_type}) + Scenario(test=scenario, setup=instrument_clickhouse_server_log, parallel=True, executor=pool)(**args) finally: - join(tasks) + join() diff --git a/tests/testflows/rbac/tests/privileges/sources.py b/tests/testflows/rbac/tests/privileges/sources.py index 19d32cf500a..e473c623955 100644 --- a/tests/testflows/rbac/tests/privileges/sources.py +++ b/tests/testflows/rbac/tests/privileges/sources.py @@ -73,7 +73,7 @@ def file(self, privilege, grant_target_name, user_name, node=None): node.query(f"GRANT USAGE ON *.* TO {grant_target_name}") with Then("I check the user can't use the File source"): - node.query(f"CREATE TABLE {table_name} (x String) ENGINE=File()", settings=[("user",user_name)], + node.query(f"CREATE TABLE {table_name} (x String) ENGINE=File('')", settings=[("user",user_name)], exitcode=exitcode, message=message) with Scenario("File source with privilege"): @@ -82,8 +82,8 @@ def file(self, privilege, grant_target_name, user_name, node=None): node.query(f"GRANT {privilege} ON *.* TO {grant_target_name}") with Then("I check the user can use the File source"): - node.query(f"CREATE TABLE {table_name} (x String) ENGINE=File()", settings = [("user", f"{user_name}")], - exitcode=42, message='Exception: Storage') + node.query(f"CREATE TABLE {table_name} (x String) ENGINE=File('')", settings = [("user", f"{user_name}")], + exitcode=0, message=None) with Scenario("File source with revoked privilege"): @@ -94,7 +94,7 @@ def file(self, privilege, grant_target_name, user_name, node=None): node.query(f"REVOKE {privilege} ON *.* FROM {grant_target_name}") with Then("I check the user cannot use the File source"): - node.query(f"CREATE TABLE {table_name} (x String) ENGINE=File()", settings=[("user",user_name)], + node.query(f"CREATE TABLE {table_name} (x String) ENGINE=File('')", settings=[("user",user_name)], exitcode=exitcode, message=message) @TestSuite @@ -152,9 +152,9 @@ def url(self, privilege, grant_target_name, user_name, node=None): if node is None: node = self.context.node + table_name = f'table_{getuid()}' + with Scenario("URL source without privilege"): - table_name = f'table_{getuid()}' - with Given("The user has table privilege"): node.query(f"GRANT CREATE TABLE ON {table_name} TO {grant_target_name}") @@ -165,20 +165,18 @@ def url(self, privilege, grant_target_name, user_name, node=None): node.query(f"GRANT USAGE ON *.* TO {grant_target_name}") with Then("I check the user can't use the URL source"): - node.query(f"CREATE TABLE {table_name} (x String) ENGINE=URL()", settings=[("user",user_name)], + node.query(f"CREATE TABLE {table_name} (x String) ENGINE=URL('127.0.0.1')", settings=[("user",user_name)], exitcode=exitcode, message=message) with Scenario("URL source with privilege"): - with When(f"I grant {privilege}"): node.query(f"GRANT {privilege} ON *.* TO {grant_target_name}") with Then("I check the user can use the URL source"): - node.query(f"CREATE TABLE {table_name} (x String) ENGINE=URL()", settings = [("user", f"{user_name}")], + node.query(f"CREATE TABLE {table_name} (x String) ENGINE=URL('127.0.0.1')", settings = [("user", f"{user_name}")], exitcode=42, 
message='Exception: Storage') with Scenario("URL source with revoked privilege"): - with When(f"I grant {privilege}"): node.query(f"GRANT {privilege} ON *.* TO {grant_target_name}") @@ -186,7 +184,7 @@ def url(self, privilege, grant_target_name, user_name, node=None): node.query(f"REVOKE {privilege} ON *.* FROM {grant_target_name}") with Then("I check the user cannot use the URL source"): - node.query(f"CREATE TABLE {table_name} (x String) ENGINE=URL()", settings=[("user",user_name)], + node.query(f"CREATE TABLE {table_name} (x String) ENGINE=URL('127.0.0.1')", settings=[("user",user_name)], exitcode=exitcode, message=message) @TestSuite @@ -257,7 +255,7 @@ def remote(self, privilege, grant_target_name, user_name, node=None): node.query(f"GRANT USAGE ON *.* TO {grant_target_name}") with Then("I check the user can't use the Remote source"): - node.query(f"CREATE TABLE {table_name} (x String) ENGINE = Distributed()", settings=[("user",user_name)], + node.query(f"CREATE TABLE {table_name} (x String) ENGINE = Distributed('127.0.0.1')", settings=[("user",user_name)], exitcode=exitcode, message=message) with Scenario("Remote source with privilege"): @@ -266,7 +264,7 @@ def remote(self, privilege, grant_target_name, user_name, node=None): node.query(f"GRANT {privilege} ON *.* TO {grant_target_name}") with Then("I check the user can use the Remote source"): - node.query(f"CREATE TABLE {table_name} (x String) ENGINE = Distributed()", settings = [("user", f"{user_name}")], + node.query(f"CREATE TABLE {table_name} (x String) ENGINE = Distributed('127.0.0.1')", settings = [("user", f"{user_name}")], exitcode=42, message='Exception: Storage') with Scenario("Remote source with revoked privilege"): @@ -278,7 +276,7 @@ def remote(self, privilege, grant_target_name, user_name, node=None): node.query(f"REVOKE {privilege} ON *.* FROM {grant_target_name}") with Then("I check the user cannot use the Remote source"): - node.query(f"CREATE TABLE {table_name} (x String) ENGINE = Distributed()", settings=[("user",user_name)], + node.query(f"CREATE TABLE {table_name} (x String) ENGINE = Distributed('127.0.0.1')", settings=[("user",user_name)], exitcode=exitcode, message=message) @TestSuite @@ -349,7 +347,7 @@ def MySQL(self, privilege, grant_target_name, user_name, node=None): node.query(f"GRANT USAGE ON *.* TO {grant_target_name}") with Then("I check the user can't use the MySQL source"): - node.query(f"CREATE TABLE {table_name} (x String) ENGINE=MySQL()", settings=[("user",user_name)], + node.query(f"CREATE TABLE {table_name} (x String) ENGINE=MySQL('127.0.0.1')", settings=[("user",user_name)], exitcode=exitcode, message=message) with Scenario("MySQL source with privilege"): @@ -358,7 +356,7 @@ def MySQL(self, privilege, grant_target_name, user_name, node=None): node.query(f"GRANT {privilege} ON *.* TO {grant_target_name}") with Then("I check the user can use the MySQL source"): - node.query(f"CREATE TABLE {table_name} (x String) ENGINE=MySQL()", settings = [("user", f"{user_name}")], + node.query(f"CREATE TABLE {table_name} (x String) ENGINE=MySQL('127.0.0.1')", settings = [("user", f"{user_name}")], exitcode=42, message='Exception: Storage') with Scenario("MySQL source with revoked privilege"): @@ -370,7 +368,7 @@ def MySQL(self, privilege, grant_target_name, user_name, node=None): node.query(f"REVOKE {privilege} ON *.* FROM {grant_target_name}") with Then("I check the user cannot use the MySQL source"): - node.query(f"CREATE TABLE {table_name} (x String) ENGINE=MySQL()", settings=[("user",user_name)], + 
node.query(f"CREATE TABLE {table_name} (x String) ENGINE=MySQL('127.0.0.1')", settings=[("user",user_name)], exitcode=exitcode, message=message) @TestSuite @@ -441,7 +439,7 @@ def ODBC(self, privilege, grant_target_name, user_name, node=None): node.query(f"GRANT USAGE ON *.* TO {grant_target_name}") with Then("I check the user can't use the ODBC source"): - node.query(f"CREATE TABLE {table_name} (x String) ENGINE=ODBC()", settings=[("user",user_name)], + node.query(f"CREATE TABLE {table_name} (x String) ENGINE=ODBC('127.0.0.1')", settings=[("user",user_name)], exitcode=exitcode, message=message) with Scenario("ODBC source with privilege"): @@ -450,7 +448,7 @@ def ODBC(self, privilege, grant_target_name, user_name, node=None): node.query(f"GRANT {privilege} ON *.* TO {grant_target_name}") with Then("I check the user can use the ODBC source"): - node.query(f"CREATE TABLE {table_name} (x String) ENGINE=ODBC()", settings = [("user", f"{user_name}")], + node.query(f"CREATE TABLE {table_name} (x String) ENGINE=ODBC('127.0.0.1')", settings = [("user", f"{user_name}")], exitcode=42, message='Exception: Storage') with Scenario("ODBC source with revoked privilege"): @@ -462,7 +460,7 @@ def ODBC(self, privilege, grant_target_name, user_name, node=None): node.query(f"REVOKE {privilege} ON *.* FROM {grant_target_name}") with Then("I check the user cannot use the ODBC source"): - node.query(f"CREATE TABLE {table_name} (x String) ENGINE=ODBC()", settings=[("user",user_name)], + node.query(f"CREATE TABLE {table_name} (x String) ENGINE=ODBC('127.0.0.1')", settings=[("user",user_name)], exitcode=exitcode, message=message) @TestSuite @@ -533,7 +531,7 @@ def JDBC(self, privilege, grant_target_name, user_name, node=None): node.query(f"GRANT USAGE ON *.* TO {grant_target_name}") with Then("I check the user can't use the JDBC source"): - node.query(f"CREATE TABLE {table_name} (x String) ENGINE=JDBC()", settings=[("user",user_name)], + node.query(f"CREATE TABLE {table_name} (x String) ENGINE=JDBC('127.0.0.1')", settings=[("user",user_name)], exitcode=exitcode, message=message) with Scenario("JDBC source with privilege"): @@ -542,7 +540,7 @@ def JDBC(self, privilege, grant_target_name, user_name, node=None): node.query(f"GRANT {privilege} ON *.* TO {grant_target_name}") with Then("I check the user can use the JDBC source"): - node.query(f"CREATE TABLE {table_name} (x String) ENGINE=JDBC()", settings = [("user", f"{user_name}")], + node.query(f"CREATE TABLE {table_name} (x String) ENGINE=JDBC('127.0.0.1')", settings = [("user", f"{user_name}")], exitcode=42, message='Exception: Storage') with Scenario("JDBC source with revoked privilege"): @@ -554,7 +552,7 @@ def JDBC(self, privilege, grant_target_name, user_name, node=None): node.query(f"REVOKE {privilege} ON *.* FROM {grant_target_name}") with Then("I check the user cannot use the JDBC source"): - node.query(f"CREATE TABLE {table_name} (x String) ENGINE=JDBC()", settings=[("user",user_name)], + node.query(f"CREATE TABLE {table_name} (x String) ENGINE=JDBC('127.0.0.1')", settings=[("user",user_name)], exitcode=exitcode, message=message) @TestSuite @@ -625,7 +623,7 @@ def HDFS(self, privilege, grant_target_name, user_name, node=None): node.query(f"GRANT USAGE ON *.* TO {grant_target_name}") with Then("I check the user can't use the HDFS source"): - node.query(f"CREATE TABLE {table_name} (x String) ENGINE=HDFS()", settings=[("user",user_name)], + node.query(f"CREATE TABLE {table_name} (x String) ENGINE=HDFS('127.0.0.1')", settings=[("user",user_name)], 
exitcode=exitcode, message=message) with Scenario("HDFS source with privilege"): @@ -634,7 +632,7 @@ def HDFS(self, privilege, grant_target_name, user_name, node=None): node.query(f"GRANT {privilege} ON *.* TO {grant_target_name}") with Then("I check the user can use the HDFS source"): - node.query(f"CREATE TABLE {table_name} (x String) ENGINE=HDFS()", settings = [("user", f"{user_name}")], + node.query(f"CREATE TABLE {table_name} (x String) ENGINE=HDFS('127.0.0.1')", settings = [("user", f"{user_name}")], exitcode=42, message='Exception: Storage') with Scenario("HDFS source with revoked privilege"): @@ -646,7 +644,7 @@ def HDFS(self, privilege, grant_target_name, user_name, node=None): node.query(f"REVOKE {privilege} ON *.* FROM {grant_target_name}") with Then("I check the user cannot use the HDFS source"): - node.query(f"CREATE TABLE {table_name} (x String) ENGINE=HDFS()", settings=[("user",user_name)], + node.query(f"CREATE TABLE {table_name} (x String) ENGINE=HDFS('127.0.0.1')", settings=[("user",user_name)], exitcode=exitcode, message=message) @TestSuite @@ -717,7 +715,7 @@ def S3(self, privilege, grant_target_name, user_name, node=None): node.query(f"GRANT USAGE ON *.* TO {grant_target_name}") with Then("I check the user can't use the S3 source"): - node.query(f"CREATE TABLE {table_name} (x String) ENGINE=S3()", settings=[("user",user_name)], + node.query(f"CREATE TABLE {table_name} (x String) ENGINE=S3('127.0.0.1')", settings=[("user",user_name)], exitcode=exitcode, message=message) with Scenario("S3 source with privilege"): @@ -726,7 +724,7 @@ def S3(self, privilege, grant_target_name, user_name, node=None): node.query(f"GRANT {privilege} ON *.* TO {grant_target_name}") with Then("I check the user can use the S3 source"): - node.query(f"CREATE TABLE {table_name} (x String) ENGINE=S3()", settings = [("user", f"{user_name}")], + node.query(f"CREATE TABLE {table_name} (x String) ENGINE=S3('127.0.0.1')", settings = [("user", f"{user_name}")], exitcode=42, message='Exception: Storage') with Scenario("S3 source with revoked privilege"): @@ -738,7 +736,7 @@ def S3(self, privilege, grant_target_name, user_name, node=None): node.query(f"REVOKE {privilege} ON *.* FROM {grant_target_name}") with Then("I check the user cannot use the S3 source"): - node.query(f"CREATE TABLE {table_name} (x String) ENGINE=S3()", settings=[("user",user_name)], + node.query(f"CREATE TABLE {table_name} (x String) ENGINE=S3('127.0.0.1')", settings=[("user",user_name)], exitcode=exitcode, message=message) @TestFeature diff --git a/tests/testflows/rbac/tests/privileges/system/reload.py b/tests/testflows/rbac/tests/privileges/system/reload.py index bb8f91a0dd4..08df5803287 100644 --- a/tests/testflows/rbac/tests/privileges/system/reload.py +++ b/tests/testflows/rbac/tests/privileges/system/reload.py @@ -20,12 +20,12 @@ def dict_setup(node, table_name, dict_name): yield finally: - with Finally("I drop the table", flags=TE): - node.query(f"DROP TABLE IF EXISTS {table_name}") - - with And("I drop the dictionary", flags=TE): + with Finally("I drop the dictionary", flags=TE): node.query(f"DROP DICTIONARY IF EXISTS default.{dict_name}") + with And("I drop the table", flags=TE): + node.query(f"DROP TABLE IF EXISTS {table_name}") + @TestSuite def config_privileges_granted_directly(self, node=None): """Check that a user is able to execute `SYSTEM RELOAD CONFIG` if and only if diff --git a/tests/testflows/rbac/tests/syntax/grant_role.py b/tests/testflows/rbac/tests/syntax/grant_role.py index af69e5f3751..baede2445ee 100755 --- 
a/tests/testflows/rbac/tests/syntax/grant_role.py +++ b/tests/testflows/rbac/tests/syntax/grant_role.py @@ -58,7 +58,7 @@ def feature(self, node="clickhouse1"): RQ_SRS_006_RBAC_Grant_Role("1.0")]): with setup(0,0): with When("I grant nonexistent role to a nonexistent user"): - exitcode, message = errors.role_not_found_in_disk(name="role0") + exitcode, message = errors.role_not_found_in_disk(name="user0") node.query("GRANT role0 TO user0", exitcode=exitcode, message=message) with Scenario("I grant a role to multiple users", requirements=[ @@ -112,4 +112,4 @@ def feature(self, node="clickhouse1"): with setup(1,1): with When("I grant the role to the user"): exitcode, message = errors.cluster_not_found("fake_cluster") - node.query("GRANT ON CLUSTER fake_cluster role0 TO user0", exitcode=exitcode, message=message) \ No newline at end of file + node.query("GRANT ON CLUSTER fake_cluster role0 TO user0", exitcode=exitcode, message=message) diff --git a/tests/testflows/rbac/tests/syntax/revoke_role.py b/tests/testflows/rbac/tests/syntax/revoke_role.py index 0642dd6b0d4..6fe72b14f7e 100755 --- a/tests/testflows/rbac/tests/syntax/revoke_role.py +++ b/tests/testflows/rbac/tests/syntax/revoke_role.py @@ -70,7 +70,7 @@ def feature(self, node="clickhouse1"): RQ_SRS_006_RBAC_Revoke_Role("1.0")]): with setup(0,0): with When("I revoke nonexistent role from a nonexistent user"): - exitcode, message = errors.role_not_found_in_disk(name="role0") + exitcode, message = errors.role_not_found_in_disk(name="user0") node.query("REVOKE role0 FROM user0", exitcode=exitcode, message=message) with Scenario("I revoke a role from multiple users", requirements=[ diff --git a/tests/testflows/rbac/tests/views/feature.py b/tests/testflows/rbac/tests/views/feature.py index fd0241f389f..67f0dadb862 100755 --- a/tests/testflows/rbac/tests/views/feature.py +++ b/tests/testflows/rbac/tests/views/feature.py @@ -6,11 +6,10 @@ from rbac.helper.common import * @Name("views") def feature(self): - tasks = [] with Pool(3) as pool: try: - run_scenario(pool, tasks, Feature(test=load("rbac.tests.views.view", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.views.live_view", "feature")), {}) - run_scenario(pool, tasks, Feature(test=load("rbac.tests.views.materialized_view", "feature")), {}) + Feature(test=load("rbac.tests.views.view", "feature"), parallel=True, executor=pool) + Feature(test=load("rbac.tests.views.live_view", "feature"), parallel=True, executor=pool) + Feature(test=load("rbac.tests.views.materialized_view", "feature"), parallel=True, executor=pool) finally: - join(tasks) + join() diff --git a/tests/testflows/rbac/tests/views/live_view.py b/tests/testflows/rbac/tests/views/live_view.py index ebb148b66e3..edda654d949 100755 --- a/tests/testflows/rbac/tests/views/live_view.py +++ b/tests/testflows/rbac/tests/views/live_view.py @@ -1122,19 +1122,16 @@ def refresh_with_revoked_privilege(self, grant_target_name, user_name, node=None RQ_SRS_006_RBAC_LiveView("1.0"), ) @Name("live view") -def feature(self, stress=None, parallel=None, node="clickhouse1"): +def feature(self, stress=None, node="clickhouse1"): self.context.node = self.context.cluster.node(node) if stress is not None: self.context.stress = stress - if parallel is not None: - self.context.stress = parallel with allow_experimental_live_view(self.context.node): - tasks = [] with Pool(3) as pool: try: for suite in loads(current_module(), Suite): - run_scenario(pool, tasks, suite) + Suite(test=suite, parallel=True, executor=pool) finally: - 
join(tasks) + join() diff --git a/tests/testflows/rbac/tests/views/materialized_view.py b/tests/testflows/rbac/tests/views/materialized_view.py index d2192e81cf7..0464332d327 100755 --- a/tests/testflows/rbac/tests/views/materialized_view.py +++ b/tests/testflows/rbac/tests/views/materialized_view.py @@ -826,6 +826,7 @@ def select_with_revoked_select_privilege(self, user_name, grant_target_name, nod """Grant and revoke SELECT privilege on a view and check the user is unable to SELECT from it. """ view_name = f"view_{getuid()}" + exitcode, message = errors.not_enough_privileges(name=f"{user_name}") if node is None: node = self.context.node @@ -2259,18 +2260,15 @@ def insert_on_target_table(self, grant_target_name, user_name, node=None): RQ_SRS_006_RBAC_MaterializedView("1.0"), ) @Name("materialized view") -def feature(self, stress=None, parallel=None, node="clickhouse1"): +def feature(self, stress=None, node="clickhouse1"): self.context.node = self.context.cluster.node(node) if stress is not None: self.context.stress = stress - if parallel is not None: - self.context.stress = parallel - tasks = [] with Pool(3) as pool: try: for suite in loads(current_module(), Suite): - run_scenario(pool, tasks, suite) + Suite(test=suite, parallel=True, executor=pool) finally: - join(tasks) + join() diff --git a/tests/testflows/rbac/tests/views/view.py b/tests/testflows/rbac/tests/views/view.py index 3fd12164ad6..f4fb4550a75 100755 --- a/tests/testflows/rbac/tests/views/view.py +++ b/tests/testflows/rbac/tests/views/view.py @@ -627,6 +627,7 @@ def select_with_revoked_select_privilege(self, user_name, grant_target_name, nod """Grant and revoke SELECT privilege on a view and check the user is unable to SELECT from it. """ view_name = f"view_{getuid()}" + exitcode, message = errors.not_enough_privileges(name=f"{user_name}") if node is None: node = self.context.node @@ -1141,18 +1142,15 @@ def drop_with_revoked_privilege(self, grant_target_name, user_name, node=None): RQ_SRS_006_RBAC_View("1.0"), ) @Name("view") -def feature(self, stress=None, parallel=None, node="clickhouse1"): +def feature(self, stress=None, node="clickhouse1"): self.context.node = self.context.cluster.node(node) if stress is not None: self.context.stress = stress - if parallel is not None: - self.context.stress = parallel - tasks = [] with Pool(3) as pool: try: for suite in loads(current_module(), Suite): - run_scenario(pool, tasks, suite) + Suite(test=suite, parallel=True, executor=pool) finally: - join(tasks) + join() diff --git a/tests/testflows/regression.py b/tests/testflows/regression.py index bae1c5db90a..c803d9ef210 100755 --- a/tests/testflows/regression.py +++ b/tests/testflows/regression.py @@ -19,14 +19,14 @@ def regression(self, local, clickhouse_binary_path, stress=None): with Pool(8) as pool: try: Feature(test=load("example.regression", "regression"), parallel=True, executor=pool)(**args) - # run_scenario(pool, tasks, Feature(test=load("ldap.regression", "regression")), args) - # run_scenario(pool, tasks, Feature(test=load("rbac.regression", "regression")), args) - Feature(test=load("aes_encryption.regression", "regression"), parallel=True, executor=pool)(**args) - # Feature(test=load("map_type.regression", "regression"), parallel=True, executor=pool)(**args) - Feature(test=load("window_functions.regression", "regression"), parallel=True, executor=pool)(**args) + Feature(test=load("ldap.regression", "regression"), parallel=True, executor=pool)(**args) + Feature(test=load("rbac.regression", "regression"), parallel=True, 
executor=pool)(**args) + Feature(test=load("aes_encryption.regression", "regression"), parallel=True, executor=pool)(**args) # TODO: fix it! + # Feature(test=load("map_type.regression", "regression"), parallel=True, executor=pool)(**args) # TODO: fix it! + Feature(test=load("window_functions.regression", "regression"), parallel=True, executor=pool)(**args) # TODO: fix it! Feature(test=load("datetime64_extended_range.regression", "regression"), parallel=True, executor=pool)(**args) Feature(test=load("kerberos.regression", "regression"), parallel=True, executor=pool)(**args) - Feature(test=load("extended_precision_data_types.regression", "regression"), parallel=True, executor=pool)(**args) + Feature(test=load("extended_precision_data_types.regression", "regression"), parallel=True, executor=pool)(**args) # TODO: fix it! finally: join() diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt index 706532e2ac9..7822f47ff88 100644 --- a/utils/CMakeLists.txt +++ b/utils/CMakeLists.txt @@ -2,7 +2,7 @@ if (USE_CLANG_TIDY) set (CMAKE_CXX_CLANG_TIDY "${CLANG_TIDY_PATH}") endif () -if(MAKE_STATIC_LIBRARIES) +if(USE_STATIC_LIBRARIES) set(MAX_LINKER_MEMORY 3500) else() set(MAX_LINKER_MEMORY 2500) diff --git a/utils/c++expr b/utils/c++expr new file mode 100755 index 00000000000..c498e780d05 --- /dev/null +++ b/utils/c++expr @@ -0,0 +1,301 @@ +#!/usr/bin/env bash +set -e + +usage() { + cat <&2 +USAGE: c++expr [-c CXX | -C | -I] [-i INCLUDE] [-l LIB] [-b STEPS] [-t TESTS] [-o FILE] [-O CXX_OPTS...] [-g 'GLOBAL CODE'] 'MAIN CODE' +OPTIONS: + -c CXX use specified c++ compiler + -C use cmake + -I integrate into ClickHouse build tree in current directory + -i INC add #include + -l LIB link against LIB (only for -I or -C) + -b STEPS_NUM make program to benchmark specified code snippet and run tests with STEPS_NUM each + -b perf-top run infinite benchmark and show perf top + -t TESTS_NUM make program to benchmark specified code snippet and run TESTS_NUM tests + -o FILE do not run, just save binary executable file + -O CXX_OPTS forward option compiler (e.g. -O "-O3 -std=c++20") +EXAMPLES: + $ c++expr -g 'int fib(int n) { return n < 2 ? n : fib(n-2) + fib(n-1); }' 'OUT(fib(10)) OUT(fib(20)) OUT(fib(30))' + fib(10) -> 55 + fib(20) -> 6765 + fib(30) -> 832040 + $ c++expr -I -i Interpreters/Context.h 'OUT(sizeof(DB::Context))' + sizeof(DB::Context) -> 7776 + $ c++expr -I -i Common/Stopwatch.h -b 10000 'Stopwatch sw;' + Steps per test: 10000 + Test #0: 0.0178 us 5.61798e+07 sps + ... 
+ Test #4: 0.0179 us 5.58659e+07 sps + Average: 0.0179 us 5.58659e+07 sps +EOF + exit 1 +} + +SOURCE_FILE=main.cpp +GLOBAL= +OUTPUT_EXECUTABLE= +INCS="vector iostream typeinfo cstdlib cmath sys/time.h" +LIBS="" +BENCHMARK_STEPS=0 +RUN_PERFTOP= +BENCHMARK_TESTS=5 +USE_CMAKE= +USE_CLICKHOUSE= +CXX=g++ +CXX_OPTS= +CMD_PARAMS= + +# +# Parse command line +# + +if [ "$1" == "--help" ] || [ -z "$1" ]; then usage; fi +while getopts "vc:CIi:l:b:t:o:O:g:" OPT; do + case "$OPT" in + v) set -x; ;; + c) CXX="$OPTARG"; ;; + C) USE_CMAKE=y; ;; + I) USE_CLICKHOUSE=y; LIBS="$LIBS clickhouse_common_io"; ;; + i) INCS="$INCS $OPTARG"; ;; + l) LIBS="$LIBS $OPTARG"; ;; + b) if [ "$OPTARG" = perf-top ]; then BENCHMARK_STEPS=-1; RUN_PERFTOP=y; else BENCHMARK_STEPS="$OPTARG"; fi; ;; + t) BENCHMARK_TESTS="$OPTARG"; ;; + o) OUTPUT_EXECUTABLE="$OPTARG"; ;; + O) CXX_OPTS="$CXX_OPTS $OPTARG"; ;; + g) GLOBAL="$OPTARG"; ;; + esac +done +shift $(( $OPTIND - 1 )) + +# +# Positional arguments +# + +EXPR=$1 +shift + +if [ -z "$EXPR" ]; then usage; fi + +# +# Arguments forwarded to program should go after main code and before -- +# + +while [ -n "$1" ] && [ "$1" != "--" ]; do + CMD_PARAMS="$CMD_PARAMS $1" + shift +done +if [ "$1" == "--" ]; then shift; fi + +# +# Setup workdir +# + +find_clickhouse_root () { + local DIR="`pwd`" + while [ $DIR != "/" ]; do + if [ ! -e "$DIR/CMakeLists.txt" ]; then + echo "error: $DIR has no CMakeLists.txt" + return 1 + fi + if grep "project(ClickHouse)" "$DIR/CMakeLists.txt" >/dev/null 2>&1; then + echo $DIR + return 0 + fi + DIR="`dirname $DIR`" + done + echo "error: unable to find Clickhouse root folder" + return 1 +} + +find_clickhouse_build () { + local CLICKHOUSE_ROOT="`find_clickhouse_root`" + if [ -e "$CLICKHOUSE_ROOT/build/CMakeCache.txt" ]; then + echo "$CLICKHOUSE_ROOT/build" + return 0 + fi + echo "error: $CLICKHOUSE_ROOT/build/CMakeCache.txt doesn't exist" + return 1 +} + +CALL_DIR=`pwd` +EXECUTABLE=cppexpr_$$ +EXECUTABLE_DIR=. + +if [ -n "$USE_CLICKHOUSE" ]; then + SUBDIR=cppexpr_$$ + WORKDIR=$CALL_DIR/$SUBDIR + if [ ! 
-e $CALL_DIR/CMakeLists.txt ]; then + echo "error: $CALL_DIR/CMakeLists.txt is required for integration" >&2 + exit 1 + fi + + CLICKHOUSE_ROOT="`find_clickhouse_root`" + BUILD_ROOT="`find_clickhouse_build`" + CLICKHOUSE_PATH="${WORKDIR/$CLICKHOUSE_ROOT}" + EXECUTABLE_DIR="${BUILD_ROOT}${CLICKHOUSE_PATH}" + + if [ -z "$CLICKHOUSE_ROOT" ] || [ -z "$BUILD_ROOT" ] || [ -z "$CLICKHOUSE_PATH" ]; then + echo "error: unable to locate ClickHouse" >&2 + exit 1 + fi + + cp $CALL_DIR/CMakeLists.txt $CALL_DIR/CMakeLists.txt.backup.$$ + echo "add_subdirectory ($SUBDIR)" >>$CALL_DIR/CMakeLists.txt + cleanup() { + mv $CALL_DIR/CMakeLists.txt.backup.$$ $CALL_DIR/CMakeLists.txt + rm -rf $WORKDIR + rm -rf ${BUILD_ROOT}${CLICKHOUSE_PATH} + } +else + WORKDIR=/var/tmp/cppexpr_$$ + cleanup() { + rm -rf $WORKDIR + } +fi + +mkdir -p $WORKDIR +cd $WORKDIR + +# +# Generate CMakeLists.txt +# +if [ -n "$USE_CMAKE" ]; then + cat <>CMakeLists.txt +project(CppExpr) +SET(PROJECT_NAME CppExpr) +SET(CMAKE_INCLUDE_CURRENT_DIR TRUE) +cmake_minimum_required(VERSION 2.8) +set(CMAKE_CXX_FLAGS -fPIC) +set(CMAKE_C_FLAGS -fPIC) +set(CMAKE_BUILD_TYPE Release) +set(SOURCES $SOURCE_FILE) +add_executable($EXECUTABLE \${SOURCES}) +EOF +fi + +# +# Generate CMakeLists.txt for integration +# +if [ -n "$USE_CLICKHOUSE" ]; then + cat <>CMakeLists.txt +add_executable($EXECUTABLE $SOURCE_FILE) +EOF +fi + +# +# Add libraries to CMakeLists.txt +# +if [ -n "$LIBS" ]; then + cat <>CMakeLists.txt +target_link_libraries($EXECUTABLE PRIVATE $LIBS) +EOF +fi + +# +# Generate source code +# +>$SOURCE_FILE +for INC in $INCS; do + echo "#include <$INC>" >> $SOURCE_FILE +done +cat <>$SOURCE_FILE + +#define OUT(expr) std::cout << #expr << " -> " << (expr) << std::endl; +size_t max_tests = $BENCHMARK_TESTS; +size_t max_steps = $BENCHMARK_STEPS; +$GLOBAL +int main(int argc, char** argv) { + (void)argc; (void)argv; + try { +EOF + +if [ $BENCHMARK_STEPS -eq 0 ]; then + cat <>$SOURCE_FILE + $EXPR +EOF +else + cat <>$SOURCE_FILE + std::cout << "Steps per test: " << max_steps << std::endl; + if (max_steps == 0) max_steps = 1; + double total = 0.0; + for (size_t test = 0; test < max_tests; test++) { + timeval beg, end; + gettimeofday(&beg, nullptr); + for (size_t step = 0; step < max_steps; step++) { + asm volatile("" ::: "memory"); + $EXPR + } + gettimeofday(&end, nullptr); + double interval = (end.tv_sec - beg.tv_sec)*1e6 + (end.tv_usec - beg.tv_usec); + std::cout << "Test #" << test << ": " << interval / max_steps << " us\t" << max_steps * 1e6 / interval << " sps" << std::endl; + total += interval; + } + std::cout << "Average: " << total / max_tests / max_steps << " us\t" << max_steps * 1e6 / (total / max_tests) << " sps" << std::endl; +EOF +fi + +cat <>$SOURCE_FILE + return 0; + } catch (std::exception& e) { + std::cerr << "unhandled exception (" << typeid(e).name() << "):" << e.what() << std::endl; + } catch (...) { + std::cerr << "unknown unhandled exception\n"; + } + return 1; +} +#ifdef OUT +#undef OUT +#endif +EOF + +# +# Compile +# +if [ -n "$USE_CMAKE" ]; then + if ! (cmake . && make); then + cat -n $SOURCE_FILE + cleanup + exit 1 + fi +elif [ -n "$USE_CLICKHOUSE" ]; then + if ! (cd $BUILD_ROOT && ninja $EXECUTABLE) >stdout.log 2>stderr.log; then + cat stdout.log + cat stderr.log >&2 + cat -n $SOURCE_FILE + cleanup + exit 1 + fi +else + RET=0 + $CXX $CXX_OPTS -I$CALL_DIR -o $EXECUTABLE $SOURCE_FILE || RET=$? 
+ if [ $RET -ne 0 ]; then + cat -n $SOURCE_FILE + cleanup + exit $RET + fi +fi + +# +# Execute +# +RET=0 +if [ -z "$OUTPUT_EXECUTABLE" ]; then + if [ -z "$RUN_PERFTOP" ]; then + "$@" $EXECUTABLE_DIR/$EXECUTABLE $CMD_PARAMS || RET=$? + else + "$@" $EXECUTABLE_DIR/$EXECUTABLE $CMD_PARAMS & + PID=$! + perf top -p $PID + kill $PID + fi +else + cp $EXECUTABLE_DIR/$EXECUTABLE $CALL_DIR/$OUTPUT_EXECUTABLE +fi + +# +# Cleanup +# +cleanup +echo "Exit code: $RET" +exit $RET diff --git a/utils/check-style/check-style b/utils/check-style/check-style index d71ead57477..d178778a410 100755 --- a/utils/check-style/check-style +++ b/utils/check-style/check-style @@ -184,7 +184,9 @@ tables_with_database_column=( tests_with_database_column=( $( find $ROOT_PATH/tests/queries -iname '*.sql' -or -iname '*.sh' -or -iname '*.py' -or -iname '*.j2' | grep -vP $EXCLUDE_DIRS | - xargs grep --with-filename $(printf -- "-e %s " "${tables_with_database_column[@]}") | cut -d: -f1 | sort -u + xargs grep --with-filename $(printf -- "-e %s " "${tables_with_database_column[@]}") | + grep -v -e ':--' -e ':#' | + cut -d: -f1 | sort -u ) ) for test_case in "${tests_with_database_column[@]}"; do grep -qE database.*currentDatabase "$test_case" || { @@ -327,6 +329,11 @@ then fi # Check that there is no system-wide libraries/headers in use. +# +# NOTE: it is better to override find_path/find_library in cmake, but right now +# it is not possible, see [1] for the reference. +# +# [1]: git grep --recurse-submodules -e find_library -e find_path contrib if git grep -e find_path -e find_library -- :**CMakeLists.txt; then echo "There is find_path/find_library usage. ClickHouse should use everything bundled. Consider adding one more contrib module." fi diff --git a/utils/clickhouse-diagnostics/README.md b/utils/clickhouse-diagnostics/README.md index 991efefdf5a..a6f8ed298dd 100644 --- a/utils/clickhouse-diagnostics/README.md +++ b/utils/clickhouse-diagnostics/README.md @@ -1,3 +1,18 @@ +## Download + +Cloning whole repo will take a lot of time and disk space. The following commands will download only this directory. + +* Requires Git 2.19 + +``` +# mkdir chdiag +# cd chdiag +# git clone --depth 1 --filter=blob:none --no-checkout https://github.com/ClickHouse/ClickHouse +# cd ClickHouse +# git sparse-checkout set utils/clickhouse-diagnostics +# git checkout master -- utils/clickhouse-diagnostics +``` + ## Installation ``` diff --git a/utils/clickhouse-diagnostics/clickhouse-diagnostics b/utils/clickhouse-diagnostics/clickhouse-diagnostics index ffddee0bdc4..83c0af9cd11 100644 --- a/utils/clickhouse-diagnostics/clickhouse-diagnostics +++ b/utils/clickhouse-diagnostics/clickhouse-diagnostics @@ -953,7 +953,7 @@ def parse_version(version): """ Parse version string. 
""" - return [int(x) for x in version.strip().split('.')] + return [int(x) for x in version.strip().split('.') if x.isnumeric()] if __name__ == '__main__': diff --git a/utils/keeper-bench/Generator.cpp b/utils/keeper-bench/Generator.cpp index 852de07f2e1..77185813a2a 100644 --- a/utils/keeper-bench/Generator.cpp +++ b/utils/keeper-bench/Generator.cpp @@ -48,8 +48,59 @@ std::string generateRandomData(size_t size) return generateRandomString(size); } +void removeRecursive(Coordination::ZooKeeper & zookeeper, const std::string & path) +{ + namespace fs = std::filesystem; + + auto promise = std::make_shared>(); + auto future = promise->get_future(); + + Strings children; + auto list_callback = [promise, &children] (const ListResponse & response) + { + children = response.names; + + promise->set_value(); + }; + zookeeper.list(path, list_callback, nullptr); + future.get(); + + while (!children.empty()) + { + Coordination::Requests ops; + for (size_t i = 0; i < MULTI_BATCH_SIZE && !children.empty(); ++i) + { + removeRecursive(zookeeper, fs::path(path) / children.back()); + ops.emplace_back(makeRemoveRequest(fs::path(path) / children.back(), -1)); + children.pop_back(); + } + auto multi_promise = std::make_shared>(); + auto multi_future = multi_promise->get_future(); + + auto multi_callback = [multi_promise] (const MultiResponse &) + { + multi_promise->set_value(); + }; + zookeeper.multi(ops, multi_callback); + multi_future.get(); + } + auto remove_promise = std::make_shared>(); + auto remove_future = remove_promise->get_future(); + + auto remove_callback = [remove_promise] (const RemoveResponse &) + { + remove_promise->set_value(); + }; + + zookeeper.remove(path, -1, remove_callback); + remove_future.get(); +} + + void CreateRequestGenerator::startup(Coordination::ZooKeeper & zookeeper) { + removeRecursive(zookeeper, path_prefix); + auto promise = std::make_shared>(); auto future = promise->get_future(); auto create_callback = [promise] (const CreateResponse & response) @@ -85,6 +136,33 @@ ZooKeeperRequestPtr CreateRequestGenerator::generate() } +void SetRequestGenerator::startup(Coordination::ZooKeeper & zookeeper) +{ + removeRecursive(zookeeper, path_prefix); + + auto promise = std::make_shared>(); + auto future = promise->get_future(); + auto create_callback = [promise] (const CreateResponse & response) + { + if (response.error != Coordination::Error::ZOK) + promise->set_exception(std::make_exception_ptr(zkutil::KeeperException(response.error))); + else + promise->set_value(); + }; + zookeeper.create(path_prefix, "", false, false, default_acls, create_callback); + future.get(); +} + +ZooKeeperRequestPtr SetRequestGenerator::generate() +{ + auto request = std::make_shared(); + request->path = path_prefix; + request->data = generateRandomData(data_size); + + return request; +} + + void GetRequestGenerator::startup(Coordination::ZooKeeper & zookeeper) { auto promise = std::make_shared>(); @@ -233,6 +311,11 @@ std::unique_ptr getGenerator(const std::string & name) { return std::make_unique("/list_generator", 100000, 5); } + else if (name == "set_small_data") + { + return std::make_unique("/set_generator", 5); + } + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Unknown generator {}", name); } diff --git a/utils/keeper-bench/Generator.h b/utils/keeper-bench/Generator.h index d6cc0eec335..1ff01b25ed4 100644 --- a/utils/keeper-bench/Generator.h +++ b/utils/keeper-bench/Generator.h @@ -104,4 +104,23 @@ private: std::optional paths_length; }; +class SetRequestGenerator final : public IGenerator +{ 
+public:
+    explicit SetRequestGenerator(
+        std::string path_prefix_ = "/set_generator",
+        uint64_t data_size_ = 5)
+        : path_prefix(path_prefix_)
+        , data_size(data_size_)
+    {}
+
+    void startup(Coordination::ZooKeeper & zookeeper) override;
+    Coordination::ZooKeeperRequestPtr generate() override;
+
+private:
+    std::string path_prefix;
+    uint64_t data_size;
+};
+
+
 std::unique_ptr<IGenerator> getGenerator(const std::string & name);
diff --git a/utils/keeper-bench/main.cpp b/utils/keeper-bench/main.cpp
index 378d7c2f6e4..3aa44422463 100644
--- a/utils/keeper-bench/main.cpp
+++ b/utils/keeper-bench/main.cpp
@@ -20,7 +20,7 @@ int main(int argc, char *argv[])
     boost::program_options::options_description desc = createOptionsDescription("Allowed options", getTerminalWidth());
     desc.add_options()
         ("help", "produce help message")
-        ("generator", value<std::string>()->default_value("create_small_data"), "query to execute")
+        ("generator", value<std::string>()->default_value("set_small_data"), "query to execute")
         ("concurrency,c", value()->default_value(1), "number of parallel queries")
         ("delay,d", value()->default_value(1), "delay between intermediate reports in seconds (set 0 to disable reports)")
         ("iterations,i", value()->default_value(0), "amount of queries to be executed")
diff --git a/utils/keeper-data-dumper/main.cpp b/utils/keeper-data-dumper/main.cpp
index 485f8c09faf..0f86d34d334 100644
--- a/utils/keeper-data-dumper/main.cpp
+++ b/utils/keeper-data-dumper/main.cpp
@@ -37,9 +37,9 @@ void dumpMachine(std::shared_ptr<KeeperStateMachine> machine)
         for (const auto & child : value.children)
         {
             if (key == "/")
-                keys.push(key + child);
+                keys.push(key + child.toString());
             else
-                keys.push(key + "/" + child);
+                keys.push(key + "/" + child.toString());
         }
     }
     std::cout << std::flush;
diff --git a/website/blog/en/2016/evolution-of-data-structures-in-yandex-metrica.md b/website/blog/en/2016/evolution-of-data-structures-in-yandex-metrica.md
index c9ea238bf5e..3e717e467c1 100644
--- a/website/blog/en/2016/evolution-of-data-structures-in-yandex-metrica.md
+++ b/website/blog/en/2016/evolution-of-data-structures-in-yandex-metrica.md
@@ -3,6 +3,7 @@ title: 'Evolution of Data Structures in Yandex.Metrica'
 image: 'https://blog-images.clickhouse.com/en/2016/evolution-of-data-structures-in-yandex-metrica/main.jpg'
 date: '2016-12-13'
 tags: ['Yandex.Metrica', 'data structures', 'LSM tree', 'columnar storage']
+author: 'Alexey Milovidov'
 ---
 
 [Yandex.Metrica](https://metrica.yandex.com/) takes in a stream of data representing events that took place on sites or on apps. Our task is to keep this data and present it in an analyzable form. The real challenge lies in trying to determine what form the processed results should be saved in so that they are easy to work with. During the development process, we had to completely change our approach to data storage organization several times. We started with MyISAM tables, then used LSM-trees and eventually came up with column-oriented database, ClickHouse.
@@ -104,5 +105,3 @@ Effective hardware utilization is very important to us. In our experience, when
 To maximize efficiency, it's important to customize your solution to meet the needs of specific type of workload. There is no data structure that copes well with completely different scenarios. For example, it's clear that key-value databases don't work for analytical queries. The greater the load on the system, the narrower the specialization required. One should not be afraid to use completely different data structures for different tasks.
We were able to set things up so that Yandex.Metrica's hardware was relatively inexpensive. This has allowed us to offer the service free of charge to even very large sites and mobile apps, even larger than Yanex‘s own, while competitors typically start asking for a paid subscription plan. - - diff --git a/website/blog/en/2016/yandex-opensources-clickhouse.md b/website/blog/en/2016/yandex-opensources-clickhouse.md index c2b01ba3125..16618824e5e 100644 --- a/website/blog/en/2016/yandex-opensources-clickhouse.md +++ b/website/blog/en/2016/yandex-opensources-clickhouse.md @@ -3,6 +3,7 @@ title: 'Yandex Opensources ClickHouse' image: 'https://blog-images.clickhouse.com/en/2016/yandex-opensources-clickhouse/main.jpg' date: '2016-06-15' tags: ['announcement', 'GitHub', 'license'] +author: 'Alexey Milovidov' --- Today [analytical DBMS ClickHouse](https://clickhouse.com/) initially developed internally at Yandex, became available to everyone. Source code is published on [GitHub](https://github.com/ClickHouse/ClickHouse) under Apache 2.0 license. diff --git a/website/blog/en/2017/clickhouse-at-data-scale-2017.md b/website/blog/en/2017/clickhouse-at-data-scale-2017.md index d7fcb035ee5..e288d6e455d 100644 --- a/website/blog/en/2017/clickhouse-at-data-scale-2017.md +++ b/website/blog/en/2017/clickhouse-at-data-scale-2017.md @@ -3,6 +3,7 @@ title: 'ClickHouse at Data@Scale 2017' image: 'https://blog-images.clickhouse.com/en/2017/clickhouse-at-data-scale-2017/main.jpg' date: '2017-06-15' tags: ['conference', 'Seattle', 'USA', 'America', 'events'] +author: 'Alexey Milovidov' --- ![iframe](https://www.youtube.com/embed/bSyQahMVZ7w) diff --git a/website/blog/en/2019/how-to-speed-up-lz4-decompression-in-clickhouse.md b/website/blog/en/2019/how-to-speed-up-lz4-decompression-in-clickhouse.md index d4a0923de37..0d754a3158d 100644 --- a/website/blog/en/2019/how-to-speed-up-lz4-decompression-in-clickhouse.md +++ b/website/blog/en/2019/how-to-speed-up-lz4-decompression-in-clickhouse.md @@ -3,6 +3,7 @@ title: 'How to speed up LZ4 decompression in ClickHouse?' image: 'https://blog-images.clickhouse.com/en/2019/how-to-speed-up-lz4-decompression-in-clickhouse/main.jpg' date: '2019-06-25' tags: ['performance', 'lz4', 'article', 'decompression'] +author: 'Alexey Milovidov' --- When you run queries in [ClickHouse](https://clickhouse.com/), you might notice that the profiler often shows the `LZ_decompress_fast` function near the top. What is going on? This question had us wondering how to choose the best compression algorithm. diff --git a/website/blog/en/2020/five-methods-for-database-obfuscation.md b/website/blog/en/2020/five-methods-for-database-obfuscation.md index b13c443ea40..02be447ba0c 100644 --- a/website/blog/en/2020/five-methods-for-database-obfuscation.md +++ b/website/blog/en/2020/five-methods-for-database-obfuscation.md @@ -3,6 +3,7 @@ title: 'Five Methods For Database Obfuscation' image: 'https://blog-images.clickhouse.com/en/2020/five-methods-for-database-obfuscation/main.jpg' date: '2020-01-27' tags: ['article', 'obfuscation'] +author: 'Alexey Milovidov' --- ClickHouse users already know that its biggest advantage is its high-speed processing of analytical queries. But claims like this need to be confirmed with reliable performance testing. 
diff --git a/website/blog/en/2020/package-repository-behind-cdn.md b/website/blog/en/2020/package-repository-behind-cdn.md index e6b9bafaa4e..77919f12266 100644 --- a/website/blog/en/2020/package-repository-behind-cdn.md +++ b/website/blog/en/2020/package-repository-behind-cdn.md @@ -3,6 +3,7 @@ title: 'Package Repository Behind CDN' image: 'https://blog-images.clickhouse.com/en/2020/package-repository-behind-cdn/main.jpg' date: '2020-07-02' tags: ['article', 'CDN', 'Cloudflare', 'repository', 'deb', 'rpm', 'tgz'] +author: 'Ivan Blinkov' --- On initial open-source launch, ClickHouse packages were published at an independent repository implemented on Yandex infrastructure. We'd love to use the default repositories of Linux distributions, but, unfortunately, they have their own strict rules on third-party library usage and software compilation options. These rules happen to contradict with how ClickHouse is produced. In 2018 ClickHouse was added to [official Debian repository](https://packages.debian.org/sid/clickhouse-server) as an experiment, but it didn't get much traction. Adaptation to those rules ended up producing more like a demo version of ClickHouse with crippled performance and limited features. @@ -68,4 +69,3 @@ Or you can take a look at all key charts for `repo.clickhouse.com` together on a * CDN is a must-have if you want people from all over the world to download some artifacts that you produce. Beware the huge pay-for-traffic bills from most CDN providers though. * Generic technical system metrics and drill-downs are a good starting point, but not always enough. * Serverless is not a myth. Nowadays it is indeed possible to build useful products by just integrating various infrastructure services together, without any dedicated servers to take care of. - diff --git a/website/blog/en/2020/pixel-benchmark.md b/website/blog/en/2020/pixel-benchmark.md index 6714c18a8e4..632a56d5bc6 100644 --- a/website/blog/en/2020/pixel-benchmark.md +++ b/website/blog/en/2020/pixel-benchmark.md @@ -2,7 +2,7 @@ title: 'Running ClickHouse on an Android phone' image: 'https://blog-images.clickhouse.com/en/2020/pixel-benchmark/main.jpg' date: '2020-07-16' -author: '[Alexander Kuzmenkov](https://github.com/akuzm)' +author: 'Alexander Kuzmenkov' tags: ['Android', 'benchmark', 'experiment'] --- diff --git a/website/blog/en/2020/the-clickhouse-community.md b/website/blog/en/2020/the-clickhouse-community.md index 8fc049955fc..3e5c614430f 100644 --- a/website/blog/en/2020/the-clickhouse-community.md +++ b/website/blog/en/2020/the-clickhouse-community.md @@ -2,7 +2,7 @@ title: 'The ClickHouse Community' image: 'https://blog-images.clickhouse.com/en/2020/the-clickhouse-community/clickhouse-community-history.png' date: '2020-12-10' -author: '[Robert Hodges](https://github.com/hodgesrm)' +author: 'Robert Hodges' tags: ['community', 'open source', 'telegram', 'meetup'] --- diff --git a/website/blog/en/2021/clickhouse-inc.md b/website/blog/en/2021/clickhouse-inc.md index f11020b6a8d..1f420ef1dba 100644 --- a/website/blog/en/2021/clickhouse-inc.md +++ b/website/blog/en/2021/clickhouse-inc.md @@ -2,7 +2,7 @@ title: 'Introducing ClickHouse, Inc.' 
image: 'https://blog-images.clickhouse.com/en/2021/clickhouse-inc/home.png' date: '2021-09-20' -author: '[Alexey Milovidov](https://github.com/alexey-milovidov)' +author: 'Alexey Milovidov' tags: ['company', 'incorporation', 'yandex', 'community'] --- diff --git a/website/blog/en/2021/clickhouse-october-moscow-meetup.md b/website/blog/en/2021/clickhouse-october-moscow-meetup.md index 557c7f0f9c0..fb77a5912e5 100644 --- a/website/blog/en/2021/clickhouse-october-moscow-meetup.md +++ b/website/blog/en/2021/clickhouse-october-moscow-meetup.md @@ -2,7 +2,7 @@ title: 'ClickHouse Moscow Meetup October 19, 2021' image: 'https://blog-images.clickhouse.com/en/2021/clickhouse-october-moscow-meetup/featured.jpg' date: '2021-11-11' -author: '[Rich Raposa](https://github.com/rfraposa)' +author: 'Rich Raposa' tags: ['company', 'community'] --- diff --git a/website/blog/en/2021/clickhouse-raises-250m-series-b.md b/website/blog/en/2021/clickhouse-raises-250m-series-b.md index b476371013e..3f6fd9d3928 100644 --- a/website/blog/en/2021/clickhouse-raises-250m-series-b.md +++ b/website/blog/en/2021/clickhouse-raises-250m-series-b.md @@ -2,7 +2,7 @@ title: 'ClickHouse raises a $250M Series B at a $2B valuation...and we are hiring' image: 'https://blog-images.clickhouse.com/en/2021/clickhouse-raises-250m-series-b/featured.jpg' date: '2021-10-28' -author: '[Dorota Szeremeta](https://www.linkedin.com/in/dorota-szeremeta-a849b7/)' +author: 'Dorota Szeremeta' tags: ['company', 'investment'] --- diff --git a/website/blog/en/2021/clickhouse-v21.10-released.md b/website/blog/en/2021/clickhouse-v21.10-released.md index ed0aab88017..145f23ff129 100644 --- a/website/blog/en/2021/clickhouse-v21.10-released.md +++ b/website/blog/en/2021/clickhouse-v21.10-released.md @@ -2,7 +2,7 @@ title: 'ClickHouse v21.10 Released' image: 'https://blog-images.clickhouse.com/en/2021/clickhouse-v21-10/featured.jpg' date: '2021-10-14' -author: '[Rich Raposa](https://github.com/rfraposa), [Alexey Milovidov](https://github.com/alexey-milovidov)' +author: 'Rich Raposa, Alexey Milovidov' tags: ['company', 'community'] --- diff --git a/website/blog/en/2021/clickhouse-v21.11-released.md b/website/blog/en/2021/clickhouse-v21.11-released.md index d7993c4219e..0f478848410 100644 --- a/website/blog/en/2021/clickhouse-v21.11-released.md +++ b/website/blog/en/2021/clickhouse-v21.11-released.md @@ -2,7 +2,7 @@ title: 'ClickHouse v21.11 Released' image: 'https://blog-images.clickhouse.com/en/2021/clickhouse-v21-11/featured-dog.jpg' date: '2021-11-11' -author: '[Rich Raposa](https://github.com/rfraposa), [Alexey Milovidov](https://github.com/alexey-milovidov)' +author: 'Rich Raposa, Alexey Milovidov' tags: ['company', 'community'] --- diff --git a/website/blog/en/2021/clickhouse-v21.12-released.md b/website/blog/en/2021/clickhouse-v21.12-released.md index d1e6cddbc35..7b4c7862700 100644 --- a/website/blog/en/2021/clickhouse-v21.12-released.md +++ b/website/blog/en/2021/clickhouse-v21.12-released.md @@ -2,7 +2,7 @@ title: 'What''s New in ClickHouse 21.12' image: 'https://blog-images.clickhouse.com/en/2021/clickhouse-v21-12/featured.jpg' date: '2021-12-16' -author: '[Alexey Milovidov](https://github.com/alexey-milovidov), [Christoph Wurm](https://github.com/cwurm)' +author: 'Alexey Milovidov, Christoph Wurm' tags: ['company', 'community'] --- diff --git a/website/blog/en/2021/code-review.md b/website/blog/en/2021/code-review.md index 15dc727986d..d726dc7aa4f 100644 --- a/website/blog/en/2021/code-review.md +++ b/website/blog/en/2021/code-review.md @@ -2,7 
+2,7 @@ title: 'The Tests Are Passing, Why Would I Read The Diff Again?' image: 'https://blog-images.clickhouse.com/en/2021/code-review/two-ducks.jpg' date: '2021-04-14' -author: '[Alexander Kuzmenkov](https://github.com/akuzm)' +author: 'Alexander Kuzmenkov' tags: ['code review', 'development'] --- diff --git a/website/blog/en/2021/fuzzing-clickhouse.md b/website/blog/en/2021/fuzzing-clickhouse.md index cd53f00930d..3fa518aecce 100644 --- a/website/blog/en/2021/fuzzing-clickhouse.md +++ b/website/blog/en/2021/fuzzing-clickhouse.md @@ -2,7 +2,7 @@ title: 'Fuzzing ClickHouse' image: 'https://blog-images.clickhouse.com/en/2021/fuzzing-clickhouse/some-checks-were-not-successful.png' date: '2021-03-11' -author: '[Alexander Kuzmenkov](https://github.com/akuzm)' +author: 'Alexander Kuzmenkov' tags: ['fuzzing', 'testing'] --- @@ -56,6 +56,3 @@ To see for yourself how the fuzzer works, you only need the normal ClickHouse cl ## Other Fuzzers The AST-based fuzzer we discussed is only one of the many kinds of fuzzers we have in ClickHouse. There is a [talk](https://www.youtube.com/watch?v=GbmK84ZwSeI&t=4481s) (in Russian, [slides are here](https://presentations.clickhouse.com/cpp_siberia_2021/)) by Alexey Milovidov that explores all the fuzzers we have. Another interesting recent development is application of pivoted query synthesis technique, implemented in [SQLancer](https://github.com/sqlancer/sqlancer), to ClickHouse. The authors are going to give [a talk about this](https://heisenbug-piter.ru/2021/spb/talks/nr1cwknssdodjkqgzsbvh/) soon, so stay tuned. - -_2021-03-11 [Alexander Kuzmenkov](https://github.com/akuzm)_ - diff --git a/website/blog/en/2021/how-to-enable-predictive-capabilities-in-clickhouse-databases.md b/website/blog/en/2021/how-to-enable-predictive-capabilities-in-clickhouse-databases.md index 96c2dccf260..a73f6dcf91d 100644 --- a/website/blog/en/2021/how-to-enable-predictive-capabilities-in-clickhouse-databases.md +++ b/website/blog/en/2021/how-to-enable-predictive-capabilities-in-clickhouse-databases.md @@ -2,7 +2,7 @@ title: 'How to Enable Predictive Capabilities in Clickhouse Databases' image: 'https://blog-images.clickhouse.com/en/2021/mindsdb-enables-predictive-capabilities-in-clickHouse/featured.png' date: '2021-12-14' -author: '[Ilya Yatsishin](https://github.com/qoega)' +author: 'Ilya Yatsishin' tags: ['company', 'how-to', 'MindsDB'] --- diff --git a/website/blog/en/2021/performance-test-1.md b/website/blog/en/2021/performance-test-1.md index 9b06e9e59e0..1564b1c8a76 100644 --- a/website/blog/en/2021/performance-test-1.md +++ b/website/blog/en/2021/performance-test-1.md @@ -2,7 +2,7 @@ title: 'Testing the Performance of ClickHouse' image: 'https://blog-images.clickhouse.com/en/2021/performance-testing-1/chebu-crop.jpg' date: '2021-08-19' -author: '[Alexander Kuzmenkov](https://github.com/akuzm)' +author: 'Alexander Kuzmenkov' tags: ['testing', 'performance'] --- diff --git a/website/blog/en/2021/reading-from-external-memory.md b/website/blog/en/2021/reading-from-external-memory.md index 2274c47c5a5..01d35d19018 100644 --- a/website/blog/en/2021/reading-from-external-memory.md +++ b/website/blog/en/2021/reading-from-external-memory.md @@ -2,7 +2,7 @@ title: 'A journey to io_uring, AIO and modern storage devices' image: 'https://blog-images.clickhouse.com/en/2021/reading-from-external-memory/all-single-read.png' date: '2021-03-09' -author: '[Ruslan Savchenko](https://github.com/savrus)' +author: 'Ruslan Savchenko' tags: ['Linux', 'benchmark', 'experiment'] --- @@ -67,4 +67,3 
@@ We see that solid state device latencies are far better than HDD. For a single r So, how about testing modern IO interfaces in Linux? Continue reading the [full article](https://arxiv.org/pdf/2102.11198). 2021-03-09 [Ruslan Savchenko](https://github.com/savrus) - diff --git a/website/blog/en/2021/tests-visualization.md b/website/blog/en/2021/tests-visualization.md index 259cb4d8e34..8b927f8976a 100644 --- a/website/blog/en/2021/tests-visualization.md +++ b/website/blog/en/2021/tests-visualization.md @@ -2,7 +2,7 @@ title: 'Decorating a Christmas Tree With the Help Of Flaky Tests' image: 'https://blog-images.clickhouse.com/en/2021/tests-visualization/tests.png' date: '2021-12-27' -author: '[Alexey Milovidov](https://github.com/alexey-milovidov)' +author: 'Alexey Milovidov' tags: ['tests', 'ci', 'flaky', 'christmas', 'visualization'] --- diff --git a/website/blog/en/2022/clickhouse-v22.1-released.md b/website/blog/en/2022/clickhouse-v22.1-released.md new file mode 100644 index 00000000000..045d5367327 --- /dev/null +++ b/website/blog/en/2022/clickhouse-v22.1-released.md @@ -0,0 +1,248 @@ +--- +title: 'What''s New in ClickHouse 22.1' +image: 'https://blog-images.clickhouse.com/en/2022/clickhouse-v22-1/featured.jpg' +date: '2022-01-26' +author: 'Alexey Milovidov' +tags: ['company', 'community'] +--- + +22.1 is our first release in the new year. It includes 2,599 new commits from 133 contributors, including 44 new contributors: + +> 13DaGGeR, Adri Fernandez, Alexey Gusev, Anselmo D. Adams, Antonio Andelic, Ben, Boris Kuschel, Christoph Wurm, Chun-Sheng, Li, Dao, DimaAmega, Dmitrii Mokhnatkin, Harry-Lee, Justin Hilliard, MaxTheHuman, Meena-Renganathan, Mojtaba Yaghoobzadeh, N. Kolotov, Niek, Orkhan Zeynalli, Rajkumar, Ryad ZENINE, Sergei Trifonov, Suzy Wang, TABLUM.IO, Vitaly Artemyev, Xin Wang, Yatian Xu, Youenn Lebras, dalei2019, fanzhou, gulige, lgbo-ustc, minhthucdao, mreddy017, msirm, olevino, peter279k, save-my-heart, tekeri, usurai, zhoubintao, 李扬. + +Don't forget to run `SELECT * FROM system.contributors` on your production server! + +Let's describe the most important new features in 22.1. 
+ +## Schema Inference + +Let's look at the following query as an example: + +``` +SELECT * FROM url('https://datasets.clickhouse.com/github_events_v2.native.xz', Native, +$$ + file_time DateTime, + event_type Enum('CommitCommentEvent' = 1, 'CreateEvent' = 2, 'DeleteEvent' = 3, 'ForkEvent' = 4, + 'GollumEvent' = 5, 'IssueCommentEvent' = 6, 'IssuesEvent' = 7, 'MemberEvent' = 8, + 'PublicEvent' = 9, 'PullRequestEvent' = 10, 'PullRequestReviewCommentEvent' = 11, + 'PushEvent' = 12, 'ReleaseEvent' = 13, 'SponsorshipEvent' = 14, 'WatchEvent' = 15, + 'GistEvent' = 16, 'FollowEvent' = 17, 'DownloadEvent' = 18, 'PullRequestReviewEvent' = 19, + 'ForkApplyEvent' = 20, 'Event' = 21, 'TeamAddEvent' = 22), + actor_login LowCardinality(String), + repo_name LowCardinality(String), + created_at DateTime, + updated_at DateTime, + action Enum('none' = 0, 'created' = 1, 'added' = 2, 'edited' = 3, 'deleted' = 4, 'opened' = 5, 'closed' = 6, 'reopened' = 7, 'assigned' = 8, 'unassigned' = 9, + 'labeled' = 10, 'unlabeled' = 11, 'review_requested' = 12, 'review_request_removed' = 13, 'synchronize' = 14, 'started' = 15, 'published' = 16, 'update' = 17, 'create' = 18, 'fork' = 19, 'merged' = 20), + comment_id UInt64, + body String, + path String, + position Int32, + line Int32, + ref LowCardinality(String), + ref_type Enum('none' = 0, 'branch' = 1, 'tag' = 2, 'repository' = 3, 'unknown' = 4), + creator_user_login LowCardinality(String), + number UInt32, + title String, + labels Array(LowCardinality(String)), + state Enum('none' = 0, 'open' = 1, 'closed' = 2), + locked UInt8, + assignee LowCardinality(String), + assignees Array(LowCardinality(String)), + comments UInt32, + author_association Enum('NONE' = 0, 'CONTRIBUTOR' = 1, 'OWNER' = 2, 'COLLABORATOR' = 3, 'MEMBER' = 4, 'MANNEQUIN' = 5), + closed_at DateTime, + merged_at DateTime, + merge_commit_sha String, + requested_reviewers Array(LowCardinality(String)), + requested_teams Array(LowCardinality(String)), + head_ref LowCardinality(String), + head_sha String, + base_ref LowCardinality(String), + base_sha String, + merged UInt8, + mergeable UInt8, + rebaseable UInt8, + mergeable_state Enum('unknown' = 0, 'dirty' = 1, 'clean' = 2, 'unstable' = 3, 'draft' = 4), + merged_by LowCardinality(String), + review_comments UInt32, + maintainer_can_modify UInt8, + commits UInt32, + additions UInt32, + deletions UInt32, + changed_files UInt32, + diff_hunk String, + original_position UInt32, + commit_id String, + original_commit_id String, + push_size UInt32, + push_distinct_size UInt32, + member_login LowCardinality(String), + release_tag_name String, + release_name String, + review_state Enum('none' = 0, 'approved' = 1, 'changes_requested' = 2, 'commented' = 3, 'dismissed' = 4, 'pending' = 5) +$$) +``` + +In this query we are importing data with the `url` table function. Data is posted on an HTTP server in a `.native.xz` file. The most annoying part of this query is that we have to specify the data structure and the format of this file. + +In the new ClickHouse release 22.1 it becomes much easier: + +``` +SELECT * FROM url('https://datasets.clickhouse.com/github_events_v2.native.xz') +``` + +Cannot be more easy! How is that possible? + +Firstly, we detect the data format automatically from the file extension. Here it is `.native.xz`, so we know that the data is compressed by `xz` (LZMA2) compression and is represented in `Native` format. The `Native` format already contains all information about the types and names of the columns, and we just read and use it. 
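+
+The same trick works with local files. As a quick sketch, assuming a hypothetical `github_events.parquet` file in the user files directory, both the format and the column types are picked up automatically:
+
+```
+-- hypothetical local file; the Parquet format and the column types are inferred automatically
+SELECT repo_name, count() AS stars
+FROM file('github_events.parquet')
+WHERE event_type = 'WatchEvent'
+GROUP BY repo_name
+ORDER BY stars DESC
+LIMIT 10
+```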
+ +It works for every format that contains information about the data types: `Native`, `Avro`, `Parquet`, `ORC`, `Arrow` as well as `CSVWithNamesAndTypes`, `TSVWithNamesAndTypes`. + +And it works for every table function that reads files: `s3`, `file`, `hdfs`, `url`, `s3Cluster`, `hdfsCluster`. + +A lot of magic happens under the hood. It does not require reading the whole file into memory. For example, the Parquet format has metadata at the end of the file. So, we read the end of the file first to find where the metadata is located, then do a range request to read the metadata about columns and their types, then continue to read the requested columns. And if the file is small, it will be read with a single request. + +If you want to extract the structure from the file without data processing, the DESCRIBE query is available: + +``` +DESCRIBE url('https://datasets.clickhouse.com/github_events_v2.native.xz') +``` + +The data structure can also be automatically inferred from `JSONEachRow`, `CSV`, `TSV`, `CSVWithNames`, `TSVWithNames`, `MsgPack`, `Values` and `Regexp` formats. + +For `CSV`, either Float64 or String is inferred. For `JSONEachRow` the inference of array types is supported, including multidimensional arrays. Arrays of non-uniform types are mapped to Tuples. And objects are mapped to the `Map` data type. + +If a format does not have column names (like `CSV` without a header), the names `c1`, `c2`, ... are used. + +The file format is detected from the file extension: `csv`, `tsv`, `native`, `parquet`, `pb`, `ndjson`, `orc`... For example, an `.ndjson` file is recognized as the `JSONEachRow` format and a `.csv` file is recognized as the header-less `CSV` format in ClickHouse; if you want `CSVWithNames`, you can specify the format explicitly. + +We support "schema on demand" queries. For example, the autodetected data types for the `TSV` format are Strings, but you can refine the types in your query with the `::` operator: + +``` +SELECT c1 AS domain, uniq(c2::UInt64), count() AS cnt + FROM file('hits.tsv') + GROUP BY domain ORDER BY cnt DESC LIMIT 10 +``` + +As a bonus, `LineAsString` and `RawBLOB` formats also get type inference. Try this query to see how I prefer to read my favorite website: + +``` +SELECT extractTextFromHTML(*) + FROM url('https://news.ycombinator.com/', LineAsString); +``` + +Schema autodetection also works while creating `Merge`, `Distributed` and `ReplicatedMergeTree` tables. When you create the first replica, you have to specify the table structure. But when creating all the subsequent replicas, you only need `CREATE TABLE hits +ENGINE = ReplicatedMergeTree(...)` without listing the columns - the definition will be copied from another replica. + +This feature was implemented by **Pavel Kruglov**, inspired by initial work by **Igor Baliuk**, with additions by **ZhongYuanKai**. + +## Realtime Resource Usage In clickhouse-client + +`clickhouse-client` is my favorite user interface for ClickHouse. It is an example of how friendly every command line application should be. + +Now it shows realtime CPU and memory usage for the query directly in the progress bar: + +![resource usage](https://blog-images.clickhouse.com/en/2022/clickhouse-v22-1/progress.png) + +For distributed queries, we show both total memory usage and max memory usage per host. + +This feature was made possible by the implementation of distributed metrics forwarding by **Dmitry Novik**.
I have added this small visualization to clickhouse-client, and now it is possible to add similar info in every client that uses the native ClickHouse protocol. + +## Parallel Query Processing On Replicas + +ClickHouse is a distributed MPP DBMS. It can scale up to use all CPU cores on one server and scale out to use the computation resources of multiple shards in a cluster. + +But each shard usually contains more than one replica. And by default ClickHouse uses the resources of only one replica on every shard. E.g. if you have a cluster of 6 servers with 3 shards and two replicas on each, a query will use just three servers instead of all six. + +There was an option to enable `max_parallel_replicas`, but that option required specifying a "sampling key", was inconvenient to use and did not scale well. + +Now we have a setting to enable the new parallel processing algorithm: `allow_experimental_parallel_reading_from_replicas`. If it is enabled, replicas will *dynamically* select and distribute the work among themselves. + +It works perfectly even if replicas have different amounts of computation resources. And it gives a complete result even if some replicas are stale. + +This feature was implemented by **Nikita Mikhaylov**. + +## Service Discovery + +When adding or removing nodes in a cluster, you no longer have to edit the config on every server. Just use automatic cluster discovery and servers will register themselves: + +``` +<allow_experimental_cluster_discovery>1</allow_experimental_cluster_discovery> + +<remote_servers> + <auto_cluster> + <discovery> + <path>/clickhouse/discovery/auto_cluster</path> + <shard>1</shard> + </discovery> + </auto_cluster> +</remote_servers> +``` + +There is no need to edit the config when adding new replicas! + +This feature was implemented by **Vladimir Cherkasov**. + +## Sparse Encoding For Columns + +If a column contains mostly zeros, we can encode it in sparse format +and automatically optimize calculations! + +It is a special column encoding, similar to `LowCardinality`, but it's completely transparent and works automatically. + +``` +CREATE TABLE test.hits ... +ENGINE = MergeTree ORDER BY ... +SETTINGS ratio_of_defaults_for_sparse_serialization = 0.9 +``` + +It allows compressing data better and optimizes computations, because data in sparse columns will be processed directly in sparse format in memory. + +The sparse or full format is selected based on column statistics that are calculated on insert and updated on background merges. + +Developed by **Anton Popov**. + +We also want to make LowCardinality encoding automatic, so stay tuned! + +## Diagnostic Tool For ClickHouse + +It is a gift from the Yandex Cloud team. They have a tool that collects a report about ClickHouse instances and provides all the information needed for support. They decided to contribute this tool to open source! + +You can find the tool here: [utils/clickhouse-diagnostics](https://github.com/ClickHouse/ClickHouse/tree/master/utils/clickhouse-diagnostics) + +Developed by **Alexander Burmak**. + +## Integrations + +Plenty of new integrations were added in 22.1: + +Integration with **Hive** as a foreign table engine for SELECT queries, contributed by **Taiyang Li** and reviewed by **Ksenia Sumarokova**. + +Integration with **Azure Blob Storage** similar to S3, contributed by **Jakub Kuklis** and reviewed by **Ksenia Sumarokova**. + +Support for the **hdfsCluster** table function similar to **s3Cluster**, contributed by **Zhichang Yu** and reviewed by **Nikita Mikhaylov**. + +## Statistical Functions + +I hope you have always dreamed of calculating the Cramer's V and Theil's U coefficients in ClickHouse, because now we have these functions for you and you have to deal with it.
+ +``` +:) SELECT cramersV(URL, URLDomain) FROM test.hits + +0.98 + +:) SELECT cramersV(URLDomain, ResolutionWidth) FROM test.hits + +0.27 +``` + +It can calculate some sort of dependency between categorical (discrete) values. You can imagine it like this: there is a correlation function `corr` but it is only applicable for linear dependencies; there is a rank correlation function `rankCorr` but it is only applicable for ordered values. And now there are a few functions to calculate *something* for discrete values. + +Developers: **Artem Tsyganov**, **Ivan Belyaev**, **Alexey Milovidov**. + + +## ... And Many More + +Read the [full changelog](https://github.com/ClickHouse/ClickHouse/blob/master/CHANGELOG.md) for the 22.1 release and follow [the roadmap](https://github.com/ClickHouse/ClickHouse/issues/32513). diff --git a/website/templates/blog/content.html b/website/templates/blog/content.html index d3bb1e159d0..37c284a9721 100644 --- a/website/templates/blog/content.html +++ b/website/templates/blog/content.html @@ -33,6 +33,10 @@ + {% if page.meta.author %} +
Author: {{ page.meta.author|adjust_markdown_html }}
+ {% endif %} +
{{ page.meta.date }} {% if page.meta.tags %} diff --git a/website/templates/docs/sidebar-item.html b/website/templates/docs/sidebar-item.html index 6773b0d8af9..ca4b8f9a60e 100644 --- a/website/templates/docs/sidebar-item.html +++ b/website/templates/docs/sidebar-item.html @@ -4,7 +4,7 @@ {{ nav_item.title[:-14] }} {% elif nav_item.children %} {{ nav_item.title }} -