From 2cf56595ad2487ccaf02213667fd7fcf27ad3e73 Mon Sep 17 00:00:00 2001 From: proller Date: Mon, 12 Aug 2019 20:18:37 +0300 Subject: [PATCH 001/102] Fix build --- dbms/src/Functions/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/dbms/src/Functions/CMakeLists.txt b/dbms/src/Functions/CMakeLists.txt index 34f9dded76b..f495d6d8665 100644 --- a/dbms/src/Functions/CMakeLists.txt +++ b/dbms/src/Functions/CMakeLists.txt @@ -25,6 +25,7 @@ target_link_libraries(clickhouse_functions PRIVATE ${ZLIB_LIBRARIES} ${Boost_FILESYSTEM_LIBRARY} + ${CMAKE_DL_LIBS} ) if (OPENSSL_CRYPTO_LIBRARY) From 8d3446e51505e1a65db22612edffdb56781afdaf Mon Sep 17 00:00:00 2001 From: proller Date: Tue, 13 Aug 2019 19:25:54 +0300 Subject: [PATCH 002/102] cmake: fix cpuinfo --- cmake/find_cpuinfo.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/find_cpuinfo.cmake b/cmake/find_cpuinfo.cmake index f122026d2bb..9553372109b 100644 --- a/cmake/find_cpuinfo.cmake +++ b/cmake/find_cpuinfo.cmake @@ -1,8 +1,8 @@ option(USE_INTERNAL_CPUINFO_LIBRARY "Set to FALSE to use system cpuinfo library instead of bundled" ${NOT_UNBUNDLED}) # Now we have no contrib/libcpuinfo, use from system. -if (USE_INTERNAL_CPUINFO_LIBRARY AND NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/libcpuinfo/include") - #message (WARNING "submodule contrib/libcpuid is missing. to fix try run: \n git submodule update --init --recursive") +if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/libcpuinfo/include") + #message (WARNING "submodule contrib/libcpuinfo is missing. to fix try run: \n git submodule update --init --recursive") set (USE_INTERNAL_CPUINFO_LIBRARY 0) set (MISSING_INTERNAL_CPUINFO_LIBRARY 1) endif () @@ -12,7 +12,7 @@ if(NOT USE_INTERNAL_CPUINFO_LIBRARY) find_path(CPUINFO_INCLUDE_DIR NAMES cpuinfo.h PATHS ${CPUINFO_INCLUDE_PATHS}) endif() -if(CPUID_LIBRARY AND CPUID_INCLUDE_DIR) +if(CPUINFO_LIBRARY AND CPUINFO_INCLUDE_DIR) set(USE_CPUINFO 1) elseif(NOT MISSING_INTERNAL_CPUINFO_LIBRARY) set(CPUINFO_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/libcpuinfo/include) From 4f364ce61101ce985c1e2d461259d56ac5d83914 Mon Sep 17 00:00:00 2001 From: proller Date: Wed, 14 Aug 2019 14:25:28 +0300 Subject: [PATCH 003/102] Fix includes after processors merge Conflicts: dbms/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp dbms/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp dbms/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp dbms/src/Processors/Formats/Impl/ProtobufRowOutputFormat.cpp --- .../Processors/Formats/Impl/CapnProtoRowInputFormat.cpp | 9 ++++----- .../Processors/Formats/Impl/CapnProtoRowInputFormat.h | 4 ++-- .../Processors/Formats/Impl/ParquetBlockOutputFormat.cpp | 3 +-- .../Processors/Formats/Impl/ProtobufRowInputFormat.cpp | 3 +-- .../src/Processors/Formats/Impl/ProtobufRowInputFormat.h | 2 +- .../Processors/Formats/Impl/ProtobufRowOutputFormat.cpp | 4 +--- 6 files changed, 10 insertions(+), 15 deletions(-) diff --git a/dbms/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp index 2652304fcb0..a45d83052c2 100644 --- a/dbms/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp @@ -1,14 +1,13 @@ -#include "config_formats.h" -#include // Y_IGNORE +#include "CapnProtoRowInputFormat.h" #if USE_CAPNP #include #include #include #include -#include // Y_IGNORE -#include // Y_IGNORE -#include // Y_IGNORE +#include +#include +#include #include #include #include 
diff --git a/dbms/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h b/dbms/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h index b7021ea7db7..c39969d21b0 100644 --- a/dbms/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h +++ b/dbms/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h @@ -1,10 +1,10 @@ #pragma once -#include + +#include "config_formats.h" #if USE_CAPNP #include #include - #include namespace DB diff --git a/dbms/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp b/dbms/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp index e8196c5bf59..224b0e6d2d8 100644 --- a/dbms/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp @@ -1,5 +1,4 @@ -#include "config_formats.h" -#include +#include "ParquetBlockOutputFormat.h" #if USE_PARQUET diff --git a/dbms/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp index 09410a06c0c..25fecc5c642 100644 --- a/dbms/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp @@ -1,5 +1,4 @@ -#include "config_formats.h" -#include +#include "ProtobufRowInputFormat.h" #if USE_PROTOBUF #include diff --git a/dbms/src/Processors/Formats/Impl/ProtobufRowInputFormat.h b/dbms/src/Processors/Formats/Impl/ProtobufRowInputFormat.h index ebc2283d25c..029b2c8329e 100644 --- a/dbms/src/Processors/Formats/Impl/ProtobufRowInputFormat.h +++ b/dbms/src/Processors/Formats/Impl/ProtobufRowInputFormat.h @@ -1,6 +1,6 @@ #pragma once -#include +#include "config_formats.h" #if USE_PROTOBUF #include diff --git a/dbms/src/Processors/Formats/Impl/ProtobufRowOutputFormat.cpp b/dbms/src/Processors/Formats/Impl/ProtobufRowOutputFormat.cpp index 50f79dda993..30e27c68f1e 100644 --- a/dbms/src/Processors/Formats/Impl/ProtobufRowOutputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/ProtobufRowOutputFormat.cpp @@ -1,10 +1,8 @@ #include +#include "ProtobufRowOutputFormat.h" -#include "config_formats.h" #if USE_PROTOBUF -#include - #include #include #include From bc572a3439dff953a822df8fa3a135faa3ef20be Mon Sep 17 00:00:00 2001 From: proller Date: Wed, 14 Aug 2019 12:20:33 +0000 Subject: [PATCH 004/102] Fix build in gcc8 --- dbms/src/Compression/tests/gtest_compressionCodec.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Compression/tests/gtest_compressionCodec.cpp b/dbms/src/Compression/tests/gtest_compressionCodec.cpp index 4ed547c54e5..30b9c736de3 100644 --- a/dbms/src/Compression/tests/gtest_compressionCodec.cpp +++ b/dbms/src/Compression/tests/gtest_compressionCodec.cpp @@ -27,7 +27,7 @@ /// For the expansion of gtest macros. 
#if defined(__clang__) #pragma clang diagnostic ignored "-Wdeprecated" -#elif defined (__GNUC__) +#elif defined (__GNUC__) && __GNUC__ >= 9 #pragma GCC diagnostic ignored "-Wdeprecated-copy" #endif From 0bfee3a975bb76b3ef4e52e6660ea2f7a5681491 Mon Sep 17 00:00:00 2001 From: proller Date: Wed, 14 Aug 2019 15:28:27 +0300 Subject: [PATCH 005/102] fix test link --- dbms/src/Common/tests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Common/tests/CMakeLists.txt b/dbms/src/Common/tests/CMakeLists.txt index d11a46a38d9..a0d7ef4f2b2 100644 --- a/dbms/src/Common/tests/CMakeLists.txt +++ b/dbms/src/Common/tests/CMakeLists.txt @@ -36,7 +36,7 @@ target_include_directories (simple_cache PRIVATE ${DBMS_INCLUDE_DIR}) target_link_libraries (simple_cache PRIVATE common) add_executable (compact_array compact_array.cpp) -target_link_libraries (compact_array PRIVATE clickhouse_common_io ${Boost_FILESYSTEM_LIBRARY}) +target_link_libraries (compact_array PRIVATE clickhouse_common_io c++fs) add_executable (radix_sort radix_sort.cpp) target_link_libraries (radix_sort PRIVATE clickhouse_common_io) From 38adbdb0cdeebbbfca575ca2724a4fcb6948bd42 Mon Sep 17 00:00:00 2001 From: proller Date: Wed, 14 Aug 2019 13:27:00 +0000 Subject: [PATCH 006/102] fix test link --- dbms/src/Common/tests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Common/tests/CMakeLists.txt b/dbms/src/Common/tests/CMakeLists.txt index a0d7ef4f2b2..2c99c85baec 100644 --- a/dbms/src/Common/tests/CMakeLists.txt +++ b/dbms/src/Common/tests/CMakeLists.txt @@ -36,7 +36,7 @@ target_include_directories (simple_cache PRIVATE ${DBMS_INCLUDE_DIR}) target_link_libraries (simple_cache PRIVATE common) add_executable (compact_array compact_array.cpp) -target_link_libraries (compact_array PRIVATE clickhouse_common_io c++fs) +target_link_libraries (compact_array PRIVATE clickhouse_common_io stdc++fs) add_executable (radix_sort radix_sort.cpp) target_link_libraries (radix_sort PRIVATE clickhouse_common_io) From 010672051ea322f9c17b4e2dd94407daba7155ef Mon Sep 17 00:00:00 2001 From: proller Date: Wed, 14 Aug 2019 13:55:54 +0000 Subject: [PATCH 007/102] Fix test link --- dbms/src/IO/tests/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/IO/tests/CMakeLists.txt b/dbms/src/IO/tests/CMakeLists.txt index 1c804b29c04..2c3dc307b18 100644 --- a/dbms/src/IO/tests/CMakeLists.txt +++ b/dbms/src/IO/tests/CMakeLists.txt @@ -59,10 +59,10 @@ target_link_libraries (write_int PRIVATE clickhouse_common_io) if (OS_LINUX OR OS_FREEBSD) add_executable(write_buffer_aio write_buffer_aio.cpp) - target_link_libraries (write_buffer_aio PRIVATE clickhouse_common_io ${Boost_FILESYSTEM_LIBRARY}) + target_link_libraries (write_buffer_aio PRIVATE clickhouse_common_io stdc++fs) add_executable(read_buffer_aio read_buffer_aio.cpp) - target_link_libraries (read_buffer_aio PRIVATE clickhouse_common_io ${Boost_FILESYSTEM_LIBRARY}) + target_link_libraries (read_buffer_aio PRIVATE clickhouse_common_io stdc++fs) endif () add_executable (zlib_buffers zlib_buffers.cpp) From c11300bb6239975d58c72452fb198d6af8582d10 Mon Sep 17 00:00:00 2001 From: proller Date: Thu, 15 Aug 2019 12:01:36 +0000 Subject: [PATCH 008/102] link fix --- dbms/src/Interpreters/tests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Interpreters/tests/CMakeLists.txt b/dbms/src/Interpreters/tests/CMakeLists.txt index d83c5975c08..b4f9fff1d36 100644 --- 
a/dbms/src/Interpreters/tests/CMakeLists.txt +++ b/dbms/src/Interpreters/tests/CMakeLists.txt @@ -56,7 +56,7 @@ target_link_libraries (expression_analyzer PRIVATE dbms clickhouse_storages_syst add_check(expression_analyzer) add_executable (users users.cpp) -target_link_libraries (users PRIVATE dbms clickhouse_common_config ${Boost_FILESYSTEM_LIBRARY}) +target_link_libraries (users PRIVATE dbms clickhouse_common_config stdc++fs) if (OS_LINUX) add_executable (internal_iotop internal_iotop.cpp) From c2bb96fcde94a07e17379d3e71b2b54760a6ce53 Mon Sep 17 00:00:00 2001 From: proller Date: Wed, 14 Aug 2019 15:29:23 +0300 Subject: [PATCH 009/102] Fix includes after processors merge 2 Conflicts: dbms/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp --- dbms/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dbms/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/dbms/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index a2f2fd33e24..3b7ab9e6c31 100644 --- a/dbms/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -1,7 +1,6 @@ -#include "config_formats.h" -#include - +#include "ParquetBlockInputFormat.h" #if USE_PARQUET + #include #include #include From dae5dc60f681cc6bdd3da9f6314fcc52c199be82 Mon Sep 17 00:00:00 2001 From: proller Date: Thu, 15 Aug 2019 16:38:20 +0300 Subject: [PATCH 010/102] Fix includes after processors merge 3 --- dbms/src/Processors/Formats/Impl/ProtobufRowOutputFormat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Processors/Formats/Impl/ProtobufRowOutputFormat.h b/dbms/src/Processors/Formats/Impl/ProtobufRowOutputFormat.h index b6ca906ab83..96bd85e317d 100644 --- a/dbms/src/Processors/Formats/Impl/ProtobufRowOutputFormat.h +++ b/dbms/src/Processors/Formats/Impl/ProtobufRowOutputFormat.h @@ -1,6 +1,6 @@ #pragma once -#include +#include "config_formats.h" #if USE_PROTOBUF #include From 0f85ef6d57e809a7e5f86f1b77863bbf6abb54d2 Mon Sep 17 00:00:00 2001 From: proller Date: Thu, 15 Aug 2019 15:18:19 +0000 Subject: [PATCH 011/102] link fix --- libs/libcommon/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/libcommon/CMakeLists.txt b/libs/libcommon/CMakeLists.txt index 885a6f0ec0b..2744714a9c4 100644 --- a/libs/libcommon/CMakeLists.txt +++ b/libs/libcommon/CMakeLists.txt @@ -119,7 +119,7 @@ target_link_libraries (common ${Poco_Foundation_LIBRARY} ${CITYHASH_LIBRARIES} PRIVATE - ${Boost_FILESYSTEM_LIBRARY} + stdc++fs PUBLIC ${Boost_SYSTEM_LIBRARY} PRIVATE From b1fc89c36418dd7ab6522a5d5b9cbc85df118596 Mon Sep 17 00:00:00 2001 From: proller Date: Thu, 15 Aug 2019 22:04:06 +0300 Subject: [PATCH 012/102] Fix likely/unlikely conflict with cython --- libs/libcommon/include/common/DateLUT.h | 2 +- libs/libcommon/include/common/DateLUTImpl.h | 8 +++++--- libs/libcommon/include/common/likely.h | 18 ++++++++++++------ libs/libcommon/include/common/memory.h | 4 +++- libs/libcommon/include/common/unlikely.h | 3 +++ 5 files changed, 24 insertions(+), 11 deletions(-) create mode 100644 libs/libcommon/include/common/unlikely.h diff --git a/libs/libcommon/include/common/DateLUT.h b/libs/libcommon/include/common/DateLUT.h index 75a7457d101..451bfa4d991 100644 --- a/libs/libcommon/include/common/DateLUT.h +++ b/libs/libcommon/include/common/DateLUT.h @@ -1,6 +1,6 @@ #pragma once -#include +#include "DateLUTImpl.h" #include #include #include diff --git 
a/libs/libcommon/include/common/DateLUTImpl.h b/libs/libcommon/include/common/DateLUTImpl.h index 344d363b0d7..49c5831a410 100644 --- a/libs/libcommon/include/common/DateLUTImpl.h +++ b/libs/libcommon/include/common/DateLUTImpl.h @@ -1,8 +1,8 @@ #pragma once -#include -#include -#include +#include "Types.h" +#include "DayNum.h" +#include "likely.h" #include #include @@ -932,3 +932,5 @@ public: #pragma GCC diagnostic pop #endif #endif + +#include "unlikely.h" diff --git a/libs/libcommon/include/common/likely.h b/libs/libcommon/include/common/likely.h index 87b4854b78a..338498af35f 100644 --- a/libs/libcommon/include/common/likely.h +++ b/libs/libcommon/include/common/likely.h @@ -1,9 +1,15 @@ -#pragma once - #if defined(_MSC_VER) -#define likely(x) (x) -#define unlikely(x) (x) +# if !defined(likely) +# define likely(x) (x) +# endif +# if !defined(unlikely) +# define unlikely(x) (x) +# endif #else -#define likely(x) (__builtin_expect(!!(x), 1)) -#define unlikely(x) (__builtin_expect(!!(x), 0)) +# if !defined(likely) +# define likely(x) (__builtin_expect(!!(x), 1)) +# endif +# if !defined(unlikely) +# define unlikely(x) (__builtin_expect(!!(x), 0)) +# endif #endif diff --git a/libs/libcommon/include/common/memory.h b/libs/libcommon/include/common/memory.h index 58070334ac1..01db2d5501c 100644 --- a/libs/libcommon/include/common/memory.h +++ b/libs/libcommon/include/common/memory.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include "likely.h" #if __has_include() #include @@ -79,3 +79,5 @@ ALWAYS_INLINE void deleteSized(void * ptr, std::size_t size [[maybe_unused]]) no #endif } + +#include "unlikely.h" diff --git a/libs/libcommon/include/common/unlikely.h b/libs/libcommon/include/common/unlikely.h new file mode 100644 index 00000000000..8be60191ffa --- /dev/null +++ b/libs/libcommon/include/common/unlikely.h @@ -0,0 +1,3 @@ +// Undo likely.h +#undef likely +#undef unlikely From 791096f5bbd7e1a5aed55557548db52af5f03762 Mon Sep 17 00:00:00 2001 From: proller Date: Thu, 15 Aug 2019 22:42:38 +0300 Subject: [PATCH 013/102] Fix conflict with protobuf/stubs/atomicops.h --- dbms/src/Core/Defines.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dbms/src/Core/Defines.h b/dbms/src/Core/Defines.h index 397495d800e..be957340bab 100644 --- a/dbms/src/Core/Defines.h +++ b/dbms/src/Core/Defines.h @@ -92,6 +92,7 @@ #endif /// Check for presence of address sanitizer +#if !defined(ADDRESS_SANITIZER) #if defined(__has_feature) #if __has_feature(address_sanitizer) #define ADDRESS_SANITIZER 1 @@ -99,7 +100,9 @@ #elif defined(__SANITIZE_ADDRESS__) #define ADDRESS_SANITIZER 1 #endif +#endif +#if !defined(THREAD_SANITIZER) #if defined(__has_feature) #if __has_feature(thread_sanitizer) #define THREAD_SANITIZER 1 @@ -107,7 +110,9 @@ #elif defined(__SANITIZE_THREAD__) #define THREAD_SANITIZER 1 #endif +#endif +#if !defined(MEMORY_SANITIZER) #if defined(__has_feature) #if __has_feature(memory_sanitizer) #define MEMORY_SANITIZER 1 @@ -115,6 +120,7 @@ #elif defined(__MEMORY_SANITIZER__) #define MEMORY_SANITIZER 1 #endif +#endif /// Explicitly allow undefined behaviour for certain functions. Use it as a function attribute. /// It is useful in case when compiler cannot see (and exploit) it, but UBSan can. 
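Patches 012 and 013 apply the same idiom from two directions: a macro that third-party code may also define (Cython-generated sources define likely/unlikely, and the headers pulled in via protobuf's stubs/atomicops.h can define the sanitizer macros) is wrapped in an #if !defined(...) guard, so whichever header is included first wins and redefinition warnings no longer break the build. As a rough sketch, after these hunks common/likely.h and the sanitizer section of Core/Defines.h reduce to the following (non-MSVC branch only; the comments are added here for illustration and are not part of the patches):

    // common/likely.h -- note there is no #pragma once any more: the guards alone make
    // the header idempotent, and generated Cython code may define these macros first.
    #if !defined(likely)
    #    define likely(x) (__builtin_expect(!!(x), 1))
    #endif
    #if !defined(unlikely)
    #    define unlikely(x) (__builtin_expect(!!(x), 0))
    #endif

    // Core/Defines.h -- detect ASan only if nothing included earlier (e.g. via protobuf)
    // already defined the flag; the same guard is repeated for TSan and MSan.
    #if !defined(ADDRESS_SANITIZER)
    #    if defined(__has_feature)
    #        if __has_feature(address_sanitizer)
    #            define ADDRESS_SANITIZER 1
    #        endif
    #    elif defined(__SANITIZE_ADDRESS__)
    #        define ADDRESS_SANITIZER 1
    #    endif
    #endif
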
From b3456276ccbc5d788f5ee0c661bebcec71ce71aa Mon Sep 17 00:00:00 2001 From: proller Date: Thu, 15 Aug 2019 22:52:17 +0300 Subject: [PATCH 014/102] remove unlikely.h --- libs/libcommon/include/common/DateLUTImpl.h | 2 -- libs/libcommon/include/common/memory.h | 2 -- libs/libcommon/include/common/unlikely.h | 3 --- 3 files changed, 7 deletions(-) delete mode 100644 libs/libcommon/include/common/unlikely.h diff --git a/libs/libcommon/include/common/DateLUTImpl.h b/libs/libcommon/include/common/DateLUTImpl.h index 49c5831a410..2258620eb26 100644 --- a/libs/libcommon/include/common/DateLUTImpl.h +++ b/libs/libcommon/include/common/DateLUTImpl.h @@ -932,5 +932,3 @@ public: #pragma GCC diagnostic pop #endif #endif - -#include "unlikely.h" diff --git a/libs/libcommon/include/common/memory.h b/libs/libcommon/include/common/memory.h index 01db2d5501c..ab96cb593b9 100644 --- a/libs/libcommon/include/common/memory.h +++ b/libs/libcommon/include/common/memory.h @@ -79,5 +79,3 @@ ALWAYS_INLINE void deleteSized(void * ptr, std::size_t size [[maybe_unused]]) no #endif } - -#include "unlikely.h" diff --git a/libs/libcommon/include/common/unlikely.h b/libs/libcommon/include/common/unlikely.h deleted file mode 100644 index 8be60191ffa..00000000000 --- a/libs/libcommon/include/common/unlikely.h +++ /dev/null @@ -1,3 +0,0 @@ -// Undo likely.h -#undef likely -#undef unlikely From 2ac6fef2079bd8cb89e59bcb52fe626972f17ed0 Mon Sep 17 00:00:00 2001 From: proller Date: Fri, 16 Aug 2019 16:25:25 +0300 Subject: [PATCH 015/102] Fix macos build (do not use timer_t) --- dbms/src/Common/QueryProfiler.cpp | 2 ++ dbms/src/Common/QueryProfiler.h | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/dbms/src/Common/QueryProfiler.cpp b/dbms/src/Common/QueryProfiler.cpp index 51d139d8fe0..69b6799ea99 100644 --- a/dbms/src/Common/QueryProfiler.cpp +++ b/dbms/src/Common/QueryProfiler.cpp @@ -204,11 +204,13 @@ QueryProfilerBase::~QueryProfilerBase() template void QueryProfilerBase::tryCleanup() { +#if USE_INTERNAL_UNWIND_LIBRARY if (timer_id != nullptr && timer_delete(timer_id)) LOG_ERROR(log, "Failed to delete query profiler timer " + errnoToString(ErrorCodes::CANNOT_DELETE_TIMER)); if (previous_handler != nullptr && sigaction(pause_signal, previous_handler, nullptr)) LOG_ERROR(log, "Failed to restore signal handler after query profiler " + errnoToString(ErrorCodes::CANNOT_SET_SIGNAL_HANDLER)); +#endif } template class QueryProfilerBase; diff --git a/dbms/src/Common/QueryProfiler.h b/dbms/src/Common/QueryProfiler.h index 48b5ffc8b2c..b6420ccc703 100644 --- a/dbms/src/Common/QueryProfiler.h +++ b/dbms/src/Common/QueryProfiler.h @@ -1,7 +1,7 @@ #pragma once #include - +#include #include #include @@ -43,8 +43,10 @@ private: Poco::Logger * log; +#if USE_INTERNAL_UNWIND_LIBRARY /// Timer id from timer_create(2) timer_t timer_id = nullptr; +#endif /// Pause signal to interrupt threads to get traces int pause_signal; From be5c099e02dd3ec7b87744f5a029aa12ad074ab2 Mon Sep 17 00:00:00 2001 From: proller Date: Thu, 5 Sep 2019 19:13:25 +0300 Subject: [PATCH 016/102] wip --- cmake/find_orc.cmake | 9 ++++++++- contrib/CMakeLists.txt | 12 ++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/cmake/find_orc.cmake b/cmake/find_orc.cmake index 3676bec1b6b..b9f758c8090 100644 --- a/cmake/find_orc.cmake +++ b/cmake/find_orc.cmake @@ -5,4 +5,11 @@ set(USE_INTERNAL_ORC_LIBRARY ON) if (ARROW_LIBRARY) set(USE_ORC 1) -endif() \ No newline at end of file +endif() + + +find_path(CYRUS_SASL_INCLUDE_DIR sasl/sasl.h) 
+find_library(CYRUS_SASL_SHARED_LIB sasl2) +if (NOT CYRUS_SASL_INCLUDE_DIR OR NOT CYRUS_SASL_SHARED_LIB) + set(USE_ORC 0) +endif() diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 96462de0190..ed537f755a1 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -11,14 +11,10 @@ endif () set_property(DIRECTORY PROPERTY EXCLUDE_FROM_ALL 1) if (USE_INTERNAL_ORC_LIBRARY) - set(BUILD_JAVA OFF) - set (ANALYZE_JAVA OFF) - set (BUILD_CPP_TESTS OFF) - set (BUILD_TOOLS OFF) - option(BUILD_JAVA OFF) - option (ANALYZE_JAVA OFF) - option (BUILD_CPP_TESTS OFF) - option (BUILD_TOOLS OFF) + set(BUILD_JAVA OFF CACHE INTERNAL "") + set(ANALYZE_JAVA OFF CACHE INTERNAL "") + set(BUILD_CPP_TESTS OFF CACHE INTERNAL "") + set(BUILD_TOOLS OFF CACHE INTERNAL "") set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/contrib/orc/cmake_modules") add_subdirectory(orc) endif() From 781304ea238b54035bbf258ca8c2735381194a4c Mon Sep 17 00:00:00 2001 From: proller Date: Thu, 5 Sep 2019 23:05:22 +0300 Subject: [PATCH 017/102] Fix build (orc, ...) --- contrib/arrow-cmake/orc_check.cmake | 126 +++++++++++++++ contrib/orc-cmake/CMakeLists.txt | 229 ++++++++++++++++++++++++++++ 2 files changed, 355 insertions(+) create mode 100644 contrib/arrow-cmake/orc_check.cmake create mode 100644 contrib/orc-cmake/CMakeLists.txt diff --git a/contrib/arrow-cmake/orc_check.cmake b/contrib/arrow-cmake/orc_check.cmake new file mode 100644 index 00000000000..ec1e53cc649 --- /dev/null +++ b/contrib/arrow-cmake/orc_check.cmake @@ -0,0 +1,126 @@ +# Not changed part of contrib/orc/c++/src/CMakeLists.txt + +INCLUDE(CheckCXXSourceCompiles) + +CHECK_CXX_SOURCE_COMPILES(" + #include + #include + int main(int,char*[]){ + int f = open(\"/x/y\", O_RDONLY); + char buf[100]; + return pread(f, buf, 100, 1000) == 0; + }" + HAS_PREAD +) + +CHECK_CXX_SOURCE_COMPILES(" + #include + int main(int,char*[]){ + struct tm time2020; + return !strptime(\"2020-02-02 12:34:56\", \"%Y-%m-%d %H:%M:%S\", &time2020); + }" + HAS_STRPTIME +) + +CHECK_CXX_SOURCE_COMPILES(" + #include + int main(int,char* argv[]){ + return static_cast(std::stoll(argv[0])); + }" + HAS_STOLL +) + +CHECK_CXX_SOURCE_COMPILES(" + #include + #include + int main(int,char*[]){ + int64_t x = 1; printf(\"%lld\",x); + }" + INT64_IS_LL +) + +CHECK_CXX_SOURCE_COMPILES(" + #ifdef __clang__ + #pragma clang diagnostic push + #pragma clang diagnostic ignored \"-Wdeprecated\" + #pragma clang diagnostic pop + #elif defined(__GNUC__) + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored \"-Wdeprecated\" + #pragma GCC diagnostic pop + #elif defined(_MSC_VER) + #pragma warning( push ) + #pragma warning( disable : 4996 ) + #pragma warning( pop ) + #else + unknownCompiler! 
+ #endif + int main(int, char *[]) {}" + HAS_DIAGNOSTIC_PUSH +) + +CHECK_CXX_SOURCE_COMPILES(" + #include + int main(int, char *[]) { + return std::isnan(1.0f); + }" + HAS_STD_ISNAN +) + +CHECK_CXX_SOURCE_COMPILES(" + #include + int main(int, char *[]) { + std::mutex test_mutex; + std::lock_guard lock_mutex(test_mutex); + }" + HAS_STD_MUTEX +) + +CHECK_CXX_SOURCE_COMPILES(" + #include + std::string func() { + std::string var = \"test\"; + return std::move(var); + } + int main(int, char *[]) {}" + NEEDS_REDUNDANT_MOVE +) + +INCLUDE(CheckCXXSourceRuns) + +CHECK_CXX_SOURCE_RUNS(" + #include + int main(int, char *[]) { + time_t t = -14210715; // 1969-07-20 12:34:45 + struct tm *ptm = gmtime(&t); + return !(ptm && ptm->tm_year == 69); + }" + HAS_PRE_1970 +) + +CHECK_CXX_SOURCE_RUNS(" + #include + #include + int main(int, char *[]) { + setenv(\"TZ\", \"America/Los_Angeles\", 1); + tzset(); + struct tm time2037; + struct tm time2038; + strptime(\"2037-05-05 12:34:56\", \"%Y-%m-%d %H:%M:%S\", &time2037); + strptime(\"2038-05-05 12:34:56\", \"%Y-%m-%d %H:%M:%S\", &time2038); + return mktime(&time2038) - mktime(&time2037) != 31536000; + }" + HAS_POST_2038 +) + +set(CMAKE_REQUIRED_INCLUDES ${ZLIB_INCLUDE_DIR}) +set(CMAKE_REQUIRED_LIBRARIES zlib) +CHECK_CXX_SOURCE_COMPILES(" + #define Z_PREFIX + #include + z_stream strm; + int main(int, char *[]) { + deflateReset(&strm); + }" + NEEDS_Z_PREFIX +) diff --git a/contrib/orc-cmake/CMakeLists.txt b/contrib/orc-cmake/CMakeLists.txt new file mode 100644 index 00000000000..066ba00aede --- /dev/null +++ b/contrib/orc-cmake/CMakeLists.txt @@ -0,0 +1,229 @@ +# modifyed copy of contrib/orc/c++/src/CMakeLists.txt +set(LIBRARY_INCLUDE ${ClickHouse_SOURCE_DIR}/contrib/orc/c++/include) +set(LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/orc/c++/src) + +set(PROTOBUF_INCLUDE_DIR ${Protobuf_INCLUDE_DIR}) +set(PROTOBUF_EXECUTABLE ${Protobuf_PROTOC_EXECUTABLE}) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX11_FLAGS} ${WARN_FLAGS}") + +INCLUDE(CheckCXXSourceCompiles) + +CHECK_CXX_SOURCE_COMPILES(" + #include + #include + int main(int,char*[]){ + int f = open(\"/x/y\", O_RDONLY); + char buf[100]; + return pread(f, buf, 100, 1000) == 0; + }" + HAS_PREAD +) + +CHECK_CXX_SOURCE_COMPILES(" + #include + int main(int,char*[]){ + struct tm time2020; + return !strptime(\"2020-02-02 12:34:56\", \"%Y-%m-%d %H:%M:%S\", &time2020); + }" + HAS_STRPTIME +) + +CHECK_CXX_SOURCE_COMPILES(" + #include + int main(int,char* argv[]){ + return static_cast(std::stoll(argv[0])); + }" + HAS_STOLL +) + +CHECK_CXX_SOURCE_COMPILES(" + #include + #include + int main(int,char*[]){ + int64_t x = 1; printf(\"%lld\",x); + }" + INT64_IS_LL +) + +CHECK_CXX_SOURCE_COMPILES(" + #ifdef __clang__ + #pragma clang diagnostic push + #pragma clang diagnostic ignored \"-Wdeprecated\" + #pragma clang diagnostic pop + #elif defined(__GNUC__) + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored \"-Wdeprecated\" + #pragma GCC diagnostic pop + #elif defined(_MSC_VER) + #pragma warning( push ) + #pragma warning( disable : 4996 ) + #pragma warning( pop ) + #else + unknownCompiler! + #endif + int main(int, char *[]) {}" + HAS_DIAGNOSTIC_PUSH +) + +CHECK_CXX_SOURCE_COMPILES(" + #include + int main(int, char *[]) { + return std::isnan(1.0f); + }" + HAS_STD_ISNAN +) + +CHECK_CXX_SOURCE_COMPILES(" + #include + int main(int, char *[]) { + std::mutex test_mutex; + std::lock_guard lock_mutex(test_mutex); + }" + HAS_STD_MUTEX +) + +CHECK_CXX_SOURCE_COMPILES(" + #include + std::string func() { + std::string var = \"test\"; + return std::move(var); + } + int main(int, char *[]) {}" + NEEDS_REDUNDANT_MOVE +) + +INCLUDE(CheckCXXSourceRuns) + +CHECK_CXX_SOURCE_RUNS(" + #include + int main(int, char *[]) { + time_t t = -14210715; // 1969-07-20 12:34:45 + struct tm *ptm = gmtime(&t); + return !(ptm && ptm->tm_year == 69); + }" + HAS_PRE_1970 +) + +CHECK_CXX_SOURCE_RUNS(" + #include + #include + int main(int, char *[]) { + setenv(\"TZ\", \"America/Los_Angeles\", 1); + tzset(); + struct tm time2037; + struct tm time2038; + strptime(\"2037-05-05 12:34:56\", \"%Y-%m-%d %H:%M:%S\", &time2037); + strptime(\"2038-05-05 12:34:56\", \"%Y-%m-%d %H:%M:%S\", &time2038); + return mktime(&time2038) - mktime(&time2037) != 31536000; + }" + HAS_POST_2038 +) + +set(CMAKE_REQUIRED_INCLUDES ${ZLIB_INCLUDE_DIR}) +set(CMAKE_REQUIRED_LIBRARIES zlib) +CHECK_CXX_SOURCE_COMPILES(" + #define Z_PREFIX + #include + z_stream strm; + int main(int, char *[]) { + deflateReset(&strm); + }" + NEEDS_Z_PREFIX +) + +configure_file ( + "${LIBRARY_DIR}/Adaptor.hh.in" + "${CMAKE_CURRENT_BINARY_DIR}/Adaptor.hh" + ) + + +add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/orc_proto.pb.h ${CMAKE_CURRENT_BINARY_DIR}/orc_proto.pb.cc + COMMAND ${PROTOBUF_EXECUTABLE} + -I${ClickHouse_SOURCE_DIR}/contrib/orc/proto + --cpp_out="${CMAKE_CURRENT_BINARY_DIR}" + "${ClickHouse_SOURCE_DIR}/contrib/orc/proto/orc_proto.proto" +) + +set(SOURCE_FILES + "${CMAKE_CURRENT_BINARY_DIR}/Adaptor.hh" + ${CMAKE_CURRENT_BINARY_DIR}/orc_proto.pb.h + ${LIBRARY_DIR}/io/InputStream.cc + ${LIBRARY_DIR}/io/OutputStream.cc + ${LIBRARY_DIR}/wrap/orc-proto-wrapper.cc + ${LIBRARY_DIR}/Adaptor.cc + ${LIBRARY_DIR}/ByteRLE.cc + ${LIBRARY_DIR}/ColumnPrinter.cc + ${LIBRARY_DIR}/ColumnReader.cc + ${LIBRARY_DIR}/ColumnWriter.cc + ${LIBRARY_DIR}/Common.cc + ${LIBRARY_DIR}/Compression.cc + ${LIBRARY_DIR}/Exceptions.cc + ${LIBRARY_DIR}/Int128.cc + ${LIBRARY_DIR}/LzoDecompressor.cc + 
${LIBRARY_DIR}/MemoryPool.cc + ${LIBRARY_DIR}/OrcFile.cc + ${LIBRARY_DIR}/Reader.cc + ${LIBRARY_DIR}/RLEv1.cc + ${LIBRARY_DIR}/RLEv2.cc + ${LIBRARY_DIR}/RLE.cc + ${LIBRARY_DIR}/Statistics.cc + ${LIBRARY_DIR}/StripeStream.cc + ${LIBRARY_DIR}/Timezone.cc + ${LIBRARY_DIR}/TypeImpl.cc + ${LIBRARY_DIR}/Vector.cc + ${LIBRARY_DIR}/Writer.cc + ) + +if(ORC_CXX_HAS_THREAD_LOCAL AND BUILD_LIBHDFSPP) + set(SOURCE_FILES ${SOURCE_FILES} ${LIBRARY_DIR}/OrcHdfsFile.cc) +endif(ORC_CXX_HAS_THREAD_LOCAL AND BUILD_LIBHDFSPP) + +#list(TRANSFORM SOURCE_FILES PREPEND ${LIBRARY_DIR}/) + +configure_file ( + "${LIBRARY_INCLUDE}/orc/orc-config.hh.in" + "${CMAKE_CURRENT_BINARY_DIR}/orc/orc-config.hh" + ) + +add_library (orc ${SOURCE_FILES}) + +target_include_directories (orc + PRIVATE + ${LIBRARY_INCLUDE} + ${LIBRARY_DIR} + #PUBLIC + ${CMAKE_CURRENT_BINARY_DIR} + PRIVATE + ${PROTOBUF_INCLUDE_DIR} + ${ZLIB_INCLUDE_DIR} + ${SNAPPY_INCLUDE_DIR} + ${LZ4_INCLUDE_DIR} + ${LIBHDFSPP_INCLUDE_DIR} + ) + +target_link_libraries (orc PRIVATE + ${Protobuf_LIBRARY} + ${ZLIB_LIBRARIES} + ${SNAPPY_LIBRARY} + ${LZ4_LIBRARY} + ${LIBHDFSPP_LIBRARIES} + ) + +#install(TARGETS orc DESTINATION lib) + +if(ORC_CXX_HAS_THREAD_LOCAL AND BUILD_LIBHDFSPP) + add_definitions(-DBUILD_LIBHDFSPP) +endif(ORC_CXX_HAS_THREAD_LOCAL AND BUILD_LIBHDFSPP) From e69673df04a9f3742ae5cb521534c5e72f5557e2 Mon Sep 17 00:00:00 2001 From: proller Date: Fri, 6 Sep 2019 21:37:44 +0300 Subject: [PATCH 018/102] Missing files --- CMakeLists.txt | 2 +- cmake/find_hdfs3.cmake | 11 ++--- cmake/find_orc.cmake | 41 +++++++++++++++---- cmake/find_parquet.cmake | 1 + contrib/CMakeLists.txt | 13 ++---- contrib/arrow-cmake/CMakeLists.txt | 24 ++++++----- .../Formats/Impl/ArrowColumnToCHColumn.cpp | 2 +- .../Formats/Impl/ArrowColumnToCHColumn.h | 2 +- 8 files changed, 60 insertions(+), 36 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5330c8daeb5..578e25b8e16 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -343,7 +343,7 @@ include (cmake/find_hyperscan.cmake) include (cmake/find_simdjson.cmake) include (cmake/find_rapidjson.cmake) include (cmake/find_fastops.cmake) -include (cmake/find_orc.cmake) +#include (cmake/find_orc.cmake) find_contrib_lib(cityhash) find_contrib_lib(farmhash) diff --git a/cmake/find_hdfs3.cmake b/cmake/find_hdfs3.cmake index 4c29047fc75..1f2da09f7b4 100644 --- a/cmake/find_hdfs3.cmake +++ b/cmake/find_hdfs3.cmake @@ -1,15 +1,16 @@ if (NOT ARCH_ARM AND NOT OS_FREEBSD AND NOT APPLE AND USE_PROTOBUF) - option (ENABLE_HDFS "Enable HDFS" ${NOT_UNBUNDLED}) + option (ENABLE_HDFS "Enable HDFS" 1) endif () + +if (ENABLE_HDFS) +option(USE_INTERNAL_HDFS3_LIBRARY "Set to FALSE to use system HDFS3 instead of bundled" ${NOT_UNBUNDLED}) + if (ENABLE_HDFS AND NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/libhdfs3/include/hdfs/hdfs.h") message (WARNING "submodule contrib/libhdfs3 is missing. 
to fix try run: \n git submodule update --init --recursive") - set (ENABLE_HDFS 0) + set (USE_INTERNAL_HDFS3_LIBRARY 0) endif () -if (ENABLE_HDFS) -option (USE_INTERNAL_HDFS3_LIBRARY "Set to FALSE to use system HDFS3 instead of bundled" ON) - if (NOT USE_INTERNAL_HDFS3_LIBRARY) find_package(hdfs3) endif () diff --git a/cmake/find_orc.cmake b/cmake/find_orc.cmake index b9f758c8090..50e563b04b4 100644 --- a/cmake/find_orc.cmake +++ b/cmake/find_orc.cmake @@ -1,15 +1,38 @@ -##TODO replace hardcode to find procedure +option (ENABLE_ORC "Enable ORC" 1) -set(USE_ORC 0) -set(USE_INTERNAL_ORC_LIBRARY ON) +if(ENABLE_ORC) +option (USE_INTERNAL_ORC_LIBRARY "Set to FALSE to use system ORC instead of bundled" ${NOT_UNBUNDLED}) -if (ARROW_LIBRARY) +if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/orc/c++/include/orc/OrcFile.hh") + if(USE_INTERNAL_ORC_LIBRARY) + message(WARNING "submodule contrib/orc is missing. to fix try run: \n git submodule update --init --recursive") + set(USE_INTERNAL_ORC_LIBRARY 0) + endif() + set(MISSING_INTERNAL_ORC_LIBRARY 1) +endif () + +if (NOT USE_INTERNAL_ORC_LIBRARY) + find_package(orc) +endif () + +#if (USE_INTERNAL_ORC_LIBRARY) +#find_path(CYRUS_SASL_INCLUDE_DIR sasl/sasl.h) +#find_library(CYRUS_SASL_SHARED_LIB sasl2) +#if (NOT CYRUS_SASL_INCLUDE_DIR OR NOT CYRUS_SASL_SHARED_LIB) +# set(USE_ORC 0) +#endif() +#endif() + +if (ORC_LIBRARY AND ORC_INCLUDE_DIR) set(USE_ORC 1) +elseif(NOT MISSING_INTERNAL_ORC_LIBRARY AND ARROW_LIBRARY) # (LIBGSASL_LIBRARY AND LIBXML2_LIBRARY) + set(ORC_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/orc/c++/include") + set(ORC_LIBRARY orc) + set(USE_ORC 1) +else() + set(USE_INTERNAL_ORC_LIBRARY 0) endif() - -find_path(CYRUS_SASL_INCLUDE_DIR sasl/sasl.h) -find_library(CYRUS_SASL_SHARED_LIB sasl2) -if (NOT CYRUS_SASL_INCLUDE_DIR OR NOT CYRUS_SASL_SHARED_LIB) - set(USE_ORC 0) endif() + +message (STATUS "Using internal=${USE_INTERNAL_ORC_LIBRARY} orc=${USE_ORC}: ${ORC_INCLUDE_DIR} : ${ORC_LIBRARY}") diff --git a/cmake/find_parquet.cmake b/cmake/find_parquet.cmake index 63f589a9ea5..5c5bc664113 100644 --- a/cmake/find_parquet.cmake +++ b/cmake/find_parquet.cmake @@ -62,6 +62,7 @@ elseif(NOT MISSING_INTERNAL_PARQUET_LIBRARY AND NOT OS_FREEBSD) endif() set(USE_PARQUET 1) + set(USE_ORC 1) endif() endif() diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index ed537f755a1..54fdc4d69e0 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -10,15 +10,6 @@ endif () set_property(DIRECTORY PROPERTY EXCLUDE_FROM_ALL 1) -if (USE_INTERNAL_ORC_LIBRARY) - set(BUILD_JAVA OFF CACHE INTERNAL "") - set(ANALYZE_JAVA OFF CACHE INTERNAL "") - set(BUILD_CPP_TESTS OFF CACHE INTERNAL "") - set(BUILD_TOOLS OFF CACHE INTERNAL "") - set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/contrib/orc/cmake_modules") - add_subdirectory(orc) -endif() - if (USE_INTERNAL_BOOST_LIBRARY) add_subdirectory (boost-cmake) endif () @@ -323,3 +314,7 @@ endif() if (USE_FASTOPS) add_subdirectory (fastops-cmake) endif() + +#if (USE_INTERNAL_ORC_LIBRARY) +# add_subdirectory(orc-cmake) +#endif () diff --git a/contrib/arrow-cmake/CMakeLists.txt b/contrib/arrow-cmake/CMakeLists.txt index ba1ddc2414a..cfd57f2b296 100644 --- a/contrib/arrow-cmake/CMakeLists.txt +++ b/contrib/arrow-cmake/CMakeLists.txt @@ -56,11 +56,11 @@ set(ORC_SOURCE_WRAP_DIR ${ORC_SOURCE_DIR}/wrap) set(ORC_BUILD_SRC_DIR ${CMAKE_CURRENT_BINARY_DIR}/../orc/c++/src) set(ORC_BUILD_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/../orc/c++/include) -set(GOOGLE_PROTOBUF_DIR 
${ClickHouse_SOURCE_DIR}/contrib/protobuf/src/) +set(GOOGLE_PROTOBUF_DIR ${Protobuf_INCLUDE_DIR}/) set(ORC_ADDITION_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}) set(ARROW_SRC_DIR ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src) -set(PROTOBUF_EXECUTABLE ${CMAKE_CURRENT_BINARY_DIR}/../protobuf/cmake/protoc) +set(PROTOBUF_EXECUTABLE ${Protobuf_PROTOC_EXECUTABLE}) set(PROTO_DIR ${ORC_SOURCE_DIR}/../proto) @@ -70,14 +70,10 @@ add_custom_command(OUTPUT orc_proto.pb.h orc_proto.pb.cc --cpp_out="${CMAKE_CURRENT_BINARY_DIR}" "${PROTO_DIR}/orc_proto.proto") -include_directories(SYSTEM ${ORC_INCLUDE_DIR}) -include_directories(SYSTEM ${ORC_SOURCE_SRC_DIR}) -include_directories(SYSTEM ${ORC_SOURCE_WRAP_DIR}) -include_directories(SYSTEM ${GOOGLE_PROTOBUF_DIR}) -include_directories(SYSTEM ${ORC_BUILD_SRC_DIR}) -include_directories(SYSTEM ${ORC_BUILD_INCLUDE_DIR}) -include_directories(SYSTEM ${ORC_ADDITION_SOURCE_DIR}) -include_directories(SYSTEM ${ARROW_SRC_DIR}) +include(${ClickHouse_SOURCE_DIR}/contrib/orc/cmake_modules/CheckSourceCompiles.cmake) +include(orc_check.cmake) +configure_file("${ORC_INCLUDE_DIR}/orc/orc-config.hh.in" "${ORC_BUILD_INCLUDE_DIR}/orc/orc-config.hh") +configure_file("${ORC_SOURCE_SRC_DIR}/Adaptor.hh.in" "${ORC_BUILD_INCLUDE_DIR}/Adaptor.hh") set(ORC_SRCS @@ -232,6 +228,14 @@ if (ARROW_WITH_ZSTD) target_link_libraries(${ARROW_LIBRARY} PRIVATE ${ZSTD_LIBRARY}) endif() +target_include_directories(${ARROW_LIBRARY} PRIVATE SYSTEM ${ORC_INCLUDE_DIR}) +target_include_directories(${ARROW_LIBRARY} PRIVATE SYSTEM ${ORC_SOURCE_SRC_DIR}) +target_include_directories(${ARROW_LIBRARY} PRIVATE SYSTEM ${ORC_SOURCE_WRAP_DIR}) +target_include_directories(${ARROW_LIBRARY} PRIVATE SYSTEM ${GOOGLE_PROTOBUF_DIR}) +target_include_directories(${ARROW_LIBRARY} PRIVATE SYSTEM ${ORC_BUILD_SRC_DIR}) +target_include_directories(${ARROW_LIBRARY} PRIVATE SYSTEM ${ORC_BUILD_INCLUDE_DIR}) +target_include_directories(${ARROW_LIBRARY} PRIVATE SYSTEM ${ORC_ADDITION_SOURCE_DIR}) +target_include_directories(${ARROW_LIBRARY} PRIVATE SYSTEM ${ARROW_SRC_DIR}) # === parquet diff --git a/dbms/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/dbms/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index 0cd5ffb03e0..edb8d5c15f4 100644 --- a/dbms/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/dbms/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -1,7 +1,7 @@ #include "config_formats.h" #include "ArrowColumnToCHColumn.h" -#if USE_ORC or USE_PARQUET +#if USE_ORC || USE_PARQUET #include #include #include diff --git a/dbms/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h b/dbms/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h index b5f4732d107..34b58a80091 100644 --- a/dbms/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h +++ b/dbms/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h @@ -1,6 +1,6 @@ #include "config_formats.h" -#if USE_ORC or USE_PARQUET +#if USE_ORC || USE_PARQUET #include #include From 9b722522e4f5a2e0312d6989ef163f878979e74b Mon Sep 17 00:00:00 2001 From: proller Date: Fri, 6 Sep 2019 23:15:00 +0300 Subject: [PATCH 019/102] Try fix --- cmake/find_hdfs3.cmake | 6 +++--- contrib/libhdfs3-cmake/CMakeLists.txt | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cmake/find_hdfs3.cmake b/cmake/find_hdfs3.cmake index 1f2da09f7b4..ab7c28c6c1e 100644 --- a/cmake/find_hdfs3.cmake +++ b/cmake/find_hdfs3.cmake @@ -2,17 +2,17 @@ if (NOT ARCH_ARM AND NOT OS_FREEBSD AND NOT APPLE AND USE_PROTOBUF) option (ENABLE_HDFS "Enable HDFS" 1) endif () - if (ENABLE_HDFS) 
option(USE_INTERNAL_HDFS3_LIBRARY "Set to FALSE to use system HDFS3 instead of bundled" ${NOT_UNBUNDLED}) -if (ENABLE_HDFS AND NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/libhdfs3/include/hdfs/hdfs.h") +if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/libhdfs3/include/hdfs/hdfs.h") message (WARNING "submodule contrib/libhdfs3 is missing. to fix try run: \n git submodule update --init --recursive") set (USE_INTERNAL_HDFS3_LIBRARY 0) endif () if (NOT USE_INTERNAL_HDFS3_LIBRARY) - find_package(hdfs3) + find_library(HDFS3_LIBRARY hdfs3) + find_path(HDFS3_INCLUDE_DIR NAMES hdfs/hdfs.h PATHS ${HDFS3_INCLUDE_PATHS}) endif () if (HDFS3_LIBRARY AND HDFS3_INCLUDE_DIR) diff --git a/contrib/libhdfs3-cmake/CMakeLists.txt b/contrib/libhdfs3-cmake/CMakeLists.txt index 8ec14f897b9..e1ba7225b0f 100644 --- a/contrib/libhdfs3-cmake/CMakeLists.txt +++ b/contrib/libhdfs3-cmake/CMakeLists.txt @@ -199,17 +199,17 @@ if (WITH_KERBEROS) endif() target_include_directories(hdfs3 PRIVATE ${LIBXML2_INCLUDE_DIR}) -target_link_libraries(hdfs3 ${LIBGSASL_LIBRARY}) +target_link_libraries(hdfs3 PRIVATE ${LIBGSASL_LIBRARY}) if (WITH_KERBEROS) - target_link_libraries(hdfs3 ${KERBEROS_LIBRARIES}) + target_link_libraries(hdfs3 PRIVATE ${KERBEROS_LIBRARIES}) endif() -target_link_libraries(hdfs3 ${LIBXML2_LIBRARY}) +target_link_libraries(hdfs3 PRIVATE ${LIBXML2_LIBRARY}) # inherit from parent cmake target_include_directories(hdfs3 PRIVATE ${Boost_INCLUDE_DIRS}) target_include_directories(hdfs3 PRIVATE ${Protobuf_INCLUDE_DIR}) -target_link_libraries(hdfs3 ${Protobuf_LIBRARY}) +target_link_libraries(hdfs3 PRIVATE ${Protobuf_LIBRARY}) if(OPENSSL_INCLUDE_DIR AND OPENSSL_LIBRARIES) target_include_directories(hdfs3 PRIVATE ${OPENSSL_INCLUDE_DIR}) - target_link_libraries(hdfs3 ${OPENSSL_LIBRARIES}) + target_link_libraries(hdfs3 PRIVATE ${OPENSSL_LIBRARIES}) endif() From 6f35d6946377538d7aa57125d0274051f87838d2 Mon Sep 17 00:00:00 2001 From: proller Date: Mon, 9 Sep 2019 16:12:54 +0300 Subject: [PATCH 020/102] fix hdfs --- cmake/find_hdfs3.cmake | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/cmake/find_hdfs3.cmake b/cmake/find_hdfs3.cmake index ab7c28c6c1e..9c593d3266a 100644 --- a/cmake/find_hdfs3.cmake +++ b/cmake/find_hdfs3.cmake @@ -1,25 +1,29 @@ -if (NOT ARCH_ARM AND NOT OS_FREEBSD AND NOT APPLE AND USE_PROTOBUF) - option (ENABLE_HDFS "Enable HDFS" 1) -endif () +if(NOT ARCH_ARM AND NOT OS_FREEBSD AND NOT APPLE AND USE_PROTOBUF) + option(ENABLE_HDFS "Enable HDFS" 1) +endif() -if (ENABLE_HDFS) +if(ENABLE_HDFS) option(USE_INTERNAL_HDFS3_LIBRARY "Set to FALSE to use system HDFS3 instead of bundled" ${NOT_UNBUNDLED}) -if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/libhdfs3/include/hdfs/hdfs.h") - message (WARNING "submodule contrib/libhdfs3 is missing. to fix try run: \n git submodule update --init --recursive") - set (USE_INTERNAL_HDFS3_LIBRARY 0) -endif () +if(NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/libhdfs3/include/hdfs/hdfs.h") + if(USE_INTERNAL_HDFS3_LIBRARY) + message(WARNING "submodule contrib/libhdfs3 is missing. 
to fix try run: \n git submodule update --init --recursive") + endif() + set(MISSING_INTERNAL_HDFS3_LIBRARY 1) + set(USE_INTERNAL_HDFS3_LIBRARY 0) +endif() -if (NOT USE_INTERNAL_HDFS3_LIBRARY) +if(NOT USE_INTERNAL_HDFS3_LIBRARY) find_library(HDFS3_LIBRARY hdfs3) find_path(HDFS3_INCLUDE_DIR NAMES hdfs/hdfs.h PATHS ${HDFS3_INCLUDE_PATHS}) -endif () +endif() -if (HDFS3_LIBRARY AND HDFS3_INCLUDE_DIR) +if(HDFS3_LIBRARY AND HDFS3_INCLUDE_DIR) set(USE_HDFS 1) -elseif (LIBGSASL_LIBRARY AND LIBXML2_LIBRARY) +elseif(NOT MISSING_INTERNAL_HDFS3_LIBRARY AND LIBGSASL_LIBRARY AND LIBXML2_LIBRARY) set(HDFS3_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/libhdfs3/include") set(HDFS3_LIBRARY hdfs3) + set(USE_INTERNAL_HDFS3_LIBRARY 1) set(USE_HDFS 1) else() set(USE_INTERNAL_HDFS3_LIBRARY 0) @@ -27,4 +31,4 @@ endif() endif() -message (STATUS "Using hdfs3=${USE_HDFS}: ${HDFS3_INCLUDE_DIR} : ${HDFS3_LIBRARY}") +message(STATUS "Using hdfs3=${USE_HDFS}: ${HDFS3_INCLUDE_DIR} : ${HDFS3_LIBRARY}") From cec1cf068d690bd168270a2b2309dd634bfedd3f Mon Sep 17 00:00:00 2001 From: proller Date: Mon, 9 Sep 2019 18:37:17 +0300 Subject: [PATCH 021/102] Fix llvm 7.1 find --- cmake/find_llvm.cmake | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/cmake/find_llvm.cmake b/cmake/find_llvm.cmake index 3692a98b979..c668416c0c0 100644 --- a/cmake/find_llvm.cmake +++ b/cmake/find_llvm.cmake @@ -18,22 +18,12 @@ if (ENABLE_EMBEDDED_COMPILER) elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") find_package(LLVM ${CMAKE_CXX_COMPILER_VERSION} CONFIG PATHS ${LLVM_PATHS}) else () - #TODO: - #if(NOT LLVM_FOUND) - # find_package(LLVM 9 CONFIG PATHS ${LLVM_PATHS}) - #endif() - #if(NOT LLVM_FOUND) - # find_package(LLVM 8 CONFIG PATHS ${LLVM_PATHS}) - #endif() - if (NOT LLVM_FOUND) - find_package (LLVM 7 CONFIG PATHS ${LLVM_PATHS}) - endif () - if (NOT LLVM_FOUND) - find_package (LLVM 6 CONFIG PATHS ${LLVM_PATHS}) - endif () - if (NOT LLVM_FOUND) - find_package (LLVM 5 CONFIG PATHS ${LLVM_PATHS}) - endif () + # TODO: 9 8 + foreach(llvm_v 7.1 7 6 5) + if (NOT LLVM_FOUND) + find_package (LLVM ${llvm_v} CONFIG PATHS ${LLVM_PATHS}) + endif () + endforeach () endif () if (LLVM_FOUND) From 1b313eedfadd841b0033fde91118e3fee01f9c88 Mon Sep 17 00:00:00 2001 From: chertus Date: Mon, 9 Sep 2019 22:43:37 +0300 Subject: [PATCH 022/102] MergeJoin in progress --- dbms/src/Core/Settings.h | 1 + .../CreatingSetsBlockInputStream.cpp | 11 +---- dbms/src/Functions/FunctionJoinGet.h | 9 ++-- dbms/src/Interpreters/AnalyzedJoin.cpp | 41 +++++++++---------- dbms/src/Interpreters/AnalyzedJoin.h | 41 +++++++++++++++---- dbms/src/Interpreters/ExpressionActions.cpp | 6 +-- dbms/src/Interpreters/ExpressionAnalyzer.cpp | 33 +++++++-------- dbms/src/Interpreters/ExpressionAnalyzer.h | 8 +--- dbms/src/Interpreters/IJoin.h | 32 +++++++++++++++ dbms/src/Interpreters/Join.cpp | 29 ++++++------- dbms/src/Interpreters/Join.h | 36 +++++++--------- dbms/src/Interpreters/MergeJoin.cpp | 40 ++++++++++++++++++ dbms/src/Interpreters/MergeJoin.h | 36 ++++++++++++++++ dbms/src/Interpreters/SubqueryForSet.cpp | 24 +++++++++++ dbms/src/Interpreters/SubqueryForSet.h | 14 ++++--- dbms/src/Interpreters/SyntaxAnalyzer.cpp | 3 +- .../Transforms/CreatingSetsTransform.cpp | 7 +--- dbms/src/Storages/StorageJoin.cpp | 10 ++--- dbms/src/Storages/StorageJoin.h | 8 ++-- 19 files changed, 262 insertions(+), 127 deletions(-) create mode 100644 dbms/src/Interpreters/IJoin.h create mode 100644 dbms/src/Interpreters/MergeJoin.cpp create mode 100644 
dbms/src/Interpreters/MergeJoin.h diff --git a/dbms/src/Core/Settings.h b/dbms/src/Core/Settings.h index 0678aaeedc6..7798a5b062b 100644 --- a/dbms/src/Core/Settings.h +++ b/dbms/src/Core/Settings.h @@ -287,6 +287,7 @@ struct Settings : public SettingsCollection M(SettingUInt64, max_bytes_in_join, 0, "Maximum size of the hash table for JOIN (in number of bytes in memory).") \ M(SettingOverflowMode, join_overflow_mode, OverflowMode::THROW, "What to do when the limit is exceeded.") \ M(SettingBool, join_any_take_last_row, false, "When disabled (default) ANY JOIN will take the first found row for a key. When enabled, it will take the last row seen if there are multiple rows for the same key.") \ + M(SettingBool, prefer_merge_join, false, "Use merge join algorithm instead of hash join if possible.") \ \ M(SettingUInt64, max_rows_to_transfer, 0, "Maximum size (in rows) of the transmitted external table obtained when the GLOBAL IN/JOIN section is executed.") \ M(SettingUInt64, max_bytes_to_transfer, 0, "Maximum size (in uncompressed bytes) of the transmitted external table obtained when the GLOBAL IN/JOIN section is executed.") \ diff --git a/dbms/src/DataStreams/CreatingSetsBlockInputStream.cpp b/dbms/src/DataStreams/CreatingSetsBlockInputStream.cpp index 9255527d072..14912b4ebc3 100644 --- a/dbms/src/DataStreams/CreatingSetsBlockInputStream.cpp +++ b/dbms/src/DataStreams/CreatingSetsBlockInputStream.cpp @@ -1,5 +1,4 @@ #include -#include #include #include #include @@ -124,12 +123,7 @@ void CreatingSetsBlockInputStream::createOne(SubqueryForSet & subquery) if (!done_with_join) { - subquery.renameColumns(block); - - if (subquery.joined_block_actions) - subquery.joined_block_actions->execute(block); - - if (!subquery.join->insertFromBlock(block)) + if (!subquery.insertJoinedBlock(block)) done_with_join = true; } @@ -162,8 +156,7 @@ void CreatingSetsBlockInputStream::createOne(SubqueryForSet & subquery) head_rows = profile_info.rows; - if (subquery.join) - subquery.join->setTotals(subquery.source->getTotals()); + subquery.setTotals(); if (head_rows != 0) { diff --git a/dbms/src/Functions/FunctionJoinGet.h b/dbms/src/Functions/FunctionJoinGet.h index 9885b05657d..a81981fe624 100644 --- a/dbms/src/Functions/FunctionJoinGet.h +++ b/dbms/src/Functions/FunctionJoinGet.h @@ -4,17 +4,18 @@ namespace DB { + class Context; class Join; -using JoinPtr = std::shared_ptr; +using HashJoinPtr = std::shared_ptr; class FunctionJoinGet final : public IFunction { public: static constexpr auto name = "joinGet"; - FunctionJoinGet( - TableStructureReadLockHolder table_lock_, StoragePtr storage_join_, JoinPtr join_, const String & attr_name_, DataTypePtr return_type_) + FunctionJoinGet(TableStructureReadLockHolder table_lock_, StoragePtr storage_join_, HashJoinPtr join_, const String & attr_name_, + DataTypePtr return_type_) : table_lock(std::move(table_lock_)) , storage_join(std::move(storage_join_)) , join(std::move(join_)) @@ -36,7 +37,7 @@ private: private: TableStructureReadLockHolder table_lock; StoragePtr storage_join; - JoinPtr join; + HashJoinPtr join; const String attr_name; DataTypePtr return_type; }; diff --git a/dbms/src/Interpreters/AnalyzedJoin.cpp b/dbms/src/Interpreters/AnalyzedJoin.cpp index 7deb21d0dcc..02e75f3a342 100644 --- a/dbms/src/Interpreters/AnalyzedJoin.cpp +++ b/dbms/src/Interpreters/AnalyzedJoin.cpp @@ -2,11 +2,13 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -16,6 +18,17 @@ namespace DB { +namespace ErrorCodes +{ + extern const int 
LOGICAL_ERROR; +} + +AnalyzedJoin::AnalyzedJoin(const Settings & settings) + : size_limits(SizeLimits{settings.max_rows_in_join, settings.max_bytes_in_join, settings.join_overflow_mode}) + , join_use_nulls(settings.join_use_nulls) + , prefer_merge_join(settings.prefer_merge_join) +{} + void AnalyzedJoin::addUsingKey(const ASTPtr & ast) { key_names_left.push_back(ast->getColumnName()); @@ -210,36 +223,22 @@ bool AnalyzedJoin::sameJoin(const AnalyzedJoin * x, const AnalyzedJoin * y) && x->key_names_left == y->key_names_left && x->key_names_right == y->key_names_right && x->columns_added_by_join == y->columns_added_by_join - && x->hash_join == y->hash_join; + && x->join == y->join; } BlockInputStreamPtr AnalyzedJoin::createStreamWithNonJoinedDataIfFullOrRightJoin(const Block & source_header, UInt64 max_block_size) const { if (isRightOrFull(table_join.kind)) - return hash_join->createStreamWithNonJoinedRows(source_header, *this, max_block_size); + if (auto hash_join = typeid_cast(join.get())) + return hash_join->createStreamWithNonJoinedRows(source_header, *this, max_block_size); return {}; } -JoinPtr AnalyzedJoin::makeHashJoin(const Block & sample_block, const SizeLimits & size_limits_for_join) const +JoinPtr AnalyzedJoin::makeJoin(const Block & right_sample_block) const { - auto join = std::make_shared(key_names_right, join_use_nulls, size_limits_for_join, table_join.kind, table_join.strictness); - join->setSampleBlock(sample_block); - return join; -} - -void AnalyzedJoin::joinBlock(Block & block) const -{ - hash_join->joinBlock(block, *this); -} - -void AnalyzedJoin::joinTotals(Block & block) const -{ - hash_join->joinTotals(block); -} - -bool AnalyzedJoin::hasTotals() const -{ - return hash_join->hasTotals(); + if (prefer_merge_join) + return std::make_shared(*this, right_sample_block); + return std::make_shared(*this, right_sample_block); } NamesAndTypesList getNamesAndTypeListFromTableExpression(const ASTTableExpression & table_expression, const Context & context) diff --git a/dbms/src/Interpreters/AnalyzedJoin.h b/dbms/src/Interpreters/AnalyzedJoin.h index bea430de479..960457dddd9 100644 --- a/dbms/src/Interpreters/AnalyzedJoin.h +++ b/dbms/src/Interpreters/AnalyzedJoin.h @@ -4,7 +4,9 @@ #include #include #include +#include #include +#include #include #include @@ -17,8 +19,7 @@ class ASTSelectQuery; struct DatabaseAndTableWithAlias; class Block; -class Join; -using JoinPtr = std::shared_ptr; +struct Settings; class AnalyzedJoin { @@ -36,12 +37,15 @@ class AnalyzedJoin friend class SyntaxAnalyzer; + const SizeLimits size_limits; + const bool join_use_nulls; + const bool prefer_merge_join; + Names key_names_left; Names key_names_right; /// Duplicating names are qualified. ASTs key_asts_left; ASTs key_asts_right; ASTTableJoin table_join; - bool join_use_nulls = false; /// All columns which can be read from joined table. Duplicating names are qualified. NamesAndTypesList columns_from_joined_table; @@ -53,9 +57,28 @@ class AnalyzedJoin /// Original name -> name. Only ranamed columns. 
std::unordered_map renames; - JoinPtr hash_join; + JoinPtr join; public: + AnalyzedJoin(const Settings &); + + /// for StorageJoin + AnalyzedJoin(SizeLimits limits, bool use_nulls, ASTTableJoin::Kind kind, ASTTableJoin::Strictness strictness, + const Names & key_names_right_) + : size_limits(limits) + , join_use_nulls(use_nulls) + , prefer_merge_join(false) + , key_names_right(key_names_right_) + { + table_join.kind = kind; + table_join.strictness = strictness; + } + + ASTTableJoin::Kind kind() const { return table_join.kind; } + ASTTableJoin::Strictness strictness() const { return table_join.strictness; } + const SizeLimits & sizeLimits() const { return size_limits; } + bool joinUseNulls() const { return join_use_nulls; } + void addUsingKey(const ASTPtr & ast); void addOnKeys(ASTPtr & left_table_ast, ASTPtr & right_table_ast); @@ -78,15 +101,15 @@ public: Names requiredJoinedNames() const; const Names & keyNamesLeft() const { return key_names_left; } + const Names & keyNamesRight() const { return key_names_right; } const NamesAndTypesList & columnsFromJoinedTable() const { return columns_from_joined_table; } const NamesAndTypesList & columnsAddedByJoin() const { return columns_added_by_join; } - void setHashJoin(JoinPtr join) { hash_join = join; } - JoinPtr makeHashJoin(const Block & sample_block, const SizeLimits & size_limits_for_join) const; + JoinPtr getJoin() const { return join; } + void setJoin(const JoinPtr & join_) { join = join_; } + + JoinPtr makeJoin(const Block & right_sample_block) const; BlockInputStreamPtr createStreamWithNonJoinedDataIfFullOrRightJoin(const Block & source_header, UInt64 max_block_size) const; - void joinBlock(Block & block) const; - void joinTotals(Block & block) const; - bool hasTotals() const; static bool sameJoin(const AnalyzedJoin * x, const AnalyzedJoin * y); }; diff --git a/dbms/src/Interpreters/ExpressionActions.cpp b/dbms/src/Interpreters/ExpressionActions.cpp index d6c38417899..3638fba687d 100644 --- a/dbms/src/Interpreters/ExpressionActions.cpp +++ b/dbms/src/Interpreters/ExpressionActions.cpp @@ -475,7 +475,7 @@ void ExpressionAction::execute(Block & block, bool dry_run) const case JOIN: { - table_join->joinBlock(block); + table_join->getJoin()->joinBlock(block); break; } @@ -543,7 +543,7 @@ void ExpressionAction::executeOnTotals(Block & block) const if (type != JOIN) execute(block, false); else - table_join->joinTotals(block); + table_join->getJoin()->joinTotals(block); } @@ -763,7 +763,7 @@ void ExpressionActions::execute(Block & block, bool dry_run) const bool ExpressionActions::hasTotalsInJoin() const { for (const auto & action : actions) - if (action.table_join && action.table_join->hasTotals()) + if (action.table_join && action.table_join->getJoin()->hasTotals()) return true; return false; } diff --git a/dbms/src/Interpreters/ExpressionAnalyzer.cpp b/dbms/src/Interpreters/ExpressionAnalyzer.cpp index 9777e3d508d..2a87cef1152 100644 --- a/dbms/src/Interpreters/ExpressionAnalyzer.cpp +++ b/dbms/src/Interpreters/ExpressionAnalyzer.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -470,34 +471,34 @@ void SelectQueryExpressionAnalyzer::makeTableJoin(const ASTTablesInSelectQueryEl auto join_hash = join_element.getTreeHash(); String join_subquery_id = toString(join_hash.first) + "_" + toString(join_hash.second); - SubqueryForSet & subquery_for_set = subqueries_for_sets[join_subquery_id]; + SubqueryForSet & subquery_for_join = subqueries_for_sets[join_subquery_id]; /// Special case - if table name is specified on the 
right of JOIN, then the table has the type Join (the previously prepared mapping). - if (!subquery_for_set.join) - subquery_for_set.join = tryGetStorageJoin(join_element, context); + if (!subquery_for_join.join) + subquery_for_join.join = tryGetStorageJoin(join_element, context); - if (!subquery_for_set.join) + if (!subquery_for_join.join) { /// Actions which need to be calculated on joined block. ExpressionActionsPtr joined_block_actions = createJoinedBlockActions(context, analyzedJoin()); - if (!subquery_for_set.source) - makeSubqueryForJoin(join_element, joined_block_actions, subquery_for_set); - - /// Test actions on sample block (early error detection) - Block sample_block = subquery_for_set.renamedSampleBlock(); - joined_block_actions->execute(sample_block); + if (!subquery_for_join.source) + { + NamesWithAliases required_columns_with_aliases = + analyzedJoin().getRequiredColumns(joined_block_actions->getSampleBlock(), joined_block_actions->getRequiredColumns()); + makeSubqueryForJoin(join_element, std::move(required_columns_with_aliases), subquery_for_join); + } /// TODO You do not need to set this up when JOIN is only needed on remote servers. - subquery_for_set.join = analyzedJoin().makeHashJoin(sample_block, settings.size_limits_for_join); - subquery_for_set.joined_block_actions = joined_block_actions; + subquery_for_join.setJoinActions(joined_block_actions); /// changes subquery_for_join.sample_block inside + subquery_for_join.join = analyzedJoin().makeJoin(subquery_for_join.sample_block); } - syntax->analyzed_join->setHashJoin(subquery_for_set.join); + syntax->analyzed_join->setJoin(subquery_for_join.join); } void SelectQueryExpressionAnalyzer::makeSubqueryForJoin(const ASTTablesInSelectQueryElement & join_element, - const ExpressionActionsPtr & joined_block_actions, + NamesWithAliases && required_columns_with_aliases, SubqueryForSet & subquery_for_set) const { /** For GLOBAL JOINs (in the case, for example, of the push method for executing GLOBAL subqueries), the following occurs @@ -505,10 +506,6 @@ void SelectQueryExpressionAnalyzer::makeSubqueryForJoin(const ASTTablesInSelectQ * in the subquery_for_set object this subquery is exposed as source and the temporary table _data1 as the `table`. * - this function shows the expression JOIN _data1. 
*/ - - NamesWithAliases required_columns_with_aliases = - analyzedJoin().getRequiredColumns(joined_block_actions->getSampleBlock(), joined_block_actions->getRequiredColumns()); - Names original_columns; for (auto & pr : required_columns_with_aliases) original_columns.push_back(pr.first); diff --git a/dbms/src/Interpreters/ExpressionAnalyzer.h b/dbms/src/Interpreters/ExpressionAnalyzer.h index 9356046aee3..33c974e29d9 100644 --- a/dbms/src/Interpreters/ExpressionAnalyzer.h +++ b/dbms/src/Interpreters/ExpressionAnalyzer.h @@ -58,15 +58,11 @@ private: struct ExtractedSettings { const bool use_index_for_in_with_subqueries; - const bool join_use_nulls; const SizeLimits size_limits_for_set; - const SizeLimits size_limits_for_join; ExtractedSettings(const Settings & settings_) : use_index_for_in_with_subqueries(settings_.use_index_for_in_with_subqueries), - join_use_nulls(settings_.join_use_nulls), - size_limits_for_set(settings_.max_rows_in_set, settings_.max_bytes_in_set, settings_.set_overflow_mode), - size_limits_for_join(settings_.max_rows_in_join, settings_.max_bytes_in_join, settings_.join_overflow_mode) + size_limits_for_set(settings_.max_rows_in_set, settings_.max_bytes_in_set, settings_.set_overflow_mode) {} }; @@ -220,7 +216,7 @@ private: void tryMakeSetForIndexFromSubquery(const ASTPtr & subquery_or_table_name); void makeTableJoin(const ASTTablesInSelectQueryElement & join_element); - void makeSubqueryForJoin(const ASTTablesInSelectQueryElement & join_element, const ExpressionActionsPtr & joined_block_actions, + void makeSubqueryForJoin(const ASTTablesInSelectQueryElement & join_element, NamesWithAliases && required_columns_with_aliases, SubqueryForSet & subquery_for_set) const; const ASTSelectQuery * getAggregatingQuery() const; diff --git a/dbms/src/Interpreters/IJoin.h b/dbms/src/Interpreters/IJoin.h new file mode 100644 index 00000000000..db28845de93 --- /dev/null +++ b/dbms/src/Interpreters/IJoin.h @@ -0,0 +1,32 @@ +#pragma once + +#include + +namespace DB +{ + +class Block; + +class IJoin +{ +public: + virtual ~IJoin() = default; + + /// Add block of data from right hand of JOIN. + /// @returns false, if some limit was exceeded and you should not insert more data. + virtual bool addJoinedBlock(const Block & block) = 0; + + /// Join the block with data from left hand of JOIN to the right hand data (that was previously built by calls to addJoinedBlock). + /// Could be called from different threads in parallel. 
+ virtual void joinBlock(Block & block) = 0; + + virtual bool hasTotals() const { return false; } + virtual void setTotals(const Block & block) = 0; + virtual void joinTotals(Block & block) const = 0; + + virtual size_t getTotalRowCount() const { return 0; } +}; + +using JoinPtr = std::shared_ptr; + +} diff --git a/dbms/src/Interpreters/Join.cpp b/dbms/src/Interpreters/Join.cpp index 855b0d284e1..cb4d68663ac 100644 --- a/dbms/src/Interpreters/Join.cpp +++ b/dbms/src/Interpreters/Join.cpp @@ -83,15 +83,16 @@ static ColumnWithTypeAndName correctNullability(ColumnWithTypeAndName && column, } -Join::Join(const Names & key_names_right_, bool use_nulls_, const SizeLimits & limits_, - ASTTableJoin::Kind kind_, ASTTableJoin::Strictness strictness_, bool any_take_last_row_) - : kind(kind_), strictness(strictness_), - key_names_right(key_names_right_), - use_nulls(use_nulls_), - any_take_last_row(any_take_last_row_), - log(&Logger::get("Join")), - limits(limits_) +Join::Join(const AnalyzedJoin & join_options_, const Block & right_sample_block, bool any_take_last_row_) + : join_options(join_options_) + , kind(join_options_.kind()) + , strictness(join_options_.strictness()) + , key_names_right(join_options_.keyNamesRight()) + , use_nulls(join_options_.joinUseNulls()) + , any_take_last_row(any_take_last_row_) + , log(&Logger::get("Join")) { + setSampleBlock(right_sample_block); } @@ -269,7 +270,7 @@ size_t Join::getTotalByteCount() const void Join::setSampleBlock(const Block & block) { - std::unique_lock lock(rwlock); + //std::unique_lock lock(rwlock); LOG_DEBUG(log, "setSampleBlock: " << block.dumpStructure()); if (!empty()) @@ -504,7 +505,7 @@ void Join::prepareBlockListStructure(Block & stored_block) } } -bool Join::insertFromBlock(const Block & block) +bool Join::addJoinedBlock(const Block & block) { std::unique_lock lock(rwlock); @@ -570,7 +571,7 @@ bool Join::insertFromBlock(const Block & block) blocks_nullmaps.emplace_back(stored_block, null_map_holder); } - return limits.check(getTotalRowCount(), getTotalByteCount(), "JOIN", ErrorCodes::SET_SIZE_LIMIT_EXCEEDED); + return join_options.sizeLimits().check(getTotalRowCount(), getTotalByteCount(), "JOIN", ErrorCodes::SET_SIZE_LIMIT_EXCEEDED); } @@ -1049,10 +1050,10 @@ void Join::joinGet(Block & block, const String & column_name) const } -void Join::joinBlock(Block & block, const AnalyzedJoin & join_params) const +void Join::joinBlock(Block & block) { - const Names & key_names_left = join_params.keyNamesLeft(); - const NamesAndTypesList & columns_added_by_join = join_params.columnsAddedByJoin(); + const Names & key_names_left = join_options.keyNamesLeft(); + const NamesAndTypesList & columns_added_by_join = join_options.columnsAddedByJoin(); std::shared_lock lock(rwlock); diff --git a/dbms/src/Interpreters/Join.h b/dbms/src/Interpreters/Join.h index 6ae69155920..ec3fc7b045a 100644 --- a/dbms/src/Interpreters/Join.h +++ b/dbms/src/Interpreters/Join.h @@ -7,6 +7,7 @@ #include +#include #include #include #include @@ -120,30 +121,24 @@ using MappedAsof = WithFlags; * If it is true, we always generate Nullable column and substitute NULLs for non-joined rows, * as in standard SQL. 
*/ -class Join +class Join : public IJoin { public: - Join(const Names & key_names_right_, bool use_nulls_, const SizeLimits & limits_, - ASTTableJoin::Kind kind_, ASTTableJoin::Strictness strictness_, bool any_take_last_row_ = false); + Join(const AnalyzedJoin & join_options, const Block & right_sample_block, bool any_take_last_row_ = false); bool empty() { return type == Type::EMPTY; } bool isNullUsedAsDefault() const { return use_nulls; } - /** Set information about structure of right hand of JOIN (joined data). - * You must call this method before subsequent calls to insertFromBlock. - */ - void setSampleBlock(const Block & block); - /** Add block of data from right hand of JOIN to the map. * Returns false, if some limit was exceeded and you should not insert more data. */ - bool insertFromBlock(const Block & block); + bool addJoinedBlock(const Block & block) override; - /** Join data from the map (that was previously built by calls to insertFromBlock) to the block with data from "left" table. + /** Join data from the map (that was previously built by calls to addJoinedBlock) to the block with data from "left" table. * Could be called from different threads in parallel. */ - void joinBlock(Block & block, const AnalyzedJoin & join_params) const; + void joinBlock(Block & block) override; /// Infer the return type for joinGet function DataTypePtr joinGetReturnType(const String & column_name) const; @@ -153,10 +148,10 @@ public: /** Keep "totals" (separate part of dataset, see WITH TOTALS) to use later. */ - void setTotals(const Block & block) { totals = block; } - bool hasTotals() const { return totals; } + void setTotals(const Block & block) override { totals = block; } + bool hasTotals() const override { return totals; } - void joinTotals(Block & block) const; + void joinTotals(Block & block) const override; /** For RIGHT and FULL JOINs. * A stream that will contain default values from left table, joined with rows from right table, that was not joined before. @@ -167,7 +162,7 @@ public: UInt64 max_block_size) const; /// Number of keys in all built JOIN maps. - size_t getTotalRowCount() const; + size_t getTotalRowCount() const override; /// Sum size in bytes of all buffers, used for JOIN maps and for all memory pools. size_t getTotalByteCount() const; @@ -282,6 +277,7 @@ private: friend class NonJoinedBlockInputStream; friend class JoinBlockInputStream; + const AnalyzedJoin & join_options; ASTTableJoin::Kind kind; ASTTableJoin::Strictness strictness; @@ -323,9 +319,6 @@ private: Poco::Logger * log; - /// Limits for maximum map size. - SizeLimits limits; - Block totals; /** Protect state for concurrent use in insertFromBlock and joinBlock. @@ -337,6 +330,10 @@ private: void init(Type type_); + /** Set information about structure of right hand of JOIN (joined data). 
+ */ + void setSampleBlock(const Block & block); + /** Take an inserted block and discard everything that does not need to be stored * Example, remove the keys as they come from the LHS block, but do keep the ASOF timestamps */ @@ -359,7 +356,4 @@ private: void joinGetImpl(Block & block, const String & column_name, const Maps & maps) const; }; -using JoinPtr = std::shared_ptr; -using Joins = std::vector; - } diff --git a/dbms/src/Interpreters/MergeJoin.cpp b/dbms/src/Interpreters/MergeJoin.cpp new file mode 100644 index 00000000000..a2f6d98ef66 --- /dev/null +++ b/dbms/src/Interpreters/MergeJoin.cpp @@ -0,0 +1,40 @@ +#include +#include +#include +#include + +namespace DB +{ + +MergeJoin::MergeJoin(const AnalyzedJoin & table_join_, const Block & right_sample_block) + : table_join(table_join_) + , sample_block_with_columns_to_add(materializeBlock(right_sample_block)) +{ + for (auto & column : table_join.columnsAddedByJoin()) + sample_block_with_columns_to_add.getByName(column.name); +} + +void MergeJoin::joinBlocks(const Block & src_block, Block & dst_block, size_t & src_row) +{ + for (auto it = right_blocks.begin(); it != right_blocks.end();) + { + join(src_block, *it, dst_block, src_row); + if (src_row == src_block.rows()) + return; + + it = right_blocks.erase(it); + } +} + +void MergeJoin::join(const Block & left_block, const Block & /*right_block*/, Block & dst_block, size_t & src_row) +{ + for (auto & column : left_block) + dst_block.insert(column); + + for (const auto & column : sample_block_with_columns_to_add) + dst_block.insert(ColumnWithTypeAndName{column.column->cloneResized(src_row), column.type, column.name}); + + src_row = left_block.rows(); +} + +} diff --git a/dbms/src/Interpreters/MergeJoin.h b/dbms/src/Interpreters/MergeJoin.h new file mode 100644 index 00000000000..4c0fe5f46c1 --- /dev/null +++ b/dbms/src/Interpreters/MergeJoin.h @@ -0,0 +1,36 @@ +#pragma once + +#include + +#include +#include + + +namespace DB +{ + +class AnalyzedJoin; + +class MergeJoin : public IJoin +{ +public: + MergeJoin(const AnalyzedJoin & table_join_, const Block & right_sample_block); + + bool addJoinedBlock(const Block &) override { return false; } + void joinBlock(Block &) override {} + void joinTotals(Block &) const override {} + void setTotals(const Block &) override {} + + void joinBlocks(const Block & src_block, Block & dst_block, size_t & src_row); + size_t rightBlocksCount() const { return right_blocks.size(); } + void addRightBlock(const Block & block) { right_blocks.push_back(block); } + +private: + const AnalyzedJoin & table_join; + Block sample_block_with_columns_to_add; + BlocksList right_blocks; + + void join(const Block & left_block, const Block & right_block, Block & dst_block, size_t & src_row); +}; + +} diff --git a/dbms/src/Interpreters/SubqueryForSet.cpp b/dbms/src/Interpreters/SubqueryForSet.cpp index 6e0cd540db4..5a2a06cc411 100644 --- a/dbms/src/Interpreters/SubqueryForSet.cpp +++ b/dbms/src/Interpreters/SubqueryForSet.cpp @@ -1,5 +1,7 @@ #include #include +#include +#include #include namespace DB @@ -31,4 +33,26 @@ void SubqueryForSet::renameColumns(Block & block) } } +void SubqueryForSet::setJoinActions(ExpressionActionsPtr actions) +{ + actions->execute(sample_block); + joined_block_actions = actions; +} + +bool SubqueryForSet::insertJoinedBlock(Block & block) +{ + renameColumns(block); + + if (joined_block_actions) + joined_block_actions->execute(block); + + return join->addJoinedBlock(block); +} + +void SubqueryForSet::setTotals() +{ + if (join && source) + 
join->setTotals(source->getTotals()); +} + } diff --git a/dbms/src/Interpreters/SubqueryForSet.h b/dbms/src/Interpreters/SubqueryForSet.h index abba7a4ec2f..aa510faefbc 100644 --- a/dbms/src/Interpreters/SubqueryForSet.h +++ b/dbms/src/Interpreters/SubqueryForSet.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -8,9 +9,6 @@ namespace DB { -class Join; -using JoinPtr = std::shared_ptr; - class InterpreterSelectWithUnionQuery; @@ -25,6 +23,7 @@ struct SubqueryForSet JoinPtr join; /// Apply this actions to joined block. ExpressionActionsPtr joined_block_actions; + Block sample_block; /// source->getHeader() + column renames /// If set, put the result into the table. /// This is a temporary table for transferring to remote servers for distributed query processing. @@ -33,12 +32,15 @@ struct SubqueryForSet void makeSource(std::shared_ptr & interpreter, NamesWithAliases && joined_block_aliases_); - Block renamedSampleBlock() const { return sample_block; } - void renameColumns(Block & block); + void setJoinActions(ExpressionActionsPtr actions); + + bool insertJoinedBlock(Block & block); + void setTotals(); private: NamesWithAliases joined_block_aliases; /// Rename column from joined block from this list. - Block sample_block; /// source->getHeader() + column renames + + void renameColumns(Block & block); }; /// ID of subquery -> what to do with it. diff --git a/dbms/src/Interpreters/SyntaxAnalyzer.cpp b/dbms/src/Interpreters/SyntaxAnalyzer.cpp index dd0c37c50b5..81b22379e02 100644 --- a/dbms/src/Interpreters/SyntaxAnalyzer.cpp +++ b/dbms/src/Interpreters/SyntaxAnalyzer.cpp @@ -805,8 +805,7 @@ SyntaxAnalyzerResultPtr SyntaxAnalyzer::analyze( SyntaxAnalyzerResult result; result.storage = storage; result.source_columns = source_columns_; - result.analyzed_join = std::make_shared(); /// TODO: move to select_query logic - result.analyzed_join->join_use_nulls = settings.join_use_nulls; + result.analyzed_join = std::make_shared(settings); /// TODO: move to select_query logic collectSourceColumns(select_query, result.storage, result.source_columns); NameSet source_columns_set = removeDuplicateColumns(result.source_columns); diff --git a/dbms/src/Processors/Transforms/CreatingSetsTransform.cpp b/dbms/src/Processors/Transforms/CreatingSetsTransform.cpp index 71fe743fd49..b6791d83723 100644 --- a/dbms/src/Processors/Transforms/CreatingSetsTransform.cpp +++ b/dbms/src/Processors/Transforms/CreatingSetsTransform.cpp @@ -175,12 +175,7 @@ void CreatingSetsTransform::work() if (!done_with_join) { - subquery.renameColumns(block); - - if (subquery.joined_block_actions) - subquery.joined_block_actions->execute(block); - - if (!subquery.join->insertFromBlock(block)) + if (!subquery.insertJoinedBlock(block)) done_with_join = true; } diff --git a/dbms/src/Storages/StorageJoin.cpp b/dbms/src/Storages/StorageJoin.cpp index 54effdcd4fa..80a8d16c5f6 100644 --- a/dbms/src/Storages/StorageJoin.cpp +++ b/dbms/src/Storages/StorageJoin.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include /// toLower @@ -49,8 +50,8 @@ StorageJoin::StorageJoin( if (!getColumns().hasPhysical(key)) throw Exception{"Key column (" + key + ") does not exist in table declaration.", ErrorCodes::NO_SUCH_COLUMN_IN_TABLE}; - join = std::make_shared(key_names, use_nulls, limits, kind, strictness, overwrite); - join->setSampleBlock(getSampleBlock().sortColumns()); + table_join = std::make_shared(limits, use_nulls, kind, strictness, key_names); + join = std::make_shared(*table_join, getSampleBlock().sortColumns(), 
overwrite); restore(); } @@ -62,8 +63,7 @@ void StorageJoin::truncate(const ASTPtr &, const Context &, TableStructureWriteL Poco::File(path + "tmp/").createDirectories(); increment = 0; - join = std::make_shared(key_names, use_nulls, limits, kind, strictness); - join->setSampleBlock(getSampleBlock().sortColumns()); + join = std::make_shared(*table_join, getSampleBlock().sortColumns()); } @@ -75,7 +75,7 @@ void StorageJoin::assertCompatible(ASTTableJoin::Kind kind_, ASTTableJoin::Stric } -void StorageJoin::insertBlock(const Block & block) { join->insertFromBlock(block); } +void StorageJoin::insertBlock(const Block & block) { join->addJoinedBlock(block); } size_t StorageJoin::getSize() const { return join->getTotalRowCount(); } diff --git a/dbms/src/Storages/StorageJoin.h b/dbms/src/Storages/StorageJoin.h index bdc50b9d767..d770078ac39 100644 --- a/dbms/src/Storages/StorageJoin.h +++ b/dbms/src/Storages/StorageJoin.h @@ -9,8 +9,9 @@ namespace DB { +class AnalyzedJoin; class Join; -using JoinPtr = std::shared_ptr; +using HashJoinPtr = std::shared_ptr; /** Allows you save the state for later use on the right side of the JOIN. @@ -29,7 +30,7 @@ public: void truncate(const ASTPtr &, const Context &, TableStructureWriteLockHolder &) override; /// Access the innards. - JoinPtr & getJoin() { return join; } + HashJoinPtr & getJoin() { return join; } /// Verify that the data structure is suitable for implementing this type of JOIN. void assertCompatible(ASTTableJoin::Kind kind_, ASTTableJoin::Strictness strictness_) const; @@ -50,7 +51,8 @@ private: ASTTableJoin::Kind kind; /// LEFT | INNER ... ASTTableJoin::Strictness strictness; /// ANY | ALL - JoinPtr join; + std::shared_ptr table_join; + HashJoinPtr join; void insertBlock(const Block & block) override; size_t getSize() const override; From 02691f50ef6115e418eb95ced627d38c61d190b1 Mon Sep 17 00:00:00 2001 From: chertus Date: Tue, 10 Sep 2019 17:51:28 +0300 Subject: [PATCH 023/102] make things wrong --- dbms/src/Interpreters/IJoin.h | 2 +- dbms/src/Interpreters/MergeJoin.cpp | 44 ++++++++++++++++++++--------- dbms/src/Interpreters/MergeJoin.h | 16 ++++++----- 3 files changed, 41 insertions(+), 21 deletions(-) diff --git a/dbms/src/Interpreters/IJoin.h b/dbms/src/Interpreters/IJoin.h index db28845de93..92c23dd4288 100644 --- a/dbms/src/Interpreters/IJoin.h +++ b/dbms/src/Interpreters/IJoin.h @@ -24,7 +24,7 @@ public: virtual void setTotals(const Block & block) = 0; virtual void joinTotals(Block & block) const = 0; - virtual size_t getTotalRowCount() const { return 0; } + virtual size_t getTotalRowCount() const = 0; }; using JoinPtr = std::shared_ptr; diff --git a/dbms/src/Interpreters/MergeJoin.cpp b/dbms/src/Interpreters/MergeJoin.cpp index a2f6d98ef66..4791b639d3f 100644 --- a/dbms/src/Interpreters/MergeJoin.cpp +++ b/dbms/src/Interpreters/MergeJoin.cpp @@ -6,6 +6,12 @@ namespace DB { +namespace ErrorCodes +{ + extern const int SET_SIZE_LIMIT_EXCEEDED; +} + + MergeJoin::MergeJoin(const AnalyzedJoin & table_join_, const Block & right_sample_block) : table_join(table_join_) , sample_block_with_columns_to_add(materializeBlock(right_sample_block)) @@ -14,27 +20,39 @@ MergeJoin::MergeJoin(const AnalyzedJoin & table_join_, const Block & right_sampl sample_block_with_columns_to_add.getByName(column.name); } -void MergeJoin::joinBlocks(const Block & src_block, Block & dst_block, size_t & src_row) +/// TODO: sort +bool MergeJoin::addJoinedBlock(const Block & block) { - for (auto it = right_blocks.begin(); it != right_blocks.end();) - { - join(src_block, 
*it, dst_block, src_row); - if (src_row == src_block.rows()) - return; + std::unique_lock lock(rwlock); - it = right_blocks.erase(it); - } + right_blocks.push_back(block); + right_blocks_row_count += block.rows(); + right_blocks_bytes += block.bytes(); + + return table_join.sizeLimits().check(right_blocks_row_count, right_blocks_bytes, "JOIN", ErrorCodes::SET_SIZE_LIMIT_EXCEEDED); } -void MergeJoin::join(const Block & left_block, const Block & /*right_block*/, Block & dst_block, size_t & src_row) +void MergeJoin::joinBlock(Block & block) { - for (auto & column : left_block) - dst_block.insert(column); + addRightColumns(block); + + std::shared_lock lock(rwlock); + + for (auto it = right_blocks.begin(); it != right_blocks.end(); ++it) + mergeJoin(block, *it); +} + +void MergeJoin::addRightColumns(Block & block) +{ + size_t rows = block.rows(); for (const auto & column : sample_block_with_columns_to_add) - dst_block.insert(ColumnWithTypeAndName{column.column->cloneResized(src_row), column.type, column.name}); + block.insert(ColumnWithTypeAndName{column.column->cloneResized(rows), column.type, column.name}); +} - src_row = left_block.rows(); +void MergeJoin::mergeJoin(Block & /*block*/, const Block & /*right_block*/) +{ + /// TODO } } diff --git a/dbms/src/Interpreters/MergeJoin.h b/dbms/src/Interpreters/MergeJoin.h index 4c0fe5f46c1..1b5d45f3f5c 100644 --- a/dbms/src/Interpreters/MergeJoin.h +++ b/dbms/src/Interpreters/MergeJoin.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -16,21 +17,22 @@ class MergeJoin : public IJoin public: MergeJoin(const AnalyzedJoin & table_join_, const Block & right_sample_block); - bool addJoinedBlock(const Block &) override { return false; } - void joinBlock(Block &) override {} + bool addJoinedBlock(const Block & block) override; + void joinBlock(Block &) override; void joinTotals(Block &) const override {} void setTotals(const Block &) override {} - - void joinBlocks(const Block & src_block, Block & dst_block, size_t & src_row); - size_t rightBlocksCount() const { return right_blocks.size(); } - void addRightBlock(const Block & block) { right_blocks.push_back(block); } + size_t getTotalRowCount() const override { return right_blocks_row_count; } private: + mutable std::shared_mutex rwlock; const AnalyzedJoin & table_join; Block sample_block_with_columns_to_add; BlocksList right_blocks; + size_t right_blocks_row_count = 0; + size_t right_blocks_bytes = 0; - void join(const Block & left_block, const Block & right_block, Block & dst_block, size_t & src_row); + void addRightColumns(Block & block); + void mergeJoin(Block & block, const Block & right_block); }; } From 8afa48fa42b9eb1f5153a400549e6c76ab91b7e0 Mon Sep 17 00:00:00 2001 From: chertus Date: Tue, 10 Sep 2019 21:39:10 +0300 Subject: [PATCH 024/102] some joins code unification --- dbms/src/Interpreters/AnalyzedJoin.cpp | 16 ++++- dbms/src/Interpreters/AnalyzedJoin.h | 1 + dbms/src/Interpreters/IJoin.cpp | 56 ++++++++++++++++++ dbms/src/Interpreters/IJoin.h | 11 ++++ dbms/src/Interpreters/Join.cpp | 81 +++++--------------------- dbms/src/Interpreters/Join.h | 4 +- dbms/src/Interpreters/MergeJoin.cpp | 5 +- dbms/src/Interpreters/MergeJoin.h | 1 + 8 files changed, 103 insertions(+), 72 deletions(-) create mode 100644 dbms/src/Interpreters/IJoin.cpp diff --git a/dbms/src/Interpreters/AnalyzedJoin.cpp b/dbms/src/Interpreters/AnalyzedJoin.cpp index 02e75f3a342..2040044d07c 100644 --- a/dbms/src/Interpreters/AnalyzedJoin.cpp +++ b/dbms/src/Interpreters/AnalyzedJoin.cpp @@ -142,6 +142,20 @@ 
Names AnalyzedJoin::requiredJoinedNames() const
 return Names(required_columns_set.begin(), required_columns_set.end());
 }
 
+std::unordered_map<String, DataTypePtr> AnalyzedJoin::requiredRightKeys() const
+{
+ NameSet right_keys;
+ for (const auto & name : key_names_right)
+ right_keys.insert(name);
+
+ std::unordered_map<String, DataTypePtr> required;
+ for (const auto & column : columns_added_by_join)
+ if (right_keys.count(column.name))
+ required.insert({column.name, column.type});
+
+ return required;
+}
+
 NamesWithAliases AnalyzedJoin::getRequiredColumns(const Block & sample, const Names & action_required_columns) const
 {
 NameSet required_columns(action_required_columns.begin(), action_required_columns.end());
@@ -230,7 +244,7 @@ BlockInputStreamPtr AnalyzedJoin::createStreamWithNonJoinedDataIfFullOrRightJoin
 {
 if (isRightOrFull(table_join.kind))
 if (auto hash_join = typeid_cast<Join *>(join.get()))
- return hash_join->createStreamWithNonJoinedRows(source_header, *this, max_block_size);
+ return hash_join->createStreamWithNonJoinedRows(source_header, max_block_size);
 
 return {};
 }
diff --git a/dbms/src/Interpreters/AnalyzedJoin.h b/dbms/src/Interpreters/AnalyzedJoin.h
index 960457dddd9..1adcd39a7b2 100644
--- a/dbms/src/Interpreters/AnalyzedJoin.h
+++ b/dbms/src/Interpreters/AnalyzedJoin.h
@@ -92,6 +92,7 @@ public:
 void deduplicateAndQualifyColumnNames(const NameSet & left_table_columns, const String & right_table_prefix);
 
 size_t rightKeyInclusion(const String & name) const;
+ std::unordered_map<String, DataTypePtr> requiredRightKeys() const;
 
 void addJoinedColumn(const NameAndTypePair & joined_column);
 void addJoinedColumnsAndCorrectNullability(Block & sample_block) const;
diff --git a/dbms/src/Interpreters/IJoin.cpp b/dbms/src/Interpreters/IJoin.cpp
new file mode 100644
index 00000000000..ed5c9f1935e
--- /dev/null
+++ b/dbms/src/Interpreters/IJoin.cpp
@@ -0,0 +1,56 @@
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+
+ColumnRawPtrs extractKeysForJoin(const Names & key_names_right, const Block & right_sample_block,
+ Block & sample_block_with_keys, Block & sample_block_with_columns_to_add)
+{
+ size_t keys_size = key_names_right.size();
+ ColumnRawPtrs key_columns(keys_size);
+
+ sample_block_with_columns_to_add = materializeBlock(right_sample_block);
+
+ for (size_t i = 0; i < keys_size; ++i)
+ {
+ const String & column_name = key_names_right[i];
+
+ /// there could be the same key names
+ if (sample_block_with_keys.has(column_name))
+ {
+ key_columns[i] = sample_block_with_keys.getByName(column_name).column.get();
+ continue;
+ }
+
+ auto & col = sample_block_with_columns_to_add.getByName(column_name);
+ col.column = recursiveRemoveLowCardinality(col.column);
+ col.type = recursiveRemoveLowCardinality(col.type);
+
+ /// Extract right keys with correct keys order.
+ sample_block_with_keys.insert(col);
+ sample_block_with_columns_to_add.erase(column_name);
+
+ key_columns[i] = sample_block_with_keys.getColumns().back().get();
+
+ /// We will join only keys, where all components are not NULL.
+ if (auto * nullable = checkAndGetColumn(*key_columns[i])) + key_columns[i] = &nullable->getNestedColumn(); + } + + return key_columns; +} + +void createMissedColumns(Block & block) +{ + for (size_t i = 0; i < block.columns(); ++i) + { + auto & column = block.getByPosition(i); + if (!column.column) + column.column = column.type->createColumn(); + } +} + +} diff --git a/dbms/src/Interpreters/IJoin.h b/dbms/src/Interpreters/IJoin.h index 92c23dd4288..d7c6d28d551 100644 --- a/dbms/src/Interpreters/IJoin.h +++ b/dbms/src/Interpreters/IJoin.h @@ -1,11 +1,16 @@ #pragma once #include +#include + +#include namespace DB { class Block; +class IColumn; +using ColumnRawPtrs = std::vector; class IJoin { @@ -29,4 +34,10 @@ public: using JoinPtr = std::shared_ptr; +/// Common join functions + +ColumnRawPtrs extractKeysForJoin(const Names & key_names_right, const Block & right_sample_block, + Block & sample_block_with_keys, Block & sample_block_with_columns_to_add); +void createMissedColumns(Block & block); + } diff --git a/dbms/src/Interpreters/Join.cpp b/dbms/src/Interpreters/Join.cpp index cb4d68663ac..2efa1b613cb 100644 --- a/dbms/src/Interpreters/Join.cpp +++ b/dbms/src/Interpreters/Join.cpp @@ -35,19 +35,6 @@ namespace ErrorCodes extern const int ILLEGAL_COLUMN; } -static std::unordered_map requiredRightKeys(const Names & key_names, const NamesAndTypesList & columns_added_by_join) -{ - NameSet right_keys; - for (const auto & name : key_names) - right_keys.insert(name); - - std::unordered_map required; - for (const auto & column : columns_added_by_join) - if (right_keys.count(column.name)) - required.insert({column.name, column.type}); - - return required; -} static void convertColumnToNullable(ColumnWithTypeAndName & column) { @@ -276,36 +263,7 @@ void Join::setSampleBlock(const Block & block) if (!empty()) return; - size_t keys_size = key_names_right.size(); - ColumnRawPtrs key_columns(keys_size); - - sample_block_with_columns_to_add = materializeBlock(block); - - for (size_t i = 0; i < keys_size; ++i) - { - const String & column_name = key_names_right[i]; - - /// there could be the same key names - if (sample_block_with_keys.has(column_name)) - { - key_columns[i] = sample_block_with_keys.getByName(column_name).column.get(); - continue; - } - - auto & col = sample_block_with_columns_to_add.getByName(column_name); - col.column = recursiveRemoveLowCardinality(col.column); - col.type = recursiveRemoveLowCardinality(col.type); - - /// Extract right keys with correct keys order. - sample_block_with_keys.insert(col); - sample_block_with_columns_to_add.erase(column_name); - - key_columns[i] = sample_block_with_keys.getColumns().back().get(); - - /// We will join only keys, where all components are not NULL. 
- if (auto * nullable = checkAndGetColumn(*key_columns[i])) - key_columns[i] = &nullable->getNestedColumn(); - } + ColumnRawPtrs key_columns = extractKeysForJoin(key_names_right, block, sample_block_with_keys, sample_block_with_columns_to_add); if (strictness == ASTTableJoin::Strictness::Asof) { @@ -344,19 +302,15 @@ void Join::setSampleBlock(const Block & block) blocklist_sample = Block(block.getColumnsWithTypeAndName()); prepareBlockListStructure(blocklist_sample); - size_t num_columns_to_add = sample_block_with_columns_to_add.columns(); - - for (size_t i = 0; i < num_columns_to_add; ++i) - { - auto & column = sample_block_with_columns_to_add.getByPosition(i); - if (!column.column) - column.column = column.type->createColumn(); - } + createMissedColumns(sample_block_with_columns_to_add); /// In case of LEFT and FULL joins, if use_nulls, convert joined columns to Nullable. if (use_nulls && isLeftOrFull(kind)) + { + size_t num_columns_to_add = sample_block_with_columns_to_add.columns(); for (size_t i = 0; i < num_columns_to_add; ++i) convertColumnToNullable(sample_block_with_columns_to_add.getByPosition(i)); + } } namespace @@ -784,7 +738,6 @@ template ; @@ -1025,7 +978,7 @@ template void Join::joinGetImpl(Block & block, const String & column_name, const Maps & maps_) const { joinBlockImpl( - block, {block.getByPosition(0).name}, {}, {sample_block_with_columns_to_add.getByName(column_name)}, maps_); + block, {block.getByPosition(0).name}, {sample_block_with_columns_to_add.getByName(column_name)}, maps_); } @@ -1053,7 +1006,6 @@ void Join::joinGet(Block & block, const String & column_name) const void Join::joinBlock(Block & block) { const Names & key_names_left = join_options.keyNamesLeft(); - const NamesAndTypesList & columns_added_by_join = join_options.columnsAddedByJoin(); std::shared_lock lock(rwlock); @@ -1061,7 +1013,7 @@ void Join::joinBlock(Block & block) if (joinDispatch(kind, strictness, maps, [&](auto kind_, auto strictness_, auto & map) { - joinBlockImpl(block, key_names_left, columns_added_by_join, sample_block_with_columns_to_add, map); + joinBlockImpl(block, key_names_left, sample_block_with_columns_to_add, map); })) { /// Joined @@ -1158,11 +1110,13 @@ struct AdderNonJoined class NonJoinedBlockInputStream : public IBlockInputStream { public: - NonJoinedBlockInputStream(const Join & parent_, const Block & left_sample_block, const Names & key_names_left, - const NamesAndTypesList & columns_added_by_join, UInt64 max_block_size_) + NonJoinedBlockInputStream(const Join & parent_, const Block & left_sample_block, UInt64 max_block_size_) : parent(parent_) , max_block_size(max_block_size_) { + const Names & key_names_left = parent_.join_options.keyNamesLeft(); + std::unordered_map required_right_keys = parent_.join_options.requiredRightKeys(); + /** left_sample_block contains keys and "left" columns. * result_sample_block - keys, "left" columns, and "right" columns. 
*/ @@ -1181,7 +1135,7 @@ public: const Block & right_sample_block = parent.sample_block_with_columns_to_add; std::unordered_map left_to_right_key_map; - makeResultSampleBlock(left_sample_block, right_sample_block, columns_added_by_join, + makeResultSampleBlock(left_sample_block, right_sample_block, required_right_keys, key_positions_left, left_to_right_key_map); auto nullability_changes = getNullabilityChanges(parent.sample_block_with_keys, result_sample_block, @@ -1250,7 +1204,7 @@ private: void makeResultSampleBlock(const Block & left_sample_block, const Block & right_sample_block, - const NamesAndTypesList & columns_added_by_join, + const std::unordered_map & right_keys, const std::vector & key_positions_left, std::unordered_map & left_to_right_key_map) { @@ -1270,7 +1224,6 @@ private: } const auto & key_names_right = parent.key_names_right; - auto right_keys = requiredRightKeys(key_names_right, columns_added_by_join); /// Add join key columns from right block if they has different name. for (size_t i = 0; i < key_names_right.size(); ++i) @@ -1462,11 +1415,9 @@ private: }; -BlockInputStreamPtr Join::createStreamWithNonJoinedRows(const Block & left_sample_block, const AnalyzedJoin & join_params, - UInt64 max_block_size) const +BlockInputStreamPtr Join::createStreamWithNonJoinedRows(const Block & left_sample_block, UInt64 max_block_size) const { - return std::make_shared(*this, left_sample_block, - join_params.keyNamesLeft(), join_params.columnsAddedByJoin(), max_block_size); + return std::make_shared(*this, left_sample_block, max_block_size); } diff --git a/dbms/src/Interpreters/Join.h b/dbms/src/Interpreters/Join.h index ec3fc7b045a..108d7312e31 100644 --- a/dbms/src/Interpreters/Join.h +++ b/dbms/src/Interpreters/Join.h @@ -158,8 +158,7 @@ public: * Use only after all calls to joinBlock was done. * left_sample_block is passed without account of 'use_nulls' setting (columns will be converted to Nullable inside). */ - BlockInputStreamPtr createStreamWithNonJoinedRows(const Block & left_sample_block, const AnalyzedJoin & join_params, - UInt64 max_block_size) const; + BlockInputStreamPtr createStreamWithNonJoinedRows(const Block & left_sample_block, UInt64 max_block_size) const; /// Number of keys in all built JOIN maps. 
size_t getTotalRowCount() const override; @@ -346,7 +345,6 @@ private: void joinBlockImpl( Block & block, const Names & key_names_left, - const NamesAndTypesList & columns_added_by_join, const Block & block_with_columns_to_add, const Maps & maps) const; diff --git a/dbms/src/Interpreters/MergeJoin.cpp b/dbms/src/Interpreters/MergeJoin.cpp index 4791b639d3f..7cb2c0a66dd 100644 --- a/dbms/src/Interpreters/MergeJoin.cpp +++ b/dbms/src/Interpreters/MergeJoin.cpp @@ -14,10 +14,9 @@ namespace ErrorCodes MergeJoin::MergeJoin(const AnalyzedJoin & table_join_, const Block & right_sample_block) : table_join(table_join_) - , sample_block_with_columns_to_add(materializeBlock(right_sample_block)) { - for (auto & column : table_join.columnsAddedByJoin()) - sample_block_with_columns_to_add.getByName(column.name); + extractKeysForJoin(table_join.keyNamesRight(), right_sample_block, sample_block_with_keys, sample_block_with_columns_to_add); + createMissedColumns(sample_block_with_columns_to_add); } /// TODO: sort diff --git a/dbms/src/Interpreters/MergeJoin.h b/dbms/src/Interpreters/MergeJoin.h index 1b5d45f3f5c..6f0f2b68c99 100644 --- a/dbms/src/Interpreters/MergeJoin.h +++ b/dbms/src/Interpreters/MergeJoin.h @@ -26,6 +26,7 @@ public: private: mutable std::shared_mutex rwlock; const AnalyzedJoin & table_join; + Block sample_block_with_keys; Block sample_block_with_columns_to_add; BlocksList right_blocks; size_t right_blocks_row_count = 0; From 73dafaa22347933f96b8c33db932f924b75bc67e Mon Sep 17 00:00:00 2001 From: chertus Date: Wed, 11 Sep 2019 18:57:09 +0300 Subject: [PATCH 025/102] better required-right-keys logic --- dbms/src/Interpreters/AnalyzedJoin.cpp | 14 +++--- dbms/src/Interpreters/AnalyzedJoin.h | 2 +- dbms/src/Interpreters/Join.cpp | 61 +++++++++++--------------- dbms/src/Interpreters/Join.h | 8 ++-- dbms/src/Storages/StorageJoin.cpp | 4 +- 5 files changed, 39 insertions(+), 50 deletions(-) diff --git a/dbms/src/Interpreters/AnalyzedJoin.cpp b/dbms/src/Interpreters/AnalyzedJoin.cpp index 2040044d07c..37c8ee82656 100644 --- a/dbms/src/Interpreters/AnalyzedJoin.cpp +++ b/dbms/src/Interpreters/AnalyzedJoin.cpp @@ -142,17 +142,13 @@ Names AnalyzedJoin::requiredJoinedNames() const return Names(required_columns_set.begin(), required_columns_set.end()); } -std::unordered_map AnalyzedJoin::requiredRightKeys() const +NameSet AnalyzedJoin::requiredRightKeys() const { - NameSet right_keys; + NameSet required; for (const auto & name : key_names_right) - right_keys.insert(name); - - std::unordered_map required; - for (const auto & column : columns_added_by_join) - if (right_keys.count(column.name)) - required.insert({column.name, column.type}); - + for (const auto & column : columns_added_by_join) + if (name == column.name) + required.insert(name); return required; } diff --git a/dbms/src/Interpreters/AnalyzedJoin.h b/dbms/src/Interpreters/AnalyzedJoin.h index 1adcd39a7b2..cc6a9c6aa08 100644 --- a/dbms/src/Interpreters/AnalyzedJoin.h +++ b/dbms/src/Interpreters/AnalyzedJoin.h @@ -92,7 +92,7 @@ public: void deduplicateAndQualifyColumnNames(const NameSet & left_table_columns, const String & right_table_prefix); size_t rightKeyInclusion(const String & name) const; - std::unordered_map requiredRightKeys() const; + NameSet requiredRightKeys() const; void addJoinedColumn(const NameAndTypePair & joined_column); void addJoinedColumnsAndCorrectNullability(Block & sample_block) const; diff --git a/dbms/src/Interpreters/Join.cpp b/dbms/src/Interpreters/Join.cpp index 2efa1b613cb..a0607837e12 100644 --- 
a/dbms/src/Interpreters/Join.cpp +++ b/dbms/src/Interpreters/Join.cpp @@ -75,6 +75,7 @@ Join::Join(const AnalyzedJoin & join_options_, const Block & right_sample_block, , kind(join_options_.kind()) , strictness(join_options_.strictness()) , key_names_right(join_options_.keyNamesRight()) + , required_right_keys(join_options_.requiredRightKeys()) , use_nulls(join_options_.joinUseNulls()) , any_take_last_row(any_take_last_row_) , log(&Logger::get("Join")) @@ -263,7 +264,7 @@ void Join::setSampleBlock(const Block & block) if (!empty()) return; - ColumnRawPtrs key_columns = extractKeysForJoin(key_names_right, block, sample_block_with_keys, sample_block_with_columns_to_add); + ColumnRawPtrs key_columns = extractKeysForJoin(key_names_right, block, right_table_keys, sample_block_with_columns_to_add); if (strictness == ASTTableJoin::Strictness::Asof) { @@ -783,7 +784,7 @@ void Join::joinBlockImpl( */ ColumnsWithTypeAndName extras; if constexpr (STRICTNESS == ASTTableJoin::Strictness::Asof) - extras.push_back(sample_block_with_keys.getByName(key_names_right.back())); + extras.push_back(right_table_keys.getByName(key_names_right.back())); AddedColumns added(sample_block_with_columns_to_add, block_with_columns_to_add, block, blocklist_sample, extras); std::unique_ptr offsets_to_replicate; @@ -795,8 +796,6 @@ void Join::joinBlockImpl( block.insert(added.moveColumn(i)); /// Filter & insert missing rows - auto right_keys = join_options.requiredRightKeys(); - constexpr bool is_all_join = STRICTNESS == ASTTableJoin::Strictness::All; constexpr bool inner_or_right = static_in_v; constexpr bool left_or_full = static_in_v; @@ -810,17 +809,16 @@ void Join::joinBlockImpl( block.safeGetByPosition(i).column = block.safeGetByPosition(i).column->filter(row_filter, -1); /// Add join key columns from right block if they has different name. - for (size_t i = 0; i < key_names_right.size(); ++i) + for (size_t i = 0; i < right_table_keys.columns(); ++i) { - auto & right_name = key_names_right[i]; + const auto & right_key = right_table_keys.getByPosition(i); auto & left_name = key_names_left[i]; - auto it = right_keys.find(right_name); - if (it != right_keys.end() && !block.has(right_name)) + if (required_right_keys.count(right_key.name) && !block.has(right_key.name)) { const auto & col = block.getByName(left_name); - bool is_nullable = it->second->isNullable(); - block.insert(correctNullability({col.column, col.type, right_name}, is_nullable)); + bool is_nullable = (use_nulls && left_or_full) || right_key.type->isNullable(); + block.insert(correctNullability({col.column, col.type, right_key.name}, is_nullable)); } } } @@ -833,13 +831,12 @@ void Join::joinBlockImpl( const IColumn::Filter & filter = null_map_filter.getData(); /// Add join key columns from right block if they has different name. 
- for (size_t i = 0; i < key_names_right.size(); ++i) + for (size_t i = 0; i < right_table_keys.columns(); ++i) { - auto & right_name = key_names_right[i]; + const auto & right_key = right_table_keys.getByPosition(i); auto & left_name = key_names_left[i]; - auto it = right_keys.find(right_name); - if (it != right_keys.end() && !block.has(right_name)) + if (required_right_keys.count(right_key.name) && !block.has(right_key.name)) { const auto & col = block.getByName(left_name); ColumnPtr column = col.column->convertToFullColumnIfConst(); @@ -854,11 +851,11 @@ void Join::joinBlockImpl( mut_column->insertDefault(); } - bool is_nullable = (use_nulls && left_or_full) || it->second->isNullable(); - block.insert(correctNullability({std::move(mut_column), col.type, right_name}, is_nullable, null_map_filter)); + bool is_nullable = (use_nulls && left_or_full) || right_key.type->isNullable(); + block.insert(correctNullability({std::move(mut_column), col.type, right_key.name}, is_nullable, null_map_filter)); if constexpr (is_all_join) - right_keys_to_replicate.push_back(block.getPositionByName(right_name)); + right_keys_to_replicate.push_back(block.getPositionByName(right_key.name)); } } } @@ -992,7 +989,7 @@ void Join::joinGet(Block & block, const String & column_name) const if (key_names_right.size() != 1) throw Exception("joinGet only supports StorageJoin containing exactly one key", ErrorCodes::LOGICAL_ERROR); - checkTypeOfKey(block, sample_block_with_keys); + checkTypeOfKey(block, right_table_keys); if (kind == ASTTableJoin::Kind::Left && strictness == ASTTableJoin::Strictness::Any) { @@ -1009,7 +1006,7 @@ void Join::joinBlock(Block & block) std::shared_lock lock(rwlock); - checkTypesOfKeys(block, key_names_left, sample_block_with_keys); + checkTypesOfKeys(block, key_names_left, right_table_keys); if (joinDispatch(kind, strictness, maps, [&](auto kind_, auto strictness_, auto & map) { @@ -1115,7 +1112,6 @@ public: , max_block_size(max_block_size_) { const Names & key_names_left = parent_.join_options.keyNamesLeft(); - std::unordered_map required_right_keys = parent_.join_options.requiredRightKeys(); /** left_sample_block contains keys and "left" columns. * result_sample_block - keys, "left" columns, and "right" columns. @@ -1135,10 +1131,9 @@ public: const Block & right_sample_block = parent.sample_block_with_columns_to_add; std::unordered_map left_to_right_key_map; - makeResultSampleBlock(left_sample_block, right_sample_block, required_right_keys, - key_positions_left, left_to_right_key_map); + makeResultSampleBlock(left_sample_block, right_sample_block, key_positions_left, left_to_right_key_map); - auto nullability_changes = getNullabilityChanges(parent.sample_block_with_keys, result_sample_block, + auto nullability_changes = getNullabilityChanges(parent.right_table_keys, result_sample_block, key_positions_left, left_to_right_key_map); column_indices_left.reserve(left_sample_block.columns() - key_names_left.size()); @@ -1204,7 +1199,6 @@ private: void makeResultSampleBlock(const Block & left_sample_block, const Block & right_sample_block, - const std::unordered_map & right_keys, const std::vector & key_positions_left, std::unordered_map & left_to_right_key_map) { @@ -1223,22 +1217,19 @@ private: result_sample_block.insert(src_column.cloneEmpty()); } - const auto & key_names_right = parent.key_names_right; - /// Add join key columns from right block if they has different name. 
- for (size_t i = 0; i < key_names_right.size(); ++i) + for (size_t i = 0; i < parent.right_table_keys.columns(); ++i) { - auto & right_name = key_names_right[i]; + const auto & right_key = parent.right_table_keys.getByPosition(i); size_t left_key_pos = key_positions_left[i]; - auto it = right_keys.find(right_name); - if (it != right_keys.end() && !result_sample_block.has(right_name)) + if (parent.required_right_keys.count(right_key.name) && !result_sample_block.has(right_key.name)) { const auto & col = result_sample_block.getByPosition(left_key_pos); - bool is_nullable = (parent.use_nulls && isFull(parent.kind)) || it->second->isNullable(); - result_sample_block.insert(correctNullability({col.column, col.type, right_name}, is_nullable)); + bool is_nullable = (parent.use_nulls && isFull(parent.kind)) || right_key.type->isNullable(); + result_sample_block.insert(correctNullability({col.column, col.type, right_key.name}, is_nullable)); - size_t right_key_pos = result_sample_block.getPositionByName(right_name); + size_t right_key_pos = result_sample_block.getPositionByName(right_key.name); left_to_right_key_map[left_key_pos] = right_key_pos; } } @@ -1372,7 +1363,7 @@ private: } } - static std::unordered_set getNullabilityChanges(const Block & sample_block_with_keys, const Block & out_block, + static std::unordered_set getNullabilityChanges(const Block & right_table_keys, const Block & out_block, const std::vector & key_positions, const std::unordered_map & left_to_right_key_map) { @@ -1387,7 +1378,7 @@ private: key_pos = it->second; const auto & dst = out_block.getByPosition(key_pos).column; - const auto & src = sample_block_with_keys.getByPosition(i).column; + const auto & src = right_table_keys.getByPosition(i).column; if (dst->isNullable() != src->isNullable()) nullability_changes.insert(key_pos); } diff --git a/dbms/src/Interpreters/Join.h b/dbms/src/Interpreters/Join.h index 108d7312e31..1ed446034d4 100644 --- a/dbms/src/Interpreters/Join.h +++ b/dbms/src/Interpreters/Join.h @@ -280,8 +280,10 @@ private: ASTTableJoin::Kind kind; ASTTableJoin::Strictness strictness; - /// Names of key columns (columns for equi-JOIN) in "right" table (in the order they appear in USING clause). + /// Names of key columns in right-side table (in the order they appear in ON/USING clause). @note It could contain duplicates. const Names key_names_right; + /// Names right-side table keys that are needed in result (would be attached after joined columns). + const NameSet required_right_keys; /// Substitute NULLs for non-JOINed rows. bool use_nulls; @@ -310,8 +312,8 @@ private: /// Block with columns from the right-side table except key columns. Block sample_block_with_columns_to_add; - /// Block with key columns in the same order they appear in the right-side table. - Block sample_block_with_keys; + /// Block with key columns in the same order they appear in the right-side table (duplicates appear once). 
+ Block right_table_keys; /// Block as it would appear in the BlockList Block blocklist_sample; diff --git a/dbms/src/Storages/StorageJoin.cpp b/dbms/src/Storages/StorageJoin.cpp index 80a8d16c5f6..901dd7700d3 100644 --- a/dbms/src/Storages/StorageJoin.cpp +++ b/dbms/src/Storages/StorageJoin.cpp @@ -209,10 +209,10 @@ public: for (size_t i = 0; i < sample_block.columns(); ++i) { auto & [_, type, name] = sample_block.getByPosition(i); - if (parent.sample_block_with_keys.has(name)) + if (parent.right_table_keys.has(name)) { key_pos = i; - column_with_null[i] = parent.sample_block_with_keys.getByName(name).type->isNullable(); + column_with_null[i] = parent.right_table_keys.getByName(name).type->isNullable(); } else { From a836f0cfd647280057ef90a79ed1620c7d1e5329 Mon Sep 17 00:00:00 2001 From: chertus Date: Wed, 11 Sep 2019 19:19:33 +0300 Subject: [PATCH 026/102] fix columns number in MergeJoin --- dbms/src/Interpreters/MergeJoin.cpp | 7 ++++++- dbms/src/Interpreters/MergeJoin.h | 3 ++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/dbms/src/Interpreters/MergeJoin.cpp b/dbms/src/Interpreters/MergeJoin.cpp index 7cb2c0a66dd..16abc0ee94c 100644 --- a/dbms/src/Interpreters/MergeJoin.cpp +++ b/dbms/src/Interpreters/MergeJoin.cpp @@ -14,8 +14,9 @@ namespace ErrorCodes MergeJoin::MergeJoin(const AnalyzedJoin & table_join_, const Block & right_sample_block) : table_join(table_join_) + , required_right_keys(table_join.requiredRightKeys()) { - extractKeysForJoin(table_join.keyNamesRight(), right_sample_block, sample_block_with_keys, sample_block_with_columns_to_add); + extractKeysForJoin(table_join.keyNamesRight(), right_sample_block, right_table_keys, sample_block_with_columns_to_add); createMissedColumns(sample_block_with_columns_to_add); } @@ -47,6 +48,10 @@ void MergeJoin::addRightColumns(Block & block) for (const auto & column : sample_block_with_columns_to_add) block.insert(ColumnWithTypeAndName{column.column->cloneResized(rows), column.type, column.name}); + + for (const auto & column : right_table_keys) + if (required_right_keys.count(column.name)) + block.insert(ColumnWithTypeAndName{column.column->cloneResized(rows), column.type, column.name}); } void MergeJoin::mergeJoin(Block & /*block*/, const Block & /*right_block*/) diff --git a/dbms/src/Interpreters/MergeJoin.h b/dbms/src/Interpreters/MergeJoin.h index 6f0f2b68c99..b03fe4c0687 100644 --- a/dbms/src/Interpreters/MergeJoin.h +++ b/dbms/src/Interpreters/MergeJoin.h @@ -26,7 +26,8 @@ public: private: mutable std::shared_mutex rwlock; const AnalyzedJoin & table_join; - Block sample_block_with_keys; + const NameSet required_right_keys; + Block right_table_keys; Block sample_block_with_columns_to_add; BlocksList right_blocks; size_t right_blocks_row_count = 0; From fc7ce2753d3ab10c691774ee1d018d8238a35b1c Mon Sep 17 00:00:00 2001 From: chertus Date: Wed, 11 Sep 2019 21:03:21 +0300 Subject: [PATCH 027/102] extract more common join functions --- dbms/src/Interpreters/IJoin.cpp | 46 ++++++++++++++++++- dbms/src/Interpreters/IJoin.h | 15 ++++++- dbms/src/Interpreters/Join.cpp | 69 ++++++----------------------- dbms/src/Interpreters/Join.h | 3 -- dbms/src/Interpreters/MergeJoin.cpp | 10 +++-- 5 files changed, 78 insertions(+), 65 deletions(-) diff --git a/dbms/src/Interpreters/IJoin.cpp b/dbms/src/Interpreters/IJoin.cpp index ed5c9f1935e..46497a8ed30 100644 --- a/dbms/src/Interpreters/IJoin.cpp +++ b/dbms/src/Interpreters/IJoin.cpp @@ -1,11 +1,37 @@ #include #include -#include +#include #include +#include namespace DB { +namespace 
ErrorCodes +{ + extern const int TYPE_MISMATCH; +} + + +namespace JoinCommon +{ + +void convertColumnToNullable(ColumnWithTypeAndName & column) +{ + if (column.type->isNullable() || !column.type->canBeInsideNullable()) + return; + + column.type = makeNullable(column.type); + if (column.column) + column.column = makeNullable(column.column); +} + +void convertColumnsToNullable(Block & block, size_t starting_pos) +{ + for (size_t i = starting_pos; i < block.columns(); ++i) + convertColumnToNullable(block.getByPosition(i)); +} + ColumnRawPtrs extractKeysForJoin(const Names & key_names_right, const Block & right_sample_block, Block & sample_block_with_keys, Block & sample_block_with_columns_to_add) { @@ -43,6 +69,23 @@ ColumnRawPtrs extractKeysForJoin(const Names & key_names_right, const Block & ri return key_columns; } +void checkTypesOfKeys(const Block & block_left, const Names & key_names_left, const Block & block_right, const Names & key_names_right) +{ + size_t keys_size = key_names_left.size(); + + for (size_t i = 0; i < keys_size; ++i) + { + DataTypePtr left_type = removeNullable(recursiveRemoveLowCardinality(block_left.getByName(key_names_left[i]).type)); + DataTypePtr right_type = removeNullable(recursiveRemoveLowCardinality(block_right.getByName(key_names_right[i]).type)); + + if (!left_type->equals(*right_type)) + throw Exception("Type mismatch of columns to JOIN by: " + + key_names_left[i] + " " + left_type->getName() + " at left, " + + key_names_right[i] + " " + right_type->getName() + " at right", + ErrorCodes::TYPE_MISMATCH); + } +} + void createMissedColumns(Block & block) { for (size_t i = 0; i < block.columns(); ++i) @@ -54,3 +97,4 @@ void createMissedColumns(Block & block) } } +} diff --git a/dbms/src/Interpreters/IJoin.h b/dbms/src/Interpreters/IJoin.h index d7c6d28d551..42eada1c43e 100644 --- a/dbms/src/Interpreters/IJoin.h +++ b/dbms/src/Interpreters/IJoin.h @@ -8,6 +8,7 @@ namespace DB { +struct ColumnWithTypeAndName; class Block; class IColumn; using ColumnRawPtrs = std::vector; @@ -34,10 +35,22 @@ public: using JoinPtr = std::shared_ptr; -/// Common join functions +namespace JoinCommon +{ + +void convertColumnToNullable(ColumnWithTypeAndName & column); +void convertColumnsToNullable(Block & block, size_t starting_pos = 0); + +/// Split key and other columns by keys name list ColumnRawPtrs extractKeysForJoin(const Names & key_names_right, const Block & right_sample_block, Block & sample_block_with_keys, Block & sample_block_with_columns_to_add); + +/// Throw an exception if blocks have different types of key columns. Compare up to Nullability. +void checkTypesOfKeys(const Block & block_left, const Names & key_names_left, const Block & block_right, const Names & key_names_right); + void createMissedColumns(Block & block); } + +} diff --git a/dbms/src/Interpreters/Join.cpp b/dbms/src/Interpreters/Join.cpp index a0607837e12..6b3d9351740 100644 --- a/dbms/src/Interpreters/Join.cpp +++ b/dbms/src/Interpreters/Join.cpp @@ -36,21 +36,11 @@ namespace ErrorCodes } -static void convertColumnToNullable(ColumnWithTypeAndName & column) -{ - if (column.type->isNullable() || !column.type->canBeInsideNullable()) - return; - - column.type = makeNullable(column.type); - if (column.column) - column.column = makeNullable(column.column); -} - /// Converts column to nullable if needed. No backward convertion. 
static ColumnWithTypeAndName correctNullability(ColumnWithTypeAndName && column, bool nullable) { if (nullable) - convertColumnToNullable(column); + JoinCommon::convertColumnToNullable(column); return std::move(column); } @@ -58,7 +48,7 @@ static ColumnWithTypeAndName correctNullability(ColumnWithTypeAndName && column, { if (nullable) { - convertColumnToNullable(column); + JoinCommon::convertColumnToNullable(column); if (column.type->isNullable() && negative_null_map.size()) { MutableColumnPtr mutable_column = (*std::move(column.column)).mutate(); @@ -264,7 +254,7 @@ void Join::setSampleBlock(const Block & block) if (!empty()) return; - ColumnRawPtrs key_columns = extractKeysForJoin(key_names_right, block, right_table_keys, sample_block_with_columns_to_add); + ColumnRawPtrs key_columns = JoinCommon::extractKeysForJoin(key_names_right, block, right_table_keys, sample_block_with_columns_to_add); if (strictness == ASTTableJoin::Strictness::Asof) { @@ -303,15 +293,11 @@ void Join::setSampleBlock(const Block & block) blocklist_sample = Block(block.getColumnsWithTypeAndName()); prepareBlockListStructure(blocklist_sample); - createMissedColumns(sample_block_with_columns_to_add); + JoinCommon::createMissedColumns(sample_block_with_columns_to_add); /// In case of LEFT and FULL joins, if use_nulls, convert joined columns to Nullable. if (use_nulls && isLeftOrFull(kind)) - { - size_t num_columns_to_add = sample_block_with_columns_to_add.columns(); - for (size_t i = 0; i < num_columns_to_add; ++i) - convertColumnToNullable(sample_block_with_columns_to_add.getByPosition(i)); - } + JoinCommon::convertColumnsToNullable(sample_block_with_columns_to_add); } namespace @@ -500,12 +486,7 @@ bool Join::addJoinedBlock(const Block & block) /// In case of LEFT and FULL joins, if use_nulls, convert joined columns to Nullable. if (use_nulls && isLeftOrFull(kind)) - { - for (size_t i = isFull(kind) ? keys_size : 0; i < size; ++i) - { - convertColumnToNullable(stored_block->getByPosition(i)); - } - } + JoinCommon::convertColumnsToNullable(*stored_block, (isFull(kind) ? keys_size : 0)); if (kind != ASTTableJoin::Kind::Cross) { @@ -769,12 +750,11 @@ void Join::joinBlockImpl( constexpr bool right_or_full = static_in_v; if constexpr (right_or_full) { - for (size_t i = 0; i < existing_columns; ++i) - { + for (size_t i = 0; i < block.columns(); ++i) block.getByPosition(i).column = block.getByPosition(i).column->convertToFullColumnIfConst(); - if (use_nulls) - convertColumnToNullable(block.getByPosition(i)); - } + + if (use_nulls) + JoinCommon::convertColumnsToNullable(block); } /** For LEFT/INNER JOIN, the saved blocks do not contain keys. @@ -925,27 +905,6 @@ void Join::joinBlockImplCross(Block & block) const block = block.cloneWithColumns(std::move(dst_columns)); } - -void Join::checkTypesOfKeys(const Block & block_left, const Names & key_names_left, const Block & block_right) const -{ - size_t keys_size = key_names_left.size(); - - for (size_t i = 0; i < keys_size; ++i) - { - /// Compare up to Nullability. 
- - DataTypePtr left_type = removeNullable(recursiveRemoveLowCardinality(block_left.getByName(key_names_left[i]).type)); - DataTypePtr right_type = removeNullable(recursiveRemoveLowCardinality(block_right.getByName(key_names_right[i]).type)); - - if (!left_type->equals(*right_type)) - throw Exception("Type mismatch of columns to JOIN by: " - + key_names_left[i] + " " + left_type->getName() + " at left, " - + key_names_right[i] + " " + right_type->getName() + " at right", - ErrorCodes::TYPE_MISMATCH); - } -} - - static void checkTypeOfKey(const Block & block_left, const Block & block_right) { auto & [c1, left_type_origin, left_name] = block_left.safeGetByPosition(0); @@ -1002,11 +961,10 @@ void Join::joinGet(Block & block, const String & column_name) const void Join::joinBlock(Block & block) { - const Names & key_names_left = join_options.keyNamesLeft(); - std::shared_lock lock(rwlock); - checkTypesOfKeys(block, key_names_left, right_table_keys); + const Names & key_names_left = join_options.keyNamesLeft(); + JoinCommon::checkTypesOfKeys(block, key_names_left, right_table_keys, key_names_right); if (joinDispatch(kind, strictness, maps, [&](auto kind_, auto strictness_, auto & map) { @@ -1206,8 +1164,7 @@ private: /// Convert left columns to Nullable if allowed if (parent.use_nulls) - for (size_t i = 0; i < result_sample_block.columns(); ++i) - convertColumnToNullable(result_sample_block.getByPosition(i)); + JoinCommon::convertColumnsToNullable(result_sample_block); /// Add columns from the right-side table to the block. for (size_t i = 0; i < right_sample_block.columns(); ++i) diff --git a/dbms/src/Interpreters/Join.h b/dbms/src/Interpreters/Join.h index 1ed446034d4..fe84dac485f 100644 --- a/dbms/src/Interpreters/Join.h +++ b/dbms/src/Interpreters/Join.h @@ -340,9 +340,6 @@ private: */ void prepareBlockListStructure(Block & stored_block); - /// Throw an exception if blocks have different types of key columns. 
- void checkTypesOfKeys(const Block & block_left, const Names & key_names_left, const Block & block_right) const; - template void joinBlockImpl( Block & block, diff --git a/dbms/src/Interpreters/MergeJoin.cpp b/dbms/src/Interpreters/MergeJoin.cpp index 16abc0ee94c..2d4f582f50e 100644 --- a/dbms/src/Interpreters/MergeJoin.cpp +++ b/dbms/src/Interpreters/MergeJoin.cpp @@ -16,8 +16,8 @@ MergeJoin::MergeJoin(const AnalyzedJoin & table_join_, const Block & right_sampl : table_join(table_join_) , required_right_keys(table_join.requiredRightKeys()) { - extractKeysForJoin(table_join.keyNamesRight(), right_sample_block, right_table_keys, sample_block_with_columns_to_add); - createMissedColumns(sample_block_with_columns_to_add); + JoinCommon::extractKeysForJoin(table_join.keyNamesRight(), right_sample_block, right_table_keys, sample_block_with_columns_to_add); + JoinCommon::createMissedColumns(sample_block_with_columns_to_add); } /// TODO: sort @@ -34,10 +34,12 @@ bool MergeJoin::addJoinedBlock(const Block & block) void MergeJoin::joinBlock(Block & block) { - addRightColumns(block); - std::shared_lock lock(rwlock); + JoinCommon::checkTypesOfKeys(block, table_join.keyNamesLeft(), right_table_keys, table_join.keyNamesRight()); + + addRightColumns(block); + for (auto it = right_blocks.begin(); it != right_blocks.end(); ++it) mergeJoin(block, *it); } From 61c940d5a97a42813427e4db07503eeb92b063b3 Mon Sep 17 00:00:00 2001 From: chertus Date: Thu, 12 Sep 2019 15:24:19 +0300 Subject: [PATCH 028/102] better materialization logic in Join.cpp --- dbms/src/Interpreters/IJoin.cpp | 16 ++++++++++++++++ dbms/src/Interpreters/IJoin.h | 2 ++ dbms/src/Interpreters/Join.cpp | 31 ++++--------------------------- 3 files changed, 22 insertions(+), 27 deletions(-) diff --git a/dbms/src/Interpreters/IJoin.cpp b/dbms/src/Interpreters/IJoin.cpp index 46497a8ed30..84c014efe22 100644 --- a/dbms/src/Interpreters/IJoin.cpp +++ b/dbms/src/Interpreters/IJoin.cpp @@ -32,6 +32,22 @@ void convertColumnsToNullable(Block & block, size_t starting_pos) convertColumnToNullable(block.getByPosition(i)); } +ColumnRawPtrs temporaryMaterializeColumns(const Block & block, const Names & names, Columns & materialized) +{ + ColumnRawPtrs ptrs; + ptrs.reserve(names.size()); + materialized.reserve(names.size()); + + for (auto & column_name : names) + { + const auto & src_column = block.getByName(column_name).column; + materialized.emplace_back(recursiveRemoveLowCardinality(src_column->convertToFullColumnIfConst())); + ptrs.push_back(materialized.back().get()); + } + + return ptrs; +} + ColumnRawPtrs extractKeysForJoin(const Names & key_names_right, const Block & right_sample_block, Block & sample_block_with_keys, Block & sample_block_with_columns_to_add) { diff --git a/dbms/src/Interpreters/IJoin.h b/dbms/src/Interpreters/IJoin.h index 42eada1c43e..e716a1335bb 100644 --- a/dbms/src/Interpreters/IJoin.h +++ b/dbms/src/Interpreters/IJoin.h @@ -4,6 +4,7 @@ #include #include +#include namespace DB { @@ -41,6 +42,7 @@ namespace JoinCommon void convertColumnToNullable(ColumnWithTypeAndName & column); void convertColumnsToNullable(Block & block, size_t starting_pos = 0); +ColumnRawPtrs temporaryMaterializeColumns(const Block & block, const Names & names, Columns & materialized); /// Split key and other columns by keys name list ColumnRawPtrs extractKeysForJoin(const Names & key_names_right, const Block & right_sample_block, diff --git a/dbms/src/Interpreters/Join.cpp b/dbms/src/Interpreters/Join.cpp index 6b3d9351740..f064eaf0be1 100644 --- 
a/dbms/src/Interpreters/Join.cpp +++ b/dbms/src/Interpreters/Join.cpp @@ -453,19 +453,9 @@ bool Join::addJoinedBlock(const Block & block) if (empty()) throw Exception("Logical error: Join was not initialized", ErrorCodes::LOGICAL_ERROR); - size_t keys_size = key_names_right.size(); - ColumnRawPtrs key_columns(keys_size); - /// Rare case, when keys are constant. To avoid code bloat, simply materialize them. Columns materialized_columns; - materialized_columns.reserve(keys_size); - - /// Memoize key columns to work. - for (size_t i = 0; i < keys_size; ++i) - { - materialized_columns.emplace_back(recursiveRemoveLowCardinality(block.getByName(key_names_right[i]).column->convertToFullColumnIfConst())); - key_columns[i] = materialized_columns.back().get(); - } + ColumnRawPtrs key_columns = JoinCommon::temporaryMaterializeColumns(block, key_names_right, materialized_columns); /// We will insert to the map only keys, where all components are not NULL. ConstNullMapPtr null_map{}; @@ -478,15 +468,12 @@ bool Join::addJoinedBlock(const Block & block) prepareBlockListStructure(*stored_block); - size_t size = stored_block->columns(); - /// Rare case, when joined columns are constant. To avoid code bloat, simply materialize them. - for (size_t i = 0; i < size; ++i) - stored_block->safeGetByPosition(i).column = stored_block->safeGetByPosition(i).column->convertToFullColumnIfConst(); + *stored_block = materializeBlock(*stored_block); /// In case of LEFT and FULL joins, if use_nulls, convert joined columns to Nullable. if (use_nulls && isLeftOrFull(kind)) - JoinCommon::convertColumnsToNullable(*stored_block, (isFull(kind) ? keys_size : 0)); + JoinCommon::convertColumnsToNullable(*stored_block, (isFull(kind) ? key_names_right.size() : 0)); if (kind != ASTTableJoin::Kind::Cross) { @@ -723,19 +710,9 @@ void Join::joinBlockImpl( const Block & block_with_columns_to_add, const Maps & maps_) const { - size_t keys_size = key_names_left.size(); - ColumnRawPtrs key_columns(keys_size); - /// Rare case, when keys are constant. To avoid code bloat, simply materialize them. Columns materialized_columns; - materialized_columns.reserve(keys_size); - - /// Memoize key columns to work with. - for (size_t i = 0; i < keys_size; ++i) - { - materialized_columns.emplace_back(recursiveRemoveLowCardinality(block.getByName(key_names_left[i]).column->convertToFullColumnIfConst())); - key_columns[i] = materialized_columns.back().get(); - } + ColumnRawPtrs key_columns = JoinCommon::temporaryMaterializeColumns(block, key_names_left, materialized_columns); /// Keys with NULL value in any column won't join to anything. 
ConstNullMapPtr null_map{}; From 441faba0e30333628f63e6c401761c7e58be2858 Mon Sep 17 00:00:00 2001 From: chertus Date: Thu, 12 Sep 2019 15:59:53 +0300 Subject: [PATCH 029/102] materializeBlockInplace function --- dbms/src/DataStreams/materializeBlock.cpp | 6 ++++++ dbms/src/DataStreams/materializeBlock.h | 1 + dbms/src/Interpreters/Join.cpp | 5 ++--- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/dbms/src/DataStreams/materializeBlock.cpp b/dbms/src/DataStreams/materializeBlock.cpp index 60cd197912f..6b47cb87baa 100644 --- a/dbms/src/DataStreams/materializeBlock.cpp +++ b/dbms/src/DataStreams/materializeBlock.cpp @@ -20,4 +20,10 @@ Block materializeBlock(const Block & block) return res; } +void materializeBlockInplace(Block & block) +{ + for (size_t i = 0; i < block.columns(); ++i) + block.getByPosition(i).column = block.getByPosition(i).column->convertToFullColumnIfConst(); +} + } diff --git a/dbms/src/DataStreams/materializeBlock.h b/dbms/src/DataStreams/materializeBlock.h index 383be5d7c99..5e1499319c1 100644 --- a/dbms/src/DataStreams/materializeBlock.h +++ b/dbms/src/DataStreams/materializeBlock.h @@ -9,5 +9,6 @@ namespace DB /** Converts columns-constants to full columns ("materializes" them). */ Block materializeBlock(const Block & block); +void materializeBlockInplace(Block & block); } diff --git a/dbms/src/Interpreters/Join.cpp b/dbms/src/Interpreters/Join.cpp index f064eaf0be1..8781a95b14e 100644 --- a/dbms/src/Interpreters/Join.cpp +++ b/dbms/src/Interpreters/Join.cpp @@ -469,7 +469,7 @@ bool Join::addJoinedBlock(const Block & block) prepareBlockListStructure(*stored_block); /// Rare case, when joined columns are constant. To avoid code bloat, simply materialize them. - *stored_block = materializeBlock(*stored_block); + materializeBlockInplace(*stored_block); /// In case of LEFT and FULL joins, if use_nulls, convert joined columns to Nullable. 
if (use_nulls && isLeftOrFull(kind)) @@ -727,8 +727,7 @@ void Join::joinBlockImpl( constexpr bool right_or_full = static_in_v; if constexpr (right_or_full) { - for (size_t i = 0; i < block.columns(); ++i) - block.getByPosition(i).column = block.getByPosition(i).column->convertToFullColumnIfConst(); + materializeBlockInplace(block); if (use_nulls) JoinCommon::convertColumnsToNullable(block); From f7f2cface923aeb44d5f368094df98afeb274523 Mon Sep 17 00:00:00 2001 From: chertus Date: Thu, 12 Sep 2019 17:09:05 +0300 Subject: [PATCH 030/102] better nullability flags for JOINs --- dbms/src/Interpreters/AnalyzedJoin.h | 4 +++- dbms/src/Interpreters/Join.cpp | 22 +++++++++------------- dbms/src/Interpreters/Join.h | 8 ++++---- dbms/src/Interpreters/MergeJoin.cpp | 22 +++++++++++++--------- dbms/src/Interpreters/MergeJoin.h | 4 ++-- 5 files changed, 31 insertions(+), 29 deletions(-) diff --git a/dbms/src/Interpreters/AnalyzedJoin.h b/dbms/src/Interpreters/AnalyzedJoin.h index cc6a9c6aa08..5e4dc2d89cc 100644 --- a/dbms/src/Interpreters/AnalyzedJoin.h +++ b/dbms/src/Interpreters/AnalyzedJoin.h @@ -77,7 +77,9 @@ public: ASTTableJoin::Kind kind() const { return table_join.kind; } ASTTableJoin::Strictness strictness() const { return table_join.strictness; } const SizeLimits & sizeLimits() const { return size_limits; } - bool joinUseNulls() const { return join_use_nulls; } + + bool forceNullabelRight() const { return join_use_nulls && isLeftOrFull(table_join.kind); } + bool forceNullabelLeft() const { return join_use_nulls && isRightOrFull(table_join.kind); } void addUsingKey(const ASTPtr & ast); void addOnKeys(ASTPtr & left_table_ast, ASTPtr & right_table_ast); diff --git a/dbms/src/Interpreters/Join.cpp b/dbms/src/Interpreters/Join.cpp index 8781a95b14e..88863d819c3 100644 --- a/dbms/src/Interpreters/Join.cpp +++ b/dbms/src/Interpreters/Join.cpp @@ -66,7 +66,8 @@ Join::Join(const AnalyzedJoin & join_options_, const Block & right_sample_block, , strictness(join_options_.strictness()) , key_names_right(join_options_.keyNamesRight()) , required_right_keys(join_options_.requiredRightKeys()) - , use_nulls(join_options_.joinUseNulls()) + , nullable_right_side(join_options_.forceNullabelRight()) + , nullable_left_side(join_options_.forceNullabelLeft()) , any_take_last_row(any_take_last_row_) , log(&Logger::get("Join")) { @@ -295,8 +296,7 @@ void Join::setSampleBlock(const Block & block) JoinCommon::createMissedColumns(sample_block_with_columns_to_add); - /// In case of LEFT and FULL joins, if use_nulls, convert joined columns to Nullable. - if (use_nulls && isLeftOrFull(kind)) + if (nullable_right_side) JoinCommon::convertColumnsToNullable(sample_block_with_columns_to_add); } @@ -471,8 +471,7 @@ bool Join::addJoinedBlock(const Block & block) /// Rare case, when joined columns are constant. To avoid code bloat, simply materialize them. materializeBlockInplace(*stored_block); - /// In case of LEFT and FULL joins, if use_nulls, convert joined columns to Nullable. - if (use_nulls && isLeftOrFull(kind)) + if (nullable_right_side) JoinCommon::convertColumnsToNullable(*stored_block, (isFull(kind) ? 
key_names_right.size() : 0)); if (kind != ASTTableJoin::Kind::Cross) @@ -729,7 +728,7 @@ void Join::joinBlockImpl( { materializeBlockInplace(block); - if (use_nulls) + if (nullable_left_side) JoinCommon::convertColumnsToNullable(block); } @@ -754,7 +753,6 @@ void Join::joinBlockImpl( /// Filter & insert missing rows constexpr bool is_all_join = STRICTNESS == ASTTableJoin::Strictness::All; constexpr bool inner_or_right = static_in_v; - constexpr bool left_or_full = static_in_v; std::vector right_keys_to_replicate [[maybe_unused]]; @@ -773,7 +771,7 @@ void Join::joinBlockImpl( if (required_right_keys.count(right_key.name) && !block.has(right_key.name)) { const auto & col = block.getByName(left_name); - bool is_nullable = (use_nulls && left_or_full) || right_key.type->isNullable(); + bool is_nullable = nullable_right_side || right_key.type->isNullable(); block.insert(correctNullability({col.column, col.type, right_key.name}, is_nullable)); } } @@ -807,7 +805,7 @@ void Join::joinBlockImpl( mut_column->insertDefault(); } - bool is_nullable = (use_nulls && left_or_full) || right_key.type->isNullable(); + bool is_nullable = nullable_right_side || right_key.type->isNullable(); block.insert(correctNullability({std::move(mut_column), col.type, right_key.name}, is_nullable, null_map_filter)); if constexpr (is_all_join) @@ -1137,9 +1135,7 @@ private: std::unordered_map & left_to_right_key_map) { result_sample_block = materializeBlock(left_sample_block); - - /// Convert left columns to Nullable if allowed - if (parent.use_nulls) + if (parent.nullable_left_side) JoinCommon::convertColumnsToNullable(result_sample_block); /// Add columns from the right-side table to the block. @@ -1159,7 +1155,7 @@ private: if (parent.required_right_keys.count(right_key.name) && !result_sample_block.has(right_key.name)) { const auto & col = result_sample_block.getByPosition(left_key_pos); - bool is_nullable = (parent.use_nulls && isFull(parent.kind)) || right_key.type->isNullable(); + bool is_nullable = (parent.nullable_right_side && isFull(parent.kind)) || right_key.type->isNullable(); result_sample_block.insert(correctNullability({col.column, col.type, right_key.name}, is_nullable)); size_t right_key_pos = result_sample_block.getPositionByName(right_key.name); diff --git a/dbms/src/Interpreters/Join.h b/dbms/src/Interpreters/Join.h index fe84dac485f..3043d253460 100644 --- a/dbms/src/Interpreters/Join.h +++ b/dbms/src/Interpreters/Join.h @@ -128,8 +128,6 @@ public: bool empty() { return type == Type::EMPTY; } - bool isNullUsedAsDefault() const { return use_nulls; } - /** Add block of data from right hand of JOIN to the map. * Returns false, if some limit was exceeded and you should not insert more data. */ @@ -285,8 +283,10 @@ private: /// Names right-side table keys that are needed in result (would be attached after joined columns). const NameSet required_right_keys; - /// Substitute NULLs for non-JOINed rows. - bool use_nulls; + /// In case of LEFT and FULL joins, if use_nulls, convert right-side columns to Nullable. + bool nullable_right_side; + /// In case of RIGHT and FULL joins, if use_nulls, convert left-side columns to Nullable. 
+ bool nullable_left_side; /// Overwrite existing values when encountering the same key again bool any_take_last_row; diff --git a/dbms/src/Interpreters/MergeJoin.cpp b/dbms/src/Interpreters/MergeJoin.cpp index 2d4f582f50e..37c4cf9379d 100644 --- a/dbms/src/Interpreters/MergeJoin.cpp +++ b/dbms/src/Interpreters/MergeJoin.cpp @@ -14,10 +14,19 @@ namespace ErrorCodes MergeJoin::MergeJoin(const AnalyzedJoin & table_join_, const Block & right_sample_block) : table_join(table_join_) - , required_right_keys(table_join.requiredRightKeys()) + , nullable_right_side(table_join_.forceNullabelRight()) { - JoinCommon::extractKeysForJoin(table_join.keyNamesRight(), right_sample_block, right_table_keys, sample_block_with_columns_to_add); - JoinCommon::createMissedColumns(sample_block_with_columns_to_add); + JoinCommon::extractKeysForJoin(table_join.keyNamesRight(), right_sample_block, right_table_keys, right_columns_to_add); + + const NameSet required_right_keys = table_join.requiredRightKeys(); + for (const auto & column : right_table_keys) + if (required_right_keys.count(column.name)) + right_columns_to_add.insert(ColumnWithTypeAndName{nullptr, column.type, column.name}); + + JoinCommon::createMissedColumns(right_columns_to_add); + + if (nullable_right_side) + JoinCommon::convertColumnsToNullable(right_columns_to_add); } /// TODO: sort @@ -47,13 +56,8 @@ void MergeJoin::joinBlock(Block & block) void MergeJoin::addRightColumns(Block & block) { size_t rows = block.rows(); - - for (const auto & column : sample_block_with_columns_to_add) + for (const auto & column : right_columns_to_add) block.insert(ColumnWithTypeAndName{column.column->cloneResized(rows), column.type, column.name}); - - for (const auto & column : right_table_keys) - if (required_right_keys.count(column.name)) - block.insert(ColumnWithTypeAndName{column.column->cloneResized(rows), column.type, column.name}); } void MergeJoin::mergeJoin(Block & /*block*/, const Block & /*right_block*/) diff --git a/dbms/src/Interpreters/MergeJoin.h b/dbms/src/Interpreters/MergeJoin.h index b03fe4c0687..6a36e53fae2 100644 --- a/dbms/src/Interpreters/MergeJoin.h +++ b/dbms/src/Interpreters/MergeJoin.h @@ -26,10 +26,10 @@ public: private: mutable std::shared_mutex rwlock; const AnalyzedJoin & table_join; - const NameSet required_right_keys; Block right_table_keys; - Block sample_block_with_columns_to_add; + Block right_columns_to_add; BlocksList right_blocks; + bool nullable_right_side; size_t right_blocks_row_count = 0; size_t right_blocks_bytes = 0; From bb5287841fe7f2b5738e8b4d42ab1c3e3d1b120a Mon Sep 17 00:00:00 2001 From: chertus Date: Thu, 12 Sep 2019 21:06:25 +0300 Subject: [PATCH 031/102] make things wrong 2 --- dbms/src/Core/Settings.h | 2 +- dbms/src/Interpreters/AnalyzedJoin.cpp | 4 +- dbms/src/Interpreters/AnalyzedJoin.h | 4 +- dbms/src/Interpreters/MergeJoin.cpp | 50 +++++++++++++++++-- dbms/src/Interpreters/MergeJoin.h | 7 ++- .../Transforms/CreatingSetsTransform.cpp | 3 +- 6 files changed, 58 insertions(+), 12 deletions(-) diff --git a/dbms/src/Core/Settings.h b/dbms/src/Core/Settings.h index 7798a5b062b..112303a9fa5 100644 --- a/dbms/src/Core/Settings.h +++ b/dbms/src/Core/Settings.h @@ -287,7 +287,7 @@ struct Settings : public SettingsCollection M(SettingUInt64, max_bytes_in_join, 0, "Maximum size of the hash table for JOIN (in number of bytes in memory).") \ M(SettingOverflowMode, join_overflow_mode, OverflowMode::THROW, "What to do when the limit is exceeded.") \ M(SettingBool, join_any_take_last_row, false, "When disabled (default) ANY JOIN 
will take the first found row for a key. When enabled, it will take the last row seen if there are multiple rows for the same key.") \ - M(SettingBool, prefer_merge_join, false, "Use merge join algorithm instead of hash join if possible.") \ + M(SettingBool, partial_merge_join, false, "Use partial merge join instead of hash join if possible.") \ \ M(SettingUInt64, max_rows_to_transfer, 0, "Maximum size (in rows) of the transmitted external table obtained when the GLOBAL IN/JOIN section is executed.") \ M(SettingUInt64, max_bytes_to_transfer, 0, "Maximum size (in uncompressed bytes) of the transmitted external table obtained when the GLOBAL IN/JOIN section is executed.") \ diff --git a/dbms/src/Interpreters/AnalyzedJoin.cpp b/dbms/src/Interpreters/AnalyzedJoin.cpp index 37c8ee82656..453b9665c4c 100644 --- a/dbms/src/Interpreters/AnalyzedJoin.cpp +++ b/dbms/src/Interpreters/AnalyzedJoin.cpp @@ -26,7 +26,7 @@ namespace ErrorCodes AnalyzedJoin::AnalyzedJoin(const Settings & settings) : size_limits(SizeLimits{settings.max_rows_in_join, settings.max_bytes_in_join, settings.join_overflow_mode}) , join_use_nulls(settings.join_use_nulls) - , prefer_merge_join(settings.prefer_merge_join) + , partial_merge_join(settings.partial_merge_join) {} void AnalyzedJoin::addUsingKey(const ASTPtr & ast) @@ -246,7 +246,7 @@ BlockInputStreamPtr AnalyzedJoin::createStreamWithNonJoinedDataIfFullOrRightJoin JoinPtr AnalyzedJoin::makeJoin(const Block & right_sample_block) const { - if (prefer_merge_join) + if (partial_merge_join) return std::make_shared(*this, right_sample_block); return std::make_shared(*this, right_sample_block); } diff --git a/dbms/src/Interpreters/AnalyzedJoin.h b/dbms/src/Interpreters/AnalyzedJoin.h index 5e4dc2d89cc..f1957065bff 100644 --- a/dbms/src/Interpreters/AnalyzedJoin.h +++ b/dbms/src/Interpreters/AnalyzedJoin.h @@ -39,7 +39,7 @@ class AnalyzedJoin const SizeLimits size_limits; const bool join_use_nulls; - const bool prefer_merge_join; + const bool partial_merge_join; Names key_names_left; Names key_names_right; /// Duplicating names are qualified. 
@@ -67,7 +67,7 @@ public: const Names & key_names_right_) : size_limits(limits) , join_use_nulls(use_nulls) - , prefer_merge_join(false) + , partial_merge_join(false) , key_names_right(key_names_right_) { table_join.kind = kind; diff --git a/dbms/src/Interpreters/MergeJoin.cpp b/dbms/src/Interpreters/MergeJoin.cpp index 37c4cf9379d..edf83dc72f1 100644 --- a/dbms/src/Interpreters/MergeJoin.cpp +++ b/dbms/src/Interpreters/MergeJoin.cpp @@ -1,7 +1,9 @@ #include #include #include +#include #include +#include namespace DB { @@ -20,18 +22,40 @@ MergeJoin::MergeJoin(const AnalyzedJoin & table_join_, const Block & right_sampl const NameSet required_right_keys = table_join.requiredRightKeys(); for (const auto & column : right_table_keys) + { if (required_right_keys.count(column.name)) right_columns_to_add.insert(ColumnWithTypeAndName{nullptr, column.type, column.name}); + right_sort_description.emplace_back(SortColumnDescription(column.name, 1, 1)); + } + JoinCommon::createMissedColumns(right_columns_to_add); if (nullable_right_side) JoinCommon::convertColumnsToNullable(right_columns_to_add); + + NameSet unique_left_keys; + for (auto & key_name : table_join.keyNamesLeft()) + { + if (!unique_left_keys.count(key_name)) + { + unique_left_keys.insert(key_name); + left_sort_description.emplace_back(SortColumnDescription(key_name, 1, 1)); + } + } } -/// TODO: sort -bool MergeJoin::addJoinedBlock(const Block & block) +void MergeJoin::setTotals(const Block & totals_block) { + totals = totals_block; + mergeRightBlocks(); +} + +bool MergeJoin::addJoinedBlock(const Block & src_block) +{ + Block block = src_block; + sortBlock(block, right_sort_description); + std::unique_lock lock(rwlock); right_blocks.push_back(block); @@ -43,9 +67,10 @@ bool MergeJoin::addJoinedBlock(const Block & block) void MergeJoin::joinBlock(Block & block) { - std::shared_lock lock(rwlock); - JoinCommon::checkTypesOfKeys(block, table_join.keyNamesLeft(), right_table_keys, table_join.keyNamesRight()); + sortBlock(block, left_sort_description); + + std::shared_lock lock(rwlock); addRightColumns(block); @@ -60,6 +85,23 @@ void MergeJoin::addRightColumns(Block & block) block.insert(ColumnWithTypeAndName{column.column->cloneResized(rows), column.type, column.name}); } +void MergeJoin::mergeRightBlocks() +{ + const size_t max_merged_block_size = 128 * 1024 * 1024; + + Blocks unsorted_blocks; + unsorted_blocks.reserve(right_blocks.size()); + for (const auto & block : right_blocks) + unsorted_blocks.push_back(block); + + /// FIXME: there should be no splitted keys by blocks + MergeSortingBlocksBlockInputStream stream(unsorted_blocks, right_sort_description, max_merged_block_size); + + right_blocks.clear(); + while (Block block = stream.read()) + right_blocks.push_back(block); +} + void MergeJoin::mergeJoin(Block & /*block*/, const Block & /*right_block*/) { /// TODO diff --git a/dbms/src/Interpreters/MergeJoin.h b/dbms/src/Interpreters/MergeJoin.h index 6a36e53fae2..07fbe5a0bae 100644 --- a/dbms/src/Interpreters/MergeJoin.h +++ b/dbms/src/Interpreters/MergeJoin.h @@ -4,6 +4,7 @@ #include #include +#include #include @@ -20,20 +21,24 @@ public: bool addJoinedBlock(const Block & block) override; void joinBlock(Block &) override; void joinTotals(Block &) const override {} - void setTotals(const Block &) override {} + void setTotals(const Block &) override; size_t getTotalRowCount() const override { return right_blocks_row_count; } private: mutable std::shared_mutex rwlock; const AnalyzedJoin & table_join; + SortDescription right_sort_description; + 
SortDescription left_sort_description; Block right_table_keys; Block right_columns_to_add; BlocksList right_blocks; + Block totals; bool nullable_right_side; size_t right_blocks_row_count = 0; size_t right_blocks_bytes = 0; void addRightColumns(Block & block); + void mergeRightBlocks(); void mergeJoin(Block & block, const Block & right_block); }; diff --git a/dbms/src/Processors/Transforms/CreatingSetsTransform.cpp b/dbms/src/Processors/Transforms/CreatingSetsTransform.cpp index b6791d83723..c69f5f42e1c 100644 --- a/dbms/src/Processors/Transforms/CreatingSetsTransform.cpp +++ b/dbms/src/Processors/Transforms/CreatingSetsTransform.cpp @@ -82,8 +82,7 @@ void CreatingSetsTransform::finishSubquery(SubqueryForSet & subquery) head_rows = profile_info.rows; - if (subquery.join) - subquery.join->setTotals(subquery.source->getTotals()); + subquery.setTotals(); if (head_rows != 0) { From bd957168d206a3dfa0293eccaed2dfc4ef8367ec Mon Sep 17 00:00:00 2001 From: chertus Date: Fri, 13 Sep 2019 19:17:37 +0300 Subject: [PATCH 032/102] any left join (without use_nulls) --- dbms/src/Interpreters/MergeJoin.cpp | 237 ++++++++++++++++++---- dbms/src/Interpreters/MergeJoin.h | 15 +- dbms/src/Parsers/ASTTablesInSelectQuery.h | 3 + 3 files changed, 217 insertions(+), 38 deletions(-) diff --git a/dbms/src/Interpreters/MergeJoin.cpp b/dbms/src/Interpreters/MergeJoin.cpp index edf83dc72f1..55a33a350fe 100644 --- a/dbms/src/Interpreters/MergeJoin.cpp +++ b/dbms/src/Interpreters/MergeJoin.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -11,6 +12,107 @@ namespace DB namespace ErrorCodes { extern const int SET_SIZE_LIMIT_EXCEEDED; + extern const int NOT_IMPLEMENTED; +} + +struct MergeJoinEqualRange +{ + size_t left_start = 0; + size_t right_start = 0; + size_t left_length = 0; + size_t right_length = 0; + + bool empty() const { return !left_length && !right_length; } +}; + +using Range = MergeJoinEqualRange; + + +class MergeJoinCursor +{ +public: + MergeJoinCursor(const Block & block, const SortDescription & desc_) + : impl(SortCursorImpl(block, desc_)) + {} + + size_t position() const { return impl.pos; } + bool atEnd() const { return impl.pos >= impl.rows; } + void nextN(size_t num) { impl.pos += num; } + + int compareAt(const MergeJoinCursor & rhs, size_t lhs_pos, size_t rhs_pos) const + { + int res = 0; + for (size_t i = 0; i < impl.sort_columns_size; ++i) + { + res = impl.sort_columns[i]->compareAt(lhs_pos, rhs_pos, *(rhs.impl.sort_columns[i]), 1); + if (res) + break; + } + return res; + } + + bool sameNext(size_t lhs_pos) const + { + if (impl.isLast()) + return false; + + for (size_t i = 0; i < impl.sort_columns_size; ++i) + if (impl.sort_columns[i]->compareAt(lhs_pos, lhs_pos + 1, *(impl.sort_columns[i]), 1) != 0) + return false; + return true; + } + + size_t getEqualLength() + { + if (atEnd()) + return 0; + + size_t pos = impl.pos; + for (; pos < impl.rows; ++pos) + if (!sameNext(pos)) + break; + + return pos - impl.pos + 1; + } + + Range getNextEqualRange(MergeJoinCursor & rhs) + { + while (!atEnd() && !rhs.atEnd()) + { + int cmp = compareAt(rhs, impl.pos, rhs.impl.pos); + if (cmp < 0) + impl.next(); + if (cmp > 0) + rhs.impl.next(); + if (!cmp) + { + Range range{impl.pos, rhs.impl.pos, 0, 0}; + range.left_length = getEqualLength(); + range.right_length = rhs.getEqualLength(); + return range; + } + } + + return Range{impl.pos, rhs.impl.pos, 0, 0}; + } + +private: + SortCursorImpl impl; +}; + +static void makeSortAndMerge(const Names & keys, SortDescription & sort, SortDescription & merge) +{ 
+ NameSet unique_keys; + for (auto & key_name : keys) + { + merge.emplace_back(SortColumnDescription(key_name, 1, 1)); + + if (!unique_keys.count(key_name)) + { + unique_keys.insert(key_name); + sort.emplace_back(SortColumnDescription(key_name, 1, 1)); + } + } } @@ -18,31 +120,27 @@ MergeJoin::MergeJoin(const AnalyzedJoin & table_join_, const Block & right_sampl : table_join(table_join_) , nullable_right_side(table_join_.forceNullabelRight()) { + if (!isLeft(table_join.kind()) && !isInner(table_join.kind())) + throw Exception("Partial merge supported for LEFT and INNER JOINs only", ErrorCodes::NOT_IMPLEMENTED); +#if 0 + if (table_join.strictness() != ASTTableJoin::Strictness::Any) + throw Exception("Partial merge supported for ANY JOIN variant only", ErrorCodes::NOT_IMPLEMENTED); +#endif + JoinCommon::extractKeysForJoin(table_join.keyNamesRight(), right_sample_block, right_table_keys, right_columns_to_add); const NameSet required_right_keys = table_join.requiredRightKeys(); for (const auto & column : right_table_keys) - { if (required_right_keys.count(column.name)) right_columns_to_add.insert(ColumnWithTypeAndName{nullptr, column.type, column.name}); - right_sort_description.emplace_back(SortColumnDescription(column.name, 1, 1)); - } - JoinCommon::createMissedColumns(right_columns_to_add); if (nullable_right_side) JoinCommon::convertColumnsToNullable(right_columns_to_add); - NameSet unique_left_keys; - for (auto & key_name : table_join.keyNamesLeft()) - { - if (!unique_left_keys.count(key_name)) - { - unique_left_keys.insert(key_name); - left_sort_description.emplace_back(SortColumnDescription(key_name, 1, 1)); - } - } + makeSortAndMerge(table_join.keyNamesLeft(), left_sort_description, left_merge_description); + makeSortAndMerge(table_join.keyNamesRight(), right_sort_description, right_merge_description); } void MergeJoin::setTotals(const Block & totals_block) @@ -51,6 +149,23 @@ void MergeJoin::setTotals(const Block & totals_block) mergeRightBlocks(); } +void MergeJoin::mergeRightBlocks() +{ + const size_t max_merged_block_size = 128 * 1024 * 1024; + + Blocks unsorted_blocks; + unsorted_blocks.reserve(right_blocks.size()); + for (const auto & block : right_blocks) + unsorted_blocks.push_back(block); + + /// TODO: there should be no splitted keys by blocks for RIGHT|FULL JOIN + MergeSortingBlocksBlockInputStream stream(unsorted_blocks, right_sort_description, max_merged_block_size); + + right_blocks.clear(); + while (Block block = stream.read()) + right_blocks.push_back(block); +} + bool MergeJoin::addJoinedBlock(const Block & src_block) { Block block = src_block; @@ -72,39 +187,91 @@ void MergeJoin::joinBlock(Block & block) std::shared_lock lock(rwlock); - addRightColumns(block); + if (isLeft(table_join.kind())) + { + MutableColumns right_columns = makeRightColumns(0); - for (auto it = right_blocks.begin(); it != right_blocks.end(); ++it) - mergeJoin(block, *it); + MergeJoinCursor left_cursor(block, left_merge_description); + for (auto it = right_blocks.begin(); it != right_blocks.end(); ++it) + { + if (left_cursor.atEnd()) + break; + leftJoin(left_cursor, *it, right_columns); + } + + appendRightColumns(block, std::move(right_columns)); + } + else if (isInner(table_join.kind())) + { + /// TODO + MutableColumns right_columns = makeRightColumns(block.rows()); + appendRightColumns(block, std::move(right_columns)); + } } -void MergeJoin::addRightColumns(Block & block) +void MergeJoin::leftJoin(MergeJoinCursor & left_cursor, const Block & right_block, MutableColumns & right_columns) { - 
size_t rows = block.rows(); - for (const auto & column : right_columns_to_add) - block.insert(ColumnWithTypeAndName{column.column->cloneResized(rows), column.type, column.name}); + MergeJoinCursor right_cursor(right_block, right_merge_description); + + while (!left_cursor.atEnd() && !right_cursor.atEnd()) + { + size_t left_position = left_cursor.position(); + Range range = left_cursor.getNextEqualRange(right_cursor); + + if (left_position < range.left_start) + appendRightNulls(right_columns, range.left_start - left_position); + + if (range.empty()) + break; + + anyLeftJoinEquals(right_block, right_columns, range); + right_cursor.nextN(range.right_length); + + /// TODO: Do not run over last left keys for ALL JOIN (cause of possible duplicates in next right block) + //if (!right_cursor.atEnd()) + left_cursor.nextN(range.left_length); + } } -void MergeJoin::mergeRightBlocks() +MutableColumns MergeJoin::makeRightColumns(size_t rows) { - const size_t max_merged_block_size = 128 * 1024 * 1024; + MutableColumns columns; + columns.reserve(right_columns_to_add.columns()); - Blocks unsorted_blocks; - unsorted_blocks.reserve(right_blocks.size()); - for (const auto & block : right_blocks) - unsorted_blocks.push_back(block); - - /// FIXME: there should be no splitted keys by blocks - MergeSortingBlocksBlockInputStream stream(unsorted_blocks, right_sort_description, max_merged_block_size); - - right_blocks.clear(); - while (Block block = stream.read()) - right_blocks.push_back(block); + for (const auto & src_column : right_columns_to_add) + columns.push_back(src_column.column->cloneResized(rows)); + return columns; } -void MergeJoin::mergeJoin(Block & /*block*/, const Block & /*right_block*/) +void MergeJoin::appendRightColumns(Block & block, MutableColumns && right_columns) { - /// TODO + for (size_t i = 0; i < right_columns_to_add.columns(); ++i) + { + const auto & column = right_columns_to_add.getByPosition(i); + block.insert(ColumnWithTypeAndName{std::move(right_columns[i]), column.type, column.name}); + } +} + +void MergeJoin::appendRightNulls(MutableColumns & right_columns, size_t rows_to_add) +{ + for (auto & column : right_columns) + for (size_t i = 0; i < rows_to_add; ++i) + column->insertDefault(); +} + +void MergeJoin::anyLeftJoinEquals(const Block & right_block, MutableColumns & right_columns, const Range & range) +{ + size_t rows_to_insert = range.left_length; + size_t any_row_position = range.right_start; + + for (size_t i = 0; i < right_columns_to_add.columns(); ++i) + { + const auto & src_column = right_block.getByName(right_columns_to_add.getByPosition(i).name); + auto & dst_column = right_columns[i]; + + for (size_t row = 0; row < rows_to_insert; ++row) + dst_column->insertFrom(*src_column.column, any_row_position); + } } } diff --git a/dbms/src/Interpreters/MergeJoin.h b/dbms/src/Interpreters/MergeJoin.h index 07fbe5a0bae..55b30f04947 100644 --- a/dbms/src/Interpreters/MergeJoin.h +++ b/dbms/src/Interpreters/MergeJoin.h @@ -12,6 +12,8 @@ namespace DB { class AnalyzedJoin; +class MergeJoinCursor; +struct MergeJoinEqualRange; class MergeJoin : public IJoin { @@ -27,8 +29,10 @@ public: private: mutable std::shared_mutex rwlock; const AnalyzedJoin & table_join; - SortDescription right_sort_description; SortDescription left_sort_description; + SortDescription right_sort_description; + SortDescription left_merge_description; + SortDescription right_merge_description; Block right_table_keys; Block right_columns_to_add; BlocksList right_blocks; @@ -37,9 +41,14 @@ private: size_t 
right_blocks_row_count = 0; size_t right_blocks_bytes = 0; - void addRightColumns(Block & block); + MutableColumns makeRightColumns(size_t rows); + void appendRightColumns(Block & block, MutableColumns && right_columns); + void mergeRightBlocks(); - void mergeJoin(Block & block, const Block & right_block); + void leftJoin(MergeJoinCursor & left_cursor, const Block & right_block, MutableColumns & right_columns); + + void appendRightNulls(MutableColumns & right_columns, size_t rows_to_add); + void anyLeftJoinEquals(const Block & right_block, MutableColumns & right_columns, const MergeJoinEqualRange & range); }; } diff --git a/dbms/src/Parsers/ASTTablesInSelectQuery.h b/dbms/src/Parsers/ASTTablesInSelectQuery.h index 17882750cf0..9691dee96fa 100644 --- a/dbms/src/Parsers/ASTTablesInSelectQuery.h +++ b/dbms/src/Parsers/ASTTablesInSelectQuery.h @@ -107,6 +107,9 @@ struct ASTTableJoin : public IAST void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; }; +inline bool isLeft(ASTTableJoin::Kind kind) { return kind == ASTTableJoin::Kind::Left; } +inline bool isRight(ASTTableJoin::Kind kind) { return kind == ASTTableJoin::Kind::Right; } +inline bool isInner(ASTTableJoin::Kind kind) { return kind == ASTTableJoin::Kind::Inner; } inline bool isFull(ASTTableJoin::Kind kind) { return kind == ASTTableJoin::Kind::Full; } inline bool isCross(ASTTableJoin::Kind kind) { return kind == ASTTableJoin::Kind::Cross; } inline bool isComma(ASTTableJoin::Kind kind) { return kind == ASTTableJoin::Kind::Comma; } From 26b1bc2093005ad91d78904aa4dda955d238c2f8 Mon Sep 17 00:00:00 2001 From: dimarub2000 Date: Fri, 13 Sep 2019 19:18:26 +0300 Subject: [PATCH 033/102] all_of setting now works properly --- dbms/programs/performance-test/StopConditionsSet.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dbms/programs/performance-test/StopConditionsSet.cpp b/dbms/programs/performance-test/StopConditionsSet.cpp index 45ae65f3600..5a76e41c90e 100644 --- a/dbms/programs/performance-test/StopConditionsSet.cpp +++ b/dbms/programs/performance-test/StopConditionsSet.cpp @@ -32,8 +32,9 @@ void StopConditionsSet::loadFromConfig(const ConfigurationPtr & stop_conditions_ average_speed_not_changing_for_ms.value = stop_conditions_view->getUInt64(key); else throw Exception("Met unkown stop condition: " + key, ErrorCodes::LOGICAL_ERROR); + + ++initialized_count; } - ++initialized_count; } void StopConditionsSet::reset() From 24d0055dc88d58ab4df735bceeff0dfe13addbcb Mon Sep 17 00:00:00 2001 From: dimarub2000 Date: Fri, 13 Sep 2019 19:24:59 +0300 Subject: [PATCH 034/102] Typo --- dbms/programs/performance-test/StopConditionsSet.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/programs/performance-test/StopConditionsSet.cpp b/dbms/programs/performance-test/StopConditionsSet.cpp index 5a76e41c90e..58d3383e81c 100644 --- a/dbms/programs/performance-test/StopConditionsSet.cpp +++ b/dbms/programs/performance-test/StopConditionsSet.cpp @@ -31,7 +31,7 @@ void StopConditionsSet::loadFromConfig(const ConfigurationPtr & stop_conditions_ else if (key == "average_speed_not_changing_for_ms") average_speed_not_changing_for_ms.value = stop_conditions_view->getUInt64(key); else - throw Exception("Met unkown stop condition: " + key, ErrorCodes::LOGICAL_ERROR); + throw Exception("Met unknown stop condition: " + key, ErrorCodes::LOGICAL_ERROR); ++initialized_count; } From eb3d87032c0e43644235fdce0ba3285615d76a0f Mon Sep 17 00:00:00 2001 From: chertus Date: 
Fri, 13 Sep 2019 20:23:32 +0300 Subject: [PATCH 035/102] all|any left|inner, not tested --- dbms/src/Interpreters/MergeJoin.cpp | 105 ++++++++++++++++++++++------ dbms/src/Interpreters/MergeJoin.h | 8 ++- 2 files changed, 91 insertions(+), 22 deletions(-) diff --git a/dbms/src/Interpreters/MergeJoin.cpp b/dbms/src/Interpreters/MergeJoin.cpp index 55a33a350fe..ea512697ddb 100644 --- a/dbms/src/Interpreters/MergeJoin.cpp +++ b/dbms/src/Interpreters/MergeJoin.cpp @@ -122,10 +122,6 @@ MergeJoin::MergeJoin(const AnalyzedJoin & table_join_, const Block & right_sampl { if (!isLeft(table_join.kind()) && !isInner(table_join.kind())) throw Exception("Partial merge supported for LEFT and INNER JOINs only", ErrorCodes::NOT_IMPLEMENTED); -#if 0 - if (table_join.strictness() != ASTTableJoin::Strictness::Any) - throw Exception("Partial merge supported for ANY JOIN variant only", ErrorCodes::NOT_IMPLEMENTED); -#endif JoinCommon::extractKeysForJoin(table_join.keyNamesRight(), right_sample_block, right_table_keys, right_columns_to_add); @@ -189,7 +185,7 @@ void MergeJoin::joinBlock(Block & block) if (isLeft(table_join.kind())) { - MutableColumns right_columns = makeRightColumns(0); + MutableColumns right_columns = makeMutableColumns(right_columns_to_add); MergeJoinCursor left_cursor(block, left_merge_description); for (auto it = right_blocks.begin(); it != right_blocks.end(); ++it) @@ -203,8 +199,19 @@ void MergeJoin::joinBlock(Block & block) } else if (isInner(table_join.kind())) { - /// TODO - MutableColumns right_columns = makeRightColumns(block.rows()); + MutableColumns left_columns = makeMutableColumns(block); + MutableColumns right_columns = makeMutableColumns(right_columns_to_add); + + MergeJoinCursor left_cursor(block, left_merge_description); + for (auto it = right_blocks.begin(); it != right_blocks.end(); ++it) + { + if (left_cursor.atEnd()) + break; + innerJoin(left_cursor, block, *it, left_columns, right_columns); + } + + block.clear(); + appendRightColumns(block, std::move(left_columns)); appendRightColumns(block, std::move(right_columns)); } } @@ -224,7 +231,7 @@ void MergeJoin::leftJoin(MergeJoinCursor & left_cursor, const Block & right_bloc if (range.empty()) break; - anyLeftJoinEquals(right_block, right_columns, range); + leftJoinEquals(right_block, right_columns, range); right_cursor.nextN(range.right_length); /// TODO: Do not run over last left keys for ALL JOIN (cause of possible duplicates in next right block) @@ -233,13 +240,33 @@ void MergeJoin::leftJoin(MergeJoinCursor & left_cursor, const Block & right_bloc } } -MutableColumns MergeJoin::makeRightColumns(size_t rows) +void MergeJoin::innerJoin(MergeJoinCursor & left_cursor, const Block & left_block, const Block & right_block, + MutableColumns & left_columns, MutableColumns & right_columns) +{ + MergeJoinCursor right_cursor(right_block, right_merge_description); + + while (!left_cursor.atEnd() && !right_cursor.atEnd()) + { + Range range = left_cursor.getNextEqualRange(right_cursor); + if (range.empty()) + break; + + innerJoinEquals(left_block, right_block, left_columns, right_columns, range); + right_cursor.nextN(range.right_length); + + /// TODO: Do not run over last left keys for ALL JOIN (cause of possible duplicates in next right block) + //if (!right_cursor.atEnd()) + left_cursor.nextN(range.left_length); + } +} + +MutableColumns MergeJoin::makeMutableColumns(const Block & block) { MutableColumns columns; - columns.reserve(right_columns_to_add.columns()); + columns.reserve(block.columns()); - for (const auto & src_column : 
right_columns_to_add) - columns.push_back(src_column.column->cloneResized(rows)); + for (const auto & src_column : block) + columns.push_back(src_column.column->cloneEmpty()); return columns; } @@ -259,18 +286,56 @@ void MergeJoin::appendRightNulls(MutableColumns & right_columns, size_t rows_to_ column->insertDefault(); } -void MergeJoin::anyLeftJoinEquals(const Block & right_block, MutableColumns & right_columns, const Range & range) +void MergeJoin::leftJoinEquals(const Block & right_block, MutableColumns & right_columns, const Range & range) { - size_t rows_to_insert = range.left_length; - size_t any_row_position = range.right_start; + bool any = table_join.strictness() == ASTTableJoin::Strictness::Any; - for (size_t i = 0; i < right_columns_to_add.columns(); ++i) + size_t left_rows_to_insert = range.left_length; + size_t right_rows_to_insert = any ? 1 : range.right_length; + + size_t row_position = range.right_start; + for (size_t right_row = 0; right_row < right_rows_to_insert; ++right_row, ++row_position) { - const auto & src_column = right_block.getByName(right_columns_to_add.getByPosition(i).name); - auto & dst_column = right_columns[i]; + for (size_t i = 0; i < right_columns_to_add.columns(); ++i) + { + const auto & src_column = right_block.getByName(right_columns_to_add.getByPosition(i).name); + auto & dst_column = right_columns[i]; - for (size_t row = 0; row < rows_to_insert; ++row) - dst_column->insertFrom(*src_column.column, any_row_position); + for (size_t left_row = 0; left_row < left_rows_to_insert; ++left_row) + dst_column->insertFrom(*src_column.column, row_position); + } + } +} + +void MergeJoin::innerJoinEquals(const Block & left_block, const Block & right_block, + MutableColumns & left_columns, MutableColumns & right_columns, const Range & range) +{ + bool any = table_join.strictness() == ASTTableJoin::Strictness::Any; + + size_t left_rows_to_insert = range.left_length; + size_t right_rows_to_insert = any ? 
1 : range.right_length; + + size_t row_position = range.right_start; + for (size_t right_row = 0; right_row < right_rows_to_insert; ++right_row, ++row_position) + { + for (size_t i = 0; i < left_block.columns(); ++i) + { + const auto & src_column = left_block.getByPosition(i); + auto & dst_column = left_columns[i]; + + size_t row_pos = range.left_start; + for (size_t row = 0; row < left_rows_to_insert; ++row, ++row_pos) + dst_column->insertFrom(*src_column.column, row_pos); + } + + for (size_t i = 0; i < right_columns_to_add.columns(); ++i) + { + const auto & src_column = right_block.getByName(right_columns_to_add.getByPosition(i).name); + auto & dst_column = right_columns[i]; + + for (size_t row = 0; row < left_rows_to_insert; ++row) + dst_column->insertFrom(*src_column.column, row_position); + } } } diff --git a/dbms/src/Interpreters/MergeJoin.h b/dbms/src/Interpreters/MergeJoin.h index 55b30f04947..26423395421 100644 --- a/dbms/src/Interpreters/MergeJoin.h +++ b/dbms/src/Interpreters/MergeJoin.h @@ -41,14 +41,18 @@ private: size_t right_blocks_row_count = 0; size_t right_blocks_bytes = 0; - MutableColumns makeRightColumns(size_t rows); + MutableColumns makeMutableColumns(const Block & block); void appendRightColumns(Block & block, MutableColumns && right_columns); void mergeRightBlocks(); void leftJoin(MergeJoinCursor & left_cursor, const Block & right_block, MutableColumns & right_columns); + void innerJoin(MergeJoinCursor & left_cursor, const Block & left_block, const Block & right_block, + MutableColumns & left_columns, MutableColumns & right_columns); void appendRightNulls(MutableColumns & right_columns, size_t rows_to_add); - void anyLeftJoinEquals(const Block & right_block, MutableColumns & right_columns, const MergeJoinEqualRange & range); + void leftJoinEquals(const Block & right_block, MutableColumns & right_columns, const MergeJoinEqualRange & range); + void innerJoinEquals(const Block & left_block, const Block & right_block, + MutableColumns & left_columns, MutableColumns & right_columns, const MergeJoinEqualRange & range); }; } From cd95e8e5e5dc92ac8b43a94e13ef668efb0181c8 Mon Sep 17 00:00:00 2001 From: Dmitry Rubashkin Date: Fri, 13 Sep 2019 20:49:53 +0300 Subject: [PATCH 036/102] Fix inconsistent behaviour of IN with Enums --- dbms/src/Interpreters/convertFieldToType.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/dbms/src/Interpreters/convertFieldToType.cpp b/dbms/src/Interpreters/convertFieldToType.cpp index c4684689947..89780d077e6 100644 --- a/dbms/src/Interpreters/convertFieldToType.cpp +++ b/dbms/src/Interpreters/convertFieldToType.cpp @@ -181,9 +181,16 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID if (!which_type.isDateOrDateTime() && !which_type.isUUID() && !which_type.isEnum()) throw Exception{"Logical error: unknown numeric type " + type.getName(), ErrorCodes::LOGICAL_ERROR}; - /// Numeric values for Enums should not be used directly in IN section - if (src.getType() == Field::Types::UInt64 && !which_type.isEnum()) + if (src.getType() == Field::Types::UInt64) + { + if (which_type.isEnum()) + { + /// Convert UInt64 to Enum's value + return dynamic_cast(type).castToValue(src); + } + return src; + } if (src.getType() == Field::Types::String) { From 33e7e1ea930e624d5c3c1c03eede087e7a1c0427 Mon Sep 17 00:00:00 2001 From: Dmitry Rubashkin Date: Fri, 13 Sep 2019 21:25:53 +0300 Subject: [PATCH 037/102] Int64 added. 
--- dbms/src/Interpreters/convertFieldToType.cpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/dbms/src/Interpreters/convertFieldToType.cpp b/dbms/src/Interpreters/convertFieldToType.cpp index 89780d077e6..7ecbf497704 100644 --- a/dbms/src/Interpreters/convertFieldToType.cpp +++ b/dbms/src/Interpreters/convertFieldToType.cpp @@ -181,17 +181,15 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID if (!which_type.isDateOrDateTime() && !which_type.isUUID() && !which_type.isEnum()) throw Exception{"Logical error: unknown numeric type " + type.getName(), ErrorCodes::LOGICAL_ERROR}; - if (src.getType() == Field::Types::UInt64) + if (which_type.isEnum() && (src.getType() == Field::Types::UInt64 || src.getType() == Field::Types::Int64)) { - if (which_type.isEnum()) - { - /// Convert UInt64 to Enum's value - return dynamic_cast(type).castToValue(src); - } - - return src; + /// Convert UInt64 or Int64 to Enum's value + return dynamic_cast(type).castToValue(src); } + if (src.getType() == Field::Types::UInt64) + return src; + if (src.getType() == Field::Types::String) { if (which_type.isDate()) From ea3f80460c17fe1c3f3fb2e1da07efd5439bf764 Mon Sep 17 00:00:00 2001 From: Dmitry Rubashkin Date: Fri, 13 Sep 2019 22:07:22 +0300 Subject: [PATCH 038/102] Tests. --- .../queries/0_stateless/01001_enums_in_in_section.reference | 6 ++++++ .../tests/queries/0_stateless/01001_enums_in_in_section.sql | 5 +++++ 2 files changed, 11 insertions(+) create mode 100644 dbms/tests/queries/0_stateless/01001_enums_in_in_section.reference create mode 100644 dbms/tests/queries/0_stateless/01001_enums_in_in_section.sql diff --git a/dbms/tests/queries/0_stateless/01001_enums_in_in_section.reference b/dbms/tests/queries/0_stateless/01001_enums_in_in_section.reference new file mode 100644 index 00000000000..2a8ab4eb584 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01001_enums_in_in_section.reference @@ -0,0 +1,6 @@ +find me +and me +also me +find me +and me +also me diff --git a/dbms/tests/queries/0_stateless/01001_enums_in_in_section.sql b/dbms/tests/queries/0_stateless/01001_enums_in_in_section.sql new file mode 100644 index 00000000000..be58ea63a97 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01001_enums_in_in_section.sql @@ -0,0 +1,5 @@ +DROP TABLE IF EXISTS enums; +CREATE TABLE enums AS VALUES('x Enum8(\'hello\' = 0, \'world\' = 1, \'foo\' = -1), y String', ('hello', 'find me'), (0, 'and me'), (-1, 'also me'), ('world', 'don\'t find me')); +SELECT y FROM enums WHERE x IN (0, -1); +SELECT y FROM enums WHERE x IN ('hello', -1); + From 57eef8432290d5e6934518dfd05ead9605d44196 Mon Sep 17 00:00:00 2001 From: Dmitry Rubashkin Date: Fri, 13 Sep 2019 22:09:00 +0300 Subject: [PATCH 039/102] DROP --- dbms/tests/queries/0_stateless/01001_enums_in_in_section.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/tests/queries/0_stateless/01001_enums_in_in_section.sql b/dbms/tests/queries/0_stateless/01001_enums_in_in_section.sql index be58ea63a97..d9932421f3e 100644 --- a/dbms/tests/queries/0_stateless/01001_enums_in_in_section.sql +++ b/dbms/tests/queries/0_stateless/01001_enums_in_in_section.sql @@ -2,4 +2,4 @@ DROP TABLE IF EXISTS enums; CREATE TABLE enums AS VALUES('x Enum8(\'hello\' = 0, \'world\' = 1, \'foo\' = -1), y String', ('hello', 'find me'), (0, 'and me'), (-1, 'also me'), ('world', 'don\'t find me')); SELECT y FROM enums WHERE x IN (0, -1); SELECT y FROM enums WHERE x IN ('hello', -1); - +DROP TABLE enums; From 
25fdaefcaea129c5659edbb3de4927c1f794afa4 Mon Sep 17 00:00:00 2001 From: proller Date: Sat, 14 Sep 2019 12:11:52 +0300 Subject: [PATCH 040/102] remove gcc-7 --- utils/build/build_debian.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/utils/build/build_debian.sh b/utils/build/build_debian.sh index 56dd3f2234d..e4b3a57b6e7 100755 --- a/utils/build/build_debian.sh +++ b/utils/build/build_debian.sh @@ -6,7 +6,9 @@ # curl https://raw.githubusercontent.com/yandex/ClickHouse/master/utils/build/build_debian.sh | sh # install compiler and libs -sudo apt install -y git bash cmake ninja-build gcc-7 g++-7 libicu-dev libreadline-dev gperf +sudo apt install -y git bash cmake ninja-build libicu-dev libreadline-dev gperf +sudo apt install -y gcc-9 g++-9 ||: +sudo apt install -y gcc-8 g++-8 ||: # for -DUNBUNDLED=1 mode: #sudo apt install -y libboost-program-options-dev libboost-system-dev libboost-filesystem-dev libboost-thread-dev zlib1g-dev liblz4-dev libdouble-conversion-dev libzstd-dev libre2-dev libsparsehash-dev librdkafka-dev libcapnp-dev libpoco-dev libsparsehash-dev libgoogle-perftools-dev libunwind-dev googletest libcctz-dev @@ -24,7 +26,7 @@ fi # Build! mkdir -p build cd build -cmake .. -DCMAKE_CXX_COMPILER=`which g++-7 g++-8 | head -n1` -DCMAKE_C_COMPILER=`which gcc-7 gcc-8 | head -n1` +cmake .. -DCMAKE_CXX_COMPILER=`which g++-9 g++-8 | head -n1` -DCMAKE_C_COMPILER=`which gcc-9 gcc-8 | head -n1` cmake --build . cd .. From bcffa17b92d01da3d8bd47b00bfe916432a3fa43 Mon Sep 17 00:00:00 2001 From: proller Date: Sat, 14 Sep 2019 12:20:17 +0300 Subject: [PATCH 041/102] better --- utils/build/build_debian.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/build/build_debian.sh b/utils/build/build_debian.sh index e4b3a57b6e7..24156377bbe 100755 --- a/utils/build/build_debian.sh +++ b/utils/build/build_debian.sh @@ -8,7 +8,7 @@ # install compiler and libs sudo apt install -y git bash cmake ninja-build libicu-dev libreadline-dev gperf sudo apt install -y gcc-9 g++-9 ||: -sudo apt install -y gcc-8 g++-8 ||: +[ -z `which g++-9` ] && sudo apt install -y gcc-8 g++-8 ||: # for -DUNBUNDLED=1 mode: #sudo apt install -y libboost-program-options-dev libboost-system-dev libboost-filesystem-dev libboost-thread-dev zlib1g-dev liblz4-dev libdouble-conversion-dev libzstd-dev libre2-dev libsparsehash-dev librdkafka-dev libcapnp-dev libpoco-dev libsparsehash-dev libgoogle-perftools-dev libunwind-dev googletest libcctz-dev From 98050c108f9d27d198c2343e6bae6a60fb6e95ec Mon Sep 17 00:00:00 2001 From: malkfilipp Date: Sun, 15 Sep 2019 19:07:27 +0300 Subject: [PATCH 042/102] Add LIMIT clause to SHOW queries --- dbms/src/Interpreters/InterpreterShowTablesQuery.cpp | 4 ++++ dbms/src/Parsers/ASTShowTablesQuery.cpp | 8 +++++++- dbms/src/Parsers/ASTShowTablesQuery.h | 1 + dbms/src/Parsers/ParserShowTablesQuery.cpp | 9 +++++++++ dbms/src/Parsers/ParserShowTablesQuery.h | 4 ++-- 5 files changed, 23 insertions(+), 3 deletions(-) diff --git a/dbms/src/Interpreters/InterpreterShowTablesQuery.cpp b/dbms/src/Interpreters/InterpreterShowTablesQuery.cpp index 774edcc3390..dcfe76adb82 100644 --- a/dbms/src/Interpreters/InterpreterShowTablesQuery.cpp +++ b/dbms/src/Interpreters/InterpreterShowTablesQuery.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -53,6 +54,9 @@ String InterpreterShowTablesQuery::getRewrittenQuery() if (!query.like.empty()) rewritten_query << " AND name " << (query.not_like ? 
"NOT " : "") << "LIKE " << std::quoted(query.like, '\''); + if (query.limit_length) + rewritten_query << " LIMIT " << query.limit_length; + return rewritten_query.str(); } diff --git a/dbms/src/Parsers/ASTShowTablesQuery.cpp b/dbms/src/Parsers/ASTShowTablesQuery.cpp index dd7b0d013ad..4a33aeba99c 100644 --- a/dbms/src/Parsers/ASTShowTablesQuery.cpp +++ b/dbms/src/Parsers/ASTShowTablesQuery.cpp @@ -13,7 +13,7 @@ ASTPtr ASTShowTablesQuery::clone() const return res; } -void ASTShowTablesQuery::formatQueryImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const +void ASTShowTablesQuery::formatQueryImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const { if (databases) { @@ -30,6 +30,12 @@ void ASTShowTablesQuery::formatQueryImpl(const FormatSettings & settings, Format if (!like.empty()) settings.ostr << (settings.hilite ? hilite_keyword : "") << " LIKE " << (settings.hilite ? hilite_none : "") << std::quoted(like, '\''); + + if (limit_length) + { + settings.ostr << (settings.hilite ? hilite_keyword : "") << " LIMIT " << (settings.hilite ? hilite_none : ""); + limit_length->formatImpl(settings, state, frame); + } } } diff --git a/dbms/src/Parsers/ASTShowTablesQuery.h b/dbms/src/Parsers/ASTShowTablesQuery.h index 9b994b6e31f..f3500f437c3 100644 --- a/dbms/src/Parsers/ASTShowTablesQuery.h +++ b/dbms/src/Parsers/ASTShowTablesQuery.h @@ -19,6 +19,7 @@ public: String from; String like; bool not_like{false}; + ASTPtr limit_length; /** Get the text that identifies this element. */ String getID(char) const override { return "ShowTables"; } diff --git a/dbms/src/Parsers/ParserShowTablesQuery.cpp b/dbms/src/Parsers/ParserShowTablesQuery.cpp index 00e5dcd451e..3fe43c4557d 100644 --- a/dbms/src/Parsers/ParserShowTablesQuery.cpp +++ b/dbms/src/Parsers/ParserShowTablesQuery.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -22,8 +23,10 @@ bool ParserShowTablesQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expec ParserKeyword s_from("FROM"); ParserKeyword s_not("NOT"); ParserKeyword s_like("LIKE"); + ParserKeyword s_limit("LIMIT"); ParserStringLiteral like_p; ParserIdentifier name_p; + ParserExpressionWithOptionalAlias limit_p(false); ASTPtr like; ASTPtr database; @@ -60,6 +63,12 @@ bool ParserShowTablesQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expec } else if (query->not_like) return false; + + if (s_limit.ignore(pos, expected)) + { + if (!limit_p.parse(pos, query->limit_length, expected)) + return false; + } } else return false; diff --git a/dbms/src/Parsers/ParserShowTablesQuery.h b/dbms/src/Parsers/ParserShowTablesQuery.h index 5d6cbc42391..29b6d56159b 100644 --- a/dbms/src/Parsers/ParserShowTablesQuery.h +++ b/dbms/src/Parsers/ParserShowTablesQuery.h @@ -7,14 +7,14 @@ namespace DB { /** Query like this: - * SHOW TABLES [FROM db] [[NOT] LIKE 'str'] + * SHOW TABLES [FROM db] [[NOT] LIKE 'str'] [LIMIT expr] * or * SHOW DATABASES. 
*/ class ParserShowTablesQuery : public IParserBase { protected: - const char * getName() const { return "SHOW [TEMPORARY] TABLES|DATABASES [[NOT] LIKE 'str']"; } + const char * getName() const { return "SHOW [TEMPORARY] TABLES|DATABASES [[NOT] LIKE 'str'] [LIMIT expr]"; } bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected); }; From ed42b94af7b64bc13ff420b9b2370aff292826fb Mon Sep 17 00:00:00 2001 From: malkfilipp Date: Sun, 15 Sep 2019 19:08:26 +0300 Subject: [PATCH 043/102] Add tests --- .../01011_show_tables_limit.reference | 15 ++++++++++++++ .../0_stateless/01011_show_tables_limit.sql | 20 +++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 dbms/tests/queries/0_stateless/01011_show_tables_limit.reference create mode 100644 dbms/tests/queries/0_stateless/01011_show_tables_limit.sql diff --git a/dbms/tests/queries/0_stateless/01011_show_tables_limit.reference b/dbms/tests/queries/0_stateless/01011_show_tables_limit.reference new file mode 100644 index 00000000000..4d33d4016db --- /dev/null +++ b/dbms/tests/queries/0_stateless/01011_show_tables_limit.reference @@ -0,0 +1,15 @@ +*** Should show 6: *** +test1 +test2 +test3 +test4 +test5 +test6 +*** Should show 2: *** +test1 +test2 +*** Should show 4: *** +test1 +test2 +test3 +test4 diff --git a/dbms/tests/queries/0_stateless/01011_show_tables_limit.sql b/dbms/tests/queries/0_stateless/01011_show_tables_limit.sql new file mode 100644 index 00000000000..f75ba88cf2b --- /dev/null +++ b/dbms/tests/queries/0_stateless/01011_show_tables_limit.sql @@ -0,0 +1,20 @@ +DROP DATABASE IF EXISTS test_show_limit; + +CREATE DATABASE test_show_limit; + +CREATE TABLE test_show_limit.test1 (test UInt8) ENGINE = TinyLog; +CREATE TABLE test_show_limit.test2 (test UInt8) ENGINE = TinyLog; +CREATE TABLE test_show_limit.test3 (test UInt8) ENGINE = TinyLog; +CREATE TABLE test_show_limit.test4 (test UInt8) ENGINE = TinyLog; +CREATE TABLE test_show_limit.test5 (test UInt8) ENGINE = TinyLog; +CREATE TABLE test_show_limit.test6 (test UInt8) ENGINE = TinyLog; + +SELECT '*** Should show 6: ***'; +SHOW TABLES FROM test_show_limit; +SELECT '*** Should show 2: ***'; +SHOW TABLES FROM test_show_limit LIMIT 2; +SELECT '*** Should show 4: ***'; +SHOW TABLES FROM test_show_limit LIMIT 2 * 2; + +DROP DATABASE test_show_limit; + From 24e91142880d613ae3d802565b3cc0b711a3a448 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 16 Sep 2019 14:00:00 +0300 Subject: [PATCH 044/102] Add gdb index to builds with debug info --- CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9513ee85e02..5e20b1fa7ab 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -156,6 +156,11 @@ endif () # Make sure the final executable has symbols exported set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -rdynamic") +if (NOT CMAKE_BUILD_TYPE_UC STREQUAL "RELEASE") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--gdb-index") + set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--gdb-index") +endif() + cmake_host_system_information(RESULT AVAILABLE_PHYSICAL_MEMORY QUERY AVAILABLE_PHYSICAL_MEMORY) # Not available under freebsd if(NOT AVAILABLE_PHYSICAL_MEMORY OR AVAILABLE_PHYSICAL_MEMORY GREATER 8000) option(COMPILER_PIPE "-pipe compiler option [less /tmp usage, more ram usage]" ON) From 89fbb11c3921d56b04b3a32e4bbb0271a7598b22 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 16 Sep 2019 14:13:22 +0300 Subject: [PATCH 045/102] Add testname option to stateless and stateful tests --- 
docker/test/stateful/Dockerfile | 2 +- docker/test/stateful_with_coverage/run.sh | 2 +- docker/test/stateless/Dockerfile | 2 +- docker/test/stateless_with_coverage/run.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docker/test/stateful/Dockerfile b/docker/test/stateful/Dockerfile index cab4ebbe8bb..4b2c32d0533 100644 --- a/docker/test/stateful/Dockerfile +++ b/docker/test/stateful/Dockerfile @@ -46,4 +46,4 @@ CMD dpkg -i package_folder/clickhouse-common-static_*.deb; \ && clickhouse-client --query "RENAME TABLE datasets.hits_v1 TO test.hits" \ && clickhouse-client --query "RENAME TABLE datasets.visits_v1 TO test.visits" \ && clickhouse-client --query "SHOW TABLES FROM test" \ - && clickhouse-test --shard --zookeeper --no-stateless $SKIP_TESTS_OPTION 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt + && clickhouse-test --testname --shard --zookeeper --no-stateless $SKIP_TESTS_OPTION 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt diff --git a/docker/test/stateful_with_coverage/run.sh b/docker/test/stateful_with_coverage/run.sh index d521632f98a..e5c79e90dce 100755 --- a/docker/test/stateful_with_coverage/run.sh +++ b/docker/test/stateful_with_coverage/run.sh @@ -87,7 +87,7 @@ LLVM_PROFILE_FILE='client_%h_%p_%m.profraw' clickhouse-client --query "SHOW TABL LLVM_PROFILE_FILE='client_%h_%p_%m.profraw' clickhouse-client --query "RENAME TABLE datasets.hits_v1 TO test.hits" LLVM_PROFILE_FILE='client_%h_%p_%m.profraw' clickhouse-client --query "RENAME TABLE datasets.visits_v1 TO test.visits" LLVM_PROFILE_FILE='client_%h_%p_%m.profraw' clickhouse-client --query "SHOW TABLES FROM test" -LLVM_PROFILE_FILE='client_%h_%p_%m.profraw' clickhouse-test --shard --zookeeper --no-stateless $SKIP_TESTS_OPTION 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt +LLVM_PROFILE_FILE='client_%h_%p_%m.profraw' clickhouse-test --testname --shard --zookeeper --no-stateless $SKIP_TESTS_OPTION 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt kill_clickhouse diff --git a/docker/test/stateless/Dockerfile b/docker/test/stateless/Dockerfile index 60ab18cd5f2..63080d5d667 100644 --- a/docker/test/stateless/Dockerfile +++ b/docker/test/stateless/Dockerfile @@ -55,4 +55,4 @@ CMD dpkg -i package_folder/clickhouse-common-static_*.deb; \ echo "TSAN_SYMBOLIZER_PATH=/usr/lib/llvm-8/bin/llvm-symbolizer" >> /etc/environment; \ echo "LLVM_SYMBOLIZER_PATH=/usr/lib/llvm-6.0/bin/llvm-symbolizer" >> /etc/environment; \ service zookeeper start; sleep 5; \ - service clickhouse-server start && sleep 5 && clickhouse-test --shard --zookeeper $ADDITIONAL_OPTIONS $SKIP_TESTS_OPTION 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt + service clickhouse-server start && sleep 5 && clickhouse-test --testname --shard --zookeeper $ADDITIONAL_OPTIONS $SKIP_TESTS_OPTION 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt diff --git a/docker/test/stateless_with_coverage/run.sh b/docker/test/stateless_with_coverage/run.sh index 26e230573d5..b25c1747be9 100755 --- a/docker/test/stateless_with_coverage/run.sh +++ b/docker/test/stateless_with_coverage/run.sh @@ -69,7 +69,7 @@ while /bin/true; do sleep 2 done & -LLVM_PROFILE_FILE='client_%h_%p_%m.profraw' clickhouse-test --shard --zookeeper $ADDITIONAL_OPTIONS $SKIP_TESTS_OPTION 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt +LLVM_PROFILE_FILE='client_%h_%p_%m.profraw' clickhouse-test --testname --shard --zookeeper $ADDITIONAL_OPTIONS $SKIP_TESTS_OPTION 2>&1 | ts '%Y-%m-%d %H:%M:%S' | 
tee test_output/test_result.txt kill_clickhouse From cbd96af07945712c8210cc58a7769159dba3343a Mon Sep 17 00:00:00 2001 From: chertus Date: Mon, 16 Sep 2019 15:37:46 +0300 Subject: [PATCH 046/102] fix use after free (change Join <-> AnalyzedJoin ownership) --- dbms/src/Interpreters/AnalyzedJoin.cpp | 25 ++++++----------- dbms/src/Interpreters/AnalyzedJoin.h | 9 +------ dbms/src/Interpreters/ExpressionActions.cpp | 15 ++++++----- dbms/src/Interpreters/ExpressionActions.h | 7 +++-- dbms/src/Interpreters/ExpressionAnalyzer.cpp | 14 +++++----- dbms/src/Interpreters/ExpressionAnalyzer.h | 6 +++-- dbms/src/Interpreters/IJoin.h | 3 +++ .../Interpreters/InterpreterSelectQuery.cpp | 4 +-- dbms/src/Interpreters/Join.cpp | 27 ++++++++++--------- dbms/src/Interpreters/Join.h | 8 +++--- dbms/src/Interpreters/MergeJoin.cpp | 26 +++++++++--------- dbms/src/Interpreters/MergeJoin.h | 4 +-- dbms/src/Storages/StorageJoin.cpp | 4 +-- 13 files changed, 73 insertions(+), 79 deletions(-) diff --git a/dbms/src/Interpreters/AnalyzedJoin.cpp b/dbms/src/Interpreters/AnalyzedJoin.cpp index 453b9665c4c..5da4a7219d0 100644 --- a/dbms/src/Interpreters/AnalyzedJoin.cpp +++ b/dbms/src/Interpreters/AnalyzedJoin.cpp @@ -232,23 +232,7 @@ bool AnalyzedJoin::sameJoin(const AnalyzedJoin * x, const AnalyzedJoin * y) && x->table_join.strictness == y->table_join.strictness && x->key_names_left == y->key_names_left && x->key_names_right == y->key_names_right - && x->columns_added_by_join == y->columns_added_by_join - && x->join == y->join; -} - -BlockInputStreamPtr AnalyzedJoin::createStreamWithNonJoinedDataIfFullOrRightJoin(const Block & source_header, UInt64 max_block_size) const -{ - if (isRightOrFull(table_join.kind)) - if (auto hash_join = typeid_cast(join.get())) - return hash_join->createStreamWithNonJoinedRows(source_header, max_block_size); - return {}; -} - -JoinPtr AnalyzedJoin::makeJoin(const Block & right_sample_block) const -{ - if (partial_merge_join) - return std::make_shared(*this, right_sample_block); - return std::make_shared(*this, right_sample_block); + && x->columns_added_by_join == y->columns_added_by_join; } NamesAndTypesList getNamesAndTypeListFromTableExpression(const ASTTableExpression & table_expression, const Context & context) @@ -276,4 +260,11 @@ NamesAndTypesList getNamesAndTypeListFromTableExpression(const ASTTableExpressio return names_and_type_list; } +JoinPtr makeJoin(std::shared_ptr table_join, const Block & right_sample_block) +{ + if (table_join->partial_merge_join) + return std::make_shared(table_join, right_sample_block); + return std::make_shared(table_join, right_sample_block); +} + } diff --git a/dbms/src/Interpreters/AnalyzedJoin.h b/dbms/src/Interpreters/AnalyzedJoin.h index f1957065bff..b04d55490e4 100644 --- a/dbms/src/Interpreters/AnalyzedJoin.h +++ b/dbms/src/Interpreters/AnalyzedJoin.h @@ -57,8 +57,6 @@ class AnalyzedJoin /// Original name -> name. Only ranamed columns. 
std::unordered_map renames; - JoinPtr join; - public: AnalyzedJoin(const Settings &); @@ -108,13 +106,8 @@ public: const NamesAndTypesList & columnsFromJoinedTable() const { return columns_from_joined_table; } const NamesAndTypesList & columnsAddedByJoin() const { return columns_added_by_join; } - JoinPtr getJoin() const { return join; } - void setJoin(const JoinPtr & join_) { join = join_; } - - JoinPtr makeJoin(const Block & right_sample_block) const; - BlockInputStreamPtr createStreamWithNonJoinedDataIfFullOrRightJoin(const Block & source_header, UInt64 max_block_size) const; - static bool sameJoin(const AnalyzedJoin * x, const AnalyzedJoin * y); + friend JoinPtr makeJoin(std::shared_ptr table_join, const Block & right_sample_block); }; struct ASTTableExpression; diff --git a/dbms/src/Interpreters/ExpressionActions.cpp b/dbms/src/Interpreters/ExpressionActions.cpp index 3638fba687d..a37a8cbbe26 100644 --- a/dbms/src/Interpreters/ExpressionActions.cpp +++ b/dbms/src/Interpreters/ExpressionActions.cpp @@ -160,11 +160,12 @@ ExpressionAction ExpressionAction::arrayJoin(const NameSet & array_joined_column return a; } -ExpressionAction ExpressionAction::ordinaryJoin(std::shared_ptr table_join) +ExpressionAction ExpressionAction::ordinaryJoin(std::shared_ptr table_join, JoinPtr join) { ExpressionAction a; a.type = JOIN; a.table_join = table_join; + a.join = join; return a; } @@ -475,7 +476,7 @@ void ExpressionAction::execute(Block & block, bool dry_run) const case JOIN: { - table_join->getJoin()->joinBlock(block); + join->joinBlock(block); break; } @@ -543,7 +544,7 @@ void ExpressionAction::executeOnTotals(Block & block) const if (type != JOIN) execute(block, false); else - table_join->getJoin()->joinTotals(block); + join->joinTotals(block); } @@ -763,7 +764,7 @@ void ExpressionActions::execute(Block & block, bool dry_run) const bool ExpressionActions::hasTotalsInJoin() const { for (const auto & action : actions) - if (action.table_join && action.table_join->getJoin()->hasTotals()) + if (action.table_join && action.join->hasTotals()) return true; return false; } @@ -1157,11 +1158,11 @@ void ExpressionActions::optimizeArrayJoin() } -std::shared_ptr ExpressionActions::getTableJoin() const +JoinPtr ExpressionActions::getTableJoinAlgo() const { for (const auto & action : actions) - if (action.table_join) - return action.table_join; + if (action.join) + return action.join; return {}; } diff --git a/dbms/src/Interpreters/ExpressionActions.h b/dbms/src/Interpreters/ExpressionActions.h index 20acd1a95c8..133e70d1fdb 100644 --- a/dbms/src/Interpreters/ExpressionActions.h +++ b/dbms/src/Interpreters/ExpressionActions.h @@ -21,6 +21,8 @@ namespace ErrorCodes } class AnalyzedJoin; +class IJoin; +using JoinPtr = std::shared_ptr; class IPreparedFunction; using PreparedFunctionPtr = std::shared_ptr; @@ -101,6 +103,7 @@ public: /// For JOIN std::shared_ptr table_join; + JoinPtr join; /// For PROJECT. NamesWithAliases projection; @@ -116,7 +119,7 @@ public: static ExpressionAction project(const Names & projected_columns_); static ExpressionAction addAliases(const NamesWithAliases & aliased_columns_); static ExpressionAction arrayJoin(const NameSet & array_joined_columns, bool array_join_is_left, const Context & context); - static ExpressionAction ordinaryJoin(std::shared_ptr join); + static ExpressionAction ordinaryJoin(std::shared_ptr table_join, JoinPtr join); /// Which columns necessary to perform this action. 
Names getNeededColumns() const; @@ -232,7 +235,7 @@ public: static std::string getSmallestColumn(const NamesAndTypesList & columns); - std::shared_ptr getTableJoin() const; + JoinPtr getTableJoinAlgo() const; const Settings & getSettings() const { return settings; } diff --git a/dbms/src/Interpreters/ExpressionAnalyzer.cpp b/dbms/src/Interpreters/ExpressionAnalyzer.cpp index 2a87cef1152..e4e00375829 100644 --- a/dbms/src/Interpreters/ExpressionAnalyzer.cpp +++ b/dbms/src/Interpreters/ExpressionAnalyzer.cpp @@ -408,9 +408,9 @@ bool SelectQueryExpressionAnalyzer::appendArrayJoin(ExpressionActionsChain & cha return true; } -void ExpressionAnalyzer::addJoinAction(ExpressionActionsPtr & actions) const +void ExpressionAnalyzer::addJoinAction(ExpressionActionsPtr & actions, JoinPtr join) const { - actions->add(ExpressionAction::ordinaryJoin(syntax->analyzed_join)); + actions->add(ExpressionAction::ordinaryJoin(syntax->analyzed_join, join)); } bool SelectQueryExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain, bool only_types) @@ -419,13 +419,13 @@ bool SelectQueryExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain, b if (!ast_join) return false; - makeTableJoin(*ast_join); + JoinPtr table_join = makeTableJoin(*ast_join); initChain(chain, sourceColumns()); ExpressionActionsChain::Step & step = chain.steps.back(); getRootActions(analyzedJoin().leftKeysList(), only_types, step.actions); - addJoinAction(step.actions); + addJoinAction(step.actions, table_join); return true; } @@ -465,7 +465,7 @@ static ExpressionActionsPtr createJoinedBlockActions(const Context & context, co return ExpressionAnalyzer(expression_list, syntax_result, context).getActions(true, false); } -void SelectQueryExpressionAnalyzer::makeTableJoin(const ASTTablesInSelectQueryElement & join_element) +JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin(const ASTTablesInSelectQueryElement & join_element) { /// Two JOINs are not supported with the same subquery, but different USINGs. auto join_hash = join_element.getTreeHash(); @@ -491,10 +491,10 @@ void SelectQueryExpressionAnalyzer::makeTableJoin(const ASTTablesInSelectQueryEl /// TODO You do not need to set this up when JOIN is only needed on remote servers. 
subquery_for_join.setJoinActions(joined_block_actions); /// changes subquery_for_join.sample_block inside - subquery_for_join.join = analyzedJoin().makeJoin(subquery_for_join.sample_block); + subquery_for_join.join = makeJoin(syntax->analyzed_join, subquery_for_join.sample_block); } - syntax->analyzed_join->setJoin(subquery_for_join.join); + return subquery_for_join.join; } void SelectQueryExpressionAnalyzer::makeSubqueryForJoin(const ASTTablesInSelectQueryElement & join_element, diff --git a/dbms/src/Interpreters/ExpressionAnalyzer.h b/dbms/src/Interpreters/ExpressionAnalyzer.h index 33c974e29d9..2dfc8909d02 100644 --- a/dbms/src/Interpreters/ExpressionAnalyzer.h +++ b/dbms/src/Interpreters/ExpressionAnalyzer.h @@ -20,6 +20,8 @@ class ExpressionActions; using ExpressionActionsPtr = std::shared_ptr; struct ASTTableJoin; +class IJoin; +using JoinPtr = std::shared_ptr; class ASTFunction; class ASTExpressionList; @@ -123,7 +125,7 @@ protected: void addMultipleArrayJoinAction(ExpressionActionsPtr & actions, bool is_left) const; - void addJoinAction(ExpressionActionsPtr & actions) const; + void addJoinAction(ExpressionActionsPtr & actions, JoinPtr = {}) const; void getRootActions(const ASTPtr & ast, bool no_subqueries, ExpressionActionsPtr & actions, bool only_consts = false); @@ -215,7 +217,7 @@ private: */ void tryMakeSetForIndexFromSubquery(const ASTPtr & subquery_or_table_name); - void makeTableJoin(const ASTTablesInSelectQueryElement & join_element); + JoinPtr makeTableJoin(const ASTTablesInSelectQueryElement & join_element); void makeSubqueryForJoin(const ASTTablesInSelectQueryElement & join_element, NamesWithAliases && required_columns_with_aliases, SubqueryForSet & subquery_for_set) const; diff --git a/dbms/src/Interpreters/IJoin.h b/dbms/src/Interpreters/IJoin.h index e716a1335bb..990e88a51f6 100644 --- a/dbms/src/Interpreters/IJoin.h +++ b/dbms/src/Interpreters/IJoin.h @@ -5,6 +5,7 @@ #include #include +#include namespace DB { @@ -32,6 +33,8 @@ public: virtual void joinTotals(Block & block) const = 0; virtual size_t getTotalRowCount() const = 0; + + virtual BlockInputStreamPtr createStreamWithNonJoinedRows(const Block &, UInt64) const { return {}; } }; using JoinPtr = std::shared_ptr; diff --git a/dbms/src/Interpreters/InterpreterSelectQuery.cpp b/dbms/src/Interpreters/InterpreterSelectQuery.cpp index a795fac596d..22431438781 100644 --- a/dbms/src/Interpreters/InterpreterSelectQuery.cpp +++ b/dbms/src/Interpreters/InterpreterSelectQuery.cpp @@ -1117,9 +1117,9 @@ void InterpreterSelectQuery::executeImpl(TPipeline & pipeline, const BlockInputS stream = std::make_shared(stream, expressions.before_join); } - if (auto join = expressions.before_join->getTableJoin()) + if (JoinPtr join = expressions.before_join->getTableJoinAlgo()) { - if (auto stream = join->createStreamWithNonJoinedDataIfFullOrRightJoin(header_before_join, settings.max_block_size)) + if (auto stream = join->createStreamWithNonJoinedRows(header_before_join, settings.max_block_size)) { if constexpr (pipeline_with_processors) { diff --git a/dbms/src/Interpreters/Join.cpp b/dbms/src/Interpreters/Join.cpp index 88863d819c3..a0eb8da3ce9 100644 --- a/dbms/src/Interpreters/Join.cpp +++ b/dbms/src/Interpreters/Join.cpp @@ -60,14 +60,14 @@ static ColumnWithTypeAndName correctNullability(ColumnWithTypeAndName && column, } -Join::Join(const AnalyzedJoin & join_options_, const Block & right_sample_block, bool any_take_last_row_) - : join_options(join_options_) - , kind(join_options_.kind()) - , strictness(join_options_.strictness()) - 
, key_names_right(join_options_.keyNamesRight()) - , required_right_keys(join_options_.requiredRightKeys()) - , nullable_right_side(join_options_.forceNullabelRight()) - , nullable_left_side(join_options_.forceNullabelLeft()) +Join::Join(std::shared_ptr table_join_, const Block & right_sample_block, bool any_take_last_row_) + : table_join(table_join_) + , kind(table_join->kind()) + , strictness(table_join->strictness()) + , key_names_right(table_join->keyNamesRight()) + , required_right_keys(table_join->requiredRightKeys()) + , nullable_right_side(table_join->forceNullabelRight()) + , nullable_left_side(table_join->forceNullabelLeft()) , any_take_last_row(any_take_last_row_) , log(&Logger::get("Join")) { @@ -493,7 +493,7 @@ bool Join::addJoinedBlock(const Block & block) blocks_nullmaps.emplace_back(stored_block, null_map_holder); } - return join_options.sizeLimits().check(getTotalRowCount(), getTotalByteCount(), "JOIN", ErrorCodes::SET_SIZE_LIMIT_EXCEEDED); + return table_join->sizeLimits().check(getTotalRowCount(), getTotalByteCount(), "JOIN", ErrorCodes::SET_SIZE_LIMIT_EXCEEDED); } @@ -937,7 +937,7 @@ void Join::joinBlock(Block & block) { std::shared_lock lock(rwlock); - const Names & key_names_left = join_options.keyNamesLeft(); + const Names & key_names_left = table_join->keyNamesLeft(); JoinCommon::checkTypesOfKeys(block, key_names_left, right_table_keys, key_names_right); if (joinDispatch(kind, strictness, maps, [&](auto kind_, auto strictness_, auto & map) @@ -1043,7 +1043,7 @@ public: : parent(parent_) , max_block_size(max_block_size_) { - const Names & key_names_left = parent_.join_options.keyNamesLeft(); + const Names & key_names_left = parent_.table_join->keyNamesLeft(); /** left_sample_block contains keys and "left" columns. * result_sample_block - keys, "left" columns, and "right" columns. @@ -1337,8 +1337,9 @@ private: BlockInputStreamPtr Join::createStreamWithNonJoinedRows(const Block & left_sample_block, UInt64 max_block_size) const { - return std::make_shared(*this, left_sample_block, max_block_size); + if (isRightOrFull(table_join->kind())) + return std::make_shared(*this, left_sample_block, max_block_size); + return {}; } - } diff --git a/dbms/src/Interpreters/Join.h b/dbms/src/Interpreters/Join.h index 3043d253460..424512266fb 100644 --- a/dbms/src/Interpreters/Join.h +++ b/dbms/src/Interpreters/Join.h @@ -124,7 +124,7 @@ using MappedAsof = WithFlags; class Join : public IJoin { public: - Join(const AnalyzedJoin & join_options, const Block & right_sample_block, bool any_take_last_row_ = false); + Join(std::shared_ptr table_join_, const Block & right_sample_block, bool any_take_last_row_ = false); bool empty() { return type == Type::EMPTY; } @@ -156,7 +156,7 @@ public: * Use only after all calls to joinBlock was done. * left_sample_block is passed without account of 'use_nulls' setting (columns will be converted to Nullable inside). */ - BlockInputStreamPtr createStreamWithNonJoinedRows(const Block & left_sample_block, UInt64 max_block_size) const; + BlockInputStreamPtr createStreamWithNonJoinedRows(const Block & left_sample_block, UInt64 max_block_size) const override; /// Number of keys in all built JOIN maps. 
size_t getTotalRowCount() const override; @@ -274,12 +274,12 @@ private: friend class NonJoinedBlockInputStream; friend class JoinBlockInputStream; - const AnalyzedJoin & join_options; + std::shared_ptr table_join; ASTTableJoin::Kind kind; ASTTableJoin::Strictness strictness; /// Names of key columns in right-side table (in the order they appear in ON/USING clause). @note It could contain duplicates. - const Names key_names_right; + const Names & key_names_right; /// Names right-side table keys that are needed in result (would be attached after joined columns). const NameSet required_right_keys; diff --git a/dbms/src/Interpreters/MergeJoin.cpp b/dbms/src/Interpreters/MergeJoin.cpp index ea512697ddb..adb5b4625ee 100644 --- a/dbms/src/Interpreters/MergeJoin.cpp +++ b/dbms/src/Interpreters/MergeJoin.cpp @@ -116,16 +116,16 @@ static void makeSortAndMerge(const Names & keys, SortDescription & sort, SortDes } -MergeJoin::MergeJoin(const AnalyzedJoin & table_join_, const Block & right_sample_block) +MergeJoin::MergeJoin(std::shared_ptr table_join_, const Block & right_sample_block) : table_join(table_join_) - , nullable_right_side(table_join_.forceNullabelRight()) + , nullable_right_side(table_join->forceNullabelRight()) { - if (!isLeft(table_join.kind()) && !isInner(table_join.kind())) + if (!isLeft(table_join->kind()) && !isInner(table_join->kind())) throw Exception("Partial merge supported for LEFT and INNER JOINs only", ErrorCodes::NOT_IMPLEMENTED); - JoinCommon::extractKeysForJoin(table_join.keyNamesRight(), right_sample_block, right_table_keys, right_columns_to_add); + JoinCommon::extractKeysForJoin(table_join->keyNamesRight(), right_sample_block, right_table_keys, right_columns_to_add); - const NameSet required_right_keys = table_join.requiredRightKeys(); + const NameSet required_right_keys = table_join->requiredRightKeys(); for (const auto & column : right_table_keys) if (required_right_keys.count(column.name)) right_columns_to_add.insert(ColumnWithTypeAndName{nullptr, column.type, column.name}); @@ -135,8 +135,8 @@ MergeJoin::MergeJoin(const AnalyzedJoin & table_join_, const Block & right_sampl if (nullable_right_side) JoinCommon::convertColumnsToNullable(right_columns_to_add); - makeSortAndMerge(table_join.keyNamesLeft(), left_sort_description, left_merge_description); - makeSortAndMerge(table_join.keyNamesRight(), right_sort_description, right_merge_description); + makeSortAndMerge(table_join->keyNamesLeft(), left_sort_description, left_merge_description); + makeSortAndMerge(table_join->keyNamesRight(), right_sort_description, right_merge_description); } void MergeJoin::setTotals(const Block & totals_block) @@ -173,17 +173,17 @@ bool MergeJoin::addJoinedBlock(const Block & src_block) right_blocks_row_count += block.rows(); right_blocks_bytes += block.bytes(); - return table_join.sizeLimits().check(right_blocks_row_count, right_blocks_bytes, "JOIN", ErrorCodes::SET_SIZE_LIMIT_EXCEEDED); + return table_join->sizeLimits().check(right_blocks_row_count, right_blocks_bytes, "JOIN", ErrorCodes::SET_SIZE_LIMIT_EXCEEDED); } void MergeJoin::joinBlock(Block & block) { - JoinCommon::checkTypesOfKeys(block, table_join.keyNamesLeft(), right_table_keys, table_join.keyNamesRight()); + JoinCommon::checkTypesOfKeys(block, table_join->keyNamesLeft(), right_table_keys, table_join->keyNamesRight()); sortBlock(block, left_sort_description); std::shared_lock lock(rwlock); - if (isLeft(table_join.kind())) + if (isLeft(table_join->kind())) { MutableColumns right_columns = 
makeMutableColumns(right_columns_to_add); @@ -197,7 +197,7 @@ void MergeJoin::joinBlock(Block & block) appendRightColumns(block, std::move(right_columns)); } - else if (isInner(table_join.kind())) + else if (isInner(table_join->kind())) { MutableColumns left_columns = makeMutableColumns(block); MutableColumns right_columns = makeMutableColumns(right_columns_to_add); @@ -288,7 +288,7 @@ void MergeJoin::appendRightNulls(MutableColumns & right_columns, size_t rows_to_ void MergeJoin::leftJoinEquals(const Block & right_block, MutableColumns & right_columns, const Range & range) { - bool any = table_join.strictness() == ASTTableJoin::Strictness::Any; + bool any = table_join->strictness() == ASTTableJoin::Strictness::Any; size_t left_rows_to_insert = range.left_length; size_t right_rows_to_insert = any ? 1 : range.right_length; @@ -310,7 +310,7 @@ void MergeJoin::leftJoinEquals(const Block & right_block, MutableColumns & right void MergeJoin::innerJoinEquals(const Block & left_block, const Block & right_block, MutableColumns & left_columns, MutableColumns & right_columns, const Range & range) { - bool any = table_join.strictness() == ASTTableJoin::Strictness::Any; + bool any = table_join->strictness() == ASTTableJoin::Strictness::Any; size_t left_rows_to_insert = range.left_length; size_t right_rows_to_insert = any ? 1 : range.right_length; diff --git a/dbms/src/Interpreters/MergeJoin.h b/dbms/src/Interpreters/MergeJoin.h index 26423395421..393f850a45e 100644 --- a/dbms/src/Interpreters/MergeJoin.h +++ b/dbms/src/Interpreters/MergeJoin.h @@ -18,7 +18,7 @@ struct MergeJoinEqualRange; class MergeJoin : public IJoin { public: - MergeJoin(const AnalyzedJoin & table_join_, const Block & right_sample_block); + MergeJoin(std::shared_ptr table_join_, const Block & right_sample_block); bool addJoinedBlock(const Block & block) override; void joinBlock(Block &) override; @@ -28,7 +28,7 @@ public: private: mutable std::shared_mutex rwlock; - const AnalyzedJoin & table_join; + std::shared_ptr table_join; SortDescription left_sort_description; SortDescription right_sort_description; SortDescription left_merge_description; diff --git a/dbms/src/Storages/StorageJoin.cpp b/dbms/src/Storages/StorageJoin.cpp index 901dd7700d3..d3f41f3b0b5 100644 --- a/dbms/src/Storages/StorageJoin.cpp +++ b/dbms/src/Storages/StorageJoin.cpp @@ -51,7 +51,7 @@ StorageJoin::StorageJoin( throw Exception{"Key column (" + key + ") does not exist in table declaration.", ErrorCodes::NO_SUCH_COLUMN_IN_TABLE}; table_join = std::make_shared(limits, use_nulls, kind, strictness, key_names); - join = std::make_shared(*table_join, getSampleBlock().sortColumns(), overwrite); + join = std::make_shared(table_join, getSampleBlock().sortColumns(), overwrite); restore(); } @@ -63,7 +63,7 @@ void StorageJoin::truncate(const ASTPtr &, const Context &, TableStructureWriteL Poco::File(path + "tmp/").createDirectories(); increment = 0; - join = std::make_shared(*table_join, getSampleBlock().sortColumns()); + join = std::make_shared(table_join, getSampleBlock().sortColumns()); } From 1cf47da60d98f0f6aa7c9187e3123e697cefae7e Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Fri, 13 Sep 2019 23:23:15 +0800 Subject: [PATCH 047/102] ubsan/memsan fix. 
---
 dbms/src/Common/HashTable/HashTable.h | 14 ++++++++++++--
 .../01011_group_uniq_array_memsan.reference | 1 +
 .../0_stateless/01011_group_uniq_array_memsan.sql | 1 +
 3 files changed, 14 insertions(+), 2 deletions(-)
 create mode 100644 dbms/tests/queries/0_stateless/01011_group_uniq_array_memsan.reference
 create mode 100644 dbms/tests/queries/0_stateless/01011_group_uniq_array_memsan.sql

diff --git a/dbms/src/Common/HashTable/HashTable.h b/dbms/src/Common/HashTable/HashTable.h
index dadc73c3f44..ade51969bea 100644
--- a/dbms/src/Common/HashTable/HashTable.h
+++ b/dbms/src/Common/HashTable/HashTable.h
@@ -224,8 +224,18 @@ private:
 public:
     bool hasZero() const { return has_zero; }
-    void setHasZero() { has_zero = true; }
-    void clearHasZero() { has_zero = false; }
+
+    void setHasZero()
+    {
+        has_zero = true;
+        new (zeroValue()) Cell();
+    }
+
+    void clearHasZero()
+    {
+        has_zero = false;
+        zeroValue()->~Cell();
+    }

     Cell * zeroValue() { return reinterpret_cast(&zero_value_storage); }
     const Cell * zeroValue() const { return reinterpret_cast(&zero_value_storage); }

diff --git a/dbms/tests/queries/0_stateless/01011_group_uniq_array_memsan.reference b/dbms/tests/queries/0_stateless/01011_group_uniq_array_memsan.reference
new file mode 100644
index 00000000000..b7c55c59479
--- /dev/null
+++ b/dbms/tests/queries/0_stateless/01011_group_uniq_array_memsan.reference
@@ -0,0 +1 @@
+[[],[2]]
diff --git a/dbms/tests/queries/0_stateless/01011_group_uniq_array_memsan.sql b/dbms/tests/queries/0_stateless/01011_group_uniq_array_memsan.sql
new file mode 100644
index 00000000000..b8c16e48c42
--- /dev/null
+++ b/dbms/tests/queries/0_stateless/01011_group_uniq_array_memsan.sql
@@ -0,0 +1 @@
+select groupUniqArray(v) from values('id int, v Array(int)', (1, [2]), (1, [])) group by id;

From 6f68cfc45c2397f92aa53f15a2bf26112512ee27 Mon Sep 17 00:00:00 2001
From: Dmitry Rubashkin
Date: Mon, 16 Sep 2019 18:44:12 +0300
Subject: [PATCH 048/102] Removed (most probably) redundant if branch. Fixed VALUES table function.

--- dbms/src/Interpreters/convertFieldToType.cpp | 3 --- dbms/src/TableFunctions/TableFunctionValues.cpp | 5 +++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/dbms/src/Interpreters/convertFieldToType.cpp b/dbms/src/Interpreters/convertFieldToType.cpp index 7ecbf497704..3104d612e43 100644 --- a/dbms/src/Interpreters/convertFieldToType.cpp +++ b/dbms/src/Interpreters/convertFieldToType.cpp @@ -187,9 +187,6 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID return dynamic_cast(type).castToValue(src); } - if (src.getType() == Field::Types::UInt64) - return src; - if (src.getType() == Field::Types::String) { if (which_type.isDate()) diff --git a/dbms/src/TableFunctions/TableFunctionValues.cpp b/dbms/src/TableFunctions/TableFunctionValues.cpp index ecbe386c382..e5e9eb8a860 100644 --- a/dbms/src/TableFunctions/TableFunctionValues.cpp +++ b/dbms/src/TableFunctions/TableFunctionValues.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -32,7 +33,6 @@ static void parseAndInsertValues(MutableColumns & res_columns, const ASTs & args for (size_t i = 1; i < args.size(); ++i) { const auto & [value_field, value_type_ptr] = evaluateConstantExpression(args[i], context); - Field value = convertFieldToType(value_field, *sample_block.getByPosition(0).type, value_type_ptr.get()); res_columns[0]->insert(value); } @@ -42,6 +42,7 @@ static void parseAndInsertValues(MutableColumns & res_columns, const ASTs & args for (size_t i = 1; i < args.size(); ++i) { const auto & [value_field, value_type_ptr] = evaluateConstantExpression(args[i], context); + const DataTypes & value_types_tuple = typeid_cast(value_type_ptr.get())->getElements(); const TupleBackend & value_tuple = value_field.safeGet().toUnderType(); if (value_tuple.size() != sample_block.columns()) @@ -49,7 +50,7 @@ static void parseAndInsertValues(MutableColumns & res_columns, const ASTs & args for (size_t j = 0; j < value_tuple.size(); ++j) { - Field value = convertFieldToType(value_tuple[j], *sample_block.getByPosition(j).type, value_type_ptr.get()); + Field value = convertFieldToType(value_tuple[j], *sample_block.getByPosition(j).type, value_types_tuple[j].get()); res_columns[j]->insert(value); } } From 6fa925c263ecb0502dfaeb4d3349e63ad6446b3f Mon Sep 17 00:00:00 2001 From: Dmitry Rubashkin Date: Mon, 16 Sep 2019 18:47:13 +0300 Subject: [PATCH 049/102] Typo. 
--- dbms/src/TableFunctions/TableFunctionValues.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/dbms/src/TableFunctions/TableFunctionValues.cpp b/dbms/src/TableFunctions/TableFunctionValues.cpp index e5e9eb8a860..30a423a3384 100644 --- a/dbms/src/TableFunctions/TableFunctionValues.cpp +++ b/dbms/src/TableFunctions/TableFunctionValues.cpp @@ -33,6 +33,7 @@ static void parseAndInsertValues(MutableColumns & res_columns, const ASTs & args for (size_t i = 1; i < args.size(); ++i) { const auto & [value_field, value_type_ptr] = evaluateConstantExpression(args[i], context); + Field value = convertFieldToType(value_field, *sample_block.getByPosition(0).type, value_type_ptr.get()); res_columns[0]->insert(value); } From da5d35b34e312c7975576039beb3415d168d83c3 Mon Sep 17 00:00:00 2001 From: chertus Date: Mon, 16 Sep 2019 22:31:22 +0300 Subject: [PATCH 050/102] partial merge join (minimal tested version) --- dbms/src/Interpreters/MergeJoin.cpp | 218 ++++++++++-------- dbms/src/Interpreters/MergeJoin.h | 17 +- .../01010_partial_merge_join.reference | 124 ++++++++++ .../0_stateless/01010_partial_merge_join.sql | 124 ++++++++++ 4 files changed, 380 insertions(+), 103 deletions(-) create mode 100644 dbms/tests/queries/0_stateless/01010_partial_merge_join.reference create mode 100644 dbms/tests/queries/0_stateless/01010_partial_merge_join.sql diff --git a/dbms/src/Interpreters/MergeJoin.cpp b/dbms/src/Interpreters/MergeJoin.cpp index adb5b4625ee..12c782af205 100644 --- a/dbms/src/Interpreters/MergeJoin.cpp +++ b/dbms/src/Interpreters/MergeJoin.cpp @@ -36,6 +36,7 @@ public: {} size_t position() const { return impl.pos; } + size_t end() const { return impl.rows; } bool atEnd() const { return impl.pos >= impl.rows; } void nextN(size_t num) { impl.pos += num; } @@ -100,7 +101,20 @@ private: SortCursorImpl impl; }; -static void makeSortAndMerge(const Names & keys, SortDescription & sort, SortDescription & merge) +namespace +{ + +MutableColumns makeMutableColumns(const Block & block) +{ + MutableColumns columns; + columns.reserve(block.columns()); + + for (const auto & src_column : block) + columns.push_back(src_column.column->cloneEmpty()); + return columns; +} + +void makeSortAndMerge(const Names & keys, SortDescription & sort, SortDescription & merge) { NameSet unique_keys; for (auto & key_name : keys) @@ -115,10 +129,80 @@ static void makeSortAndMerge(const Names & keys, SortDescription & sort, SortDes } } +void copyLeftRange(const Block & block, MutableColumns & columns, size_t start, size_t rows_to_add) +{ + for (size_t i = 0; i < block.columns(); ++i) + { + const auto & src_column = block.getByPosition(i); + auto & dst_column = columns[i]; + + size_t row_pos = start; + for (size_t row = 0; row < rows_to_add; ++row, ++row_pos) + dst_column->insertFrom(*src_column.column, row_pos); + } +} + +void copyRightRange(const Block & right_block, const Block & right_columns_to_add, MutableColumns & columns, + size_t row_position, size_t rows_to_add) +{ + for (size_t i = 0; i < right_columns_to_add.columns(); ++i) + { + const auto & src_column = right_block.getByName(right_columns_to_add.getByPosition(i).name); + auto & dst_column = columns[i]; + + for (size_t row = 0; row < rows_to_add; ++row) + dst_column->insertFrom(*src_column.column, row_position); + } +} + +void joinEqualsAnyLeft(const Block & right_block, const Block & right_columns_to_add, MutableColumns & right_columns, const Range & range) +{ + copyRightRange(right_block, right_columns_to_add, right_columns, range.right_start, range.left_length); +} + 
+void joinEquals(const Block & left_block, const Block & right_block, const Block & right_columns_to_add, + MutableColumns & left_columns, MutableColumns & right_columns, const Range & range, bool is_all) +{ + size_t left_rows_to_add = range.left_length; + size_t right_rows_to_add = is_all ? range.right_length : 1; + + size_t row_position = range.right_start; + for (size_t right_row = 0; right_row < right_rows_to_add; ++right_row, ++row_position) + { + copyLeftRange(left_block, left_columns, range.left_start, left_rows_to_add); + copyRightRange(right_block, right_columns_to_add, right_columns, row_position, left_rows_to_add); + } +} + +void appendNulls(MutableColumns & right_columns, size_t rows_to_add) +{ + for (auto & column : right_columns) + for (size_t i = 0; i < rows_to_add; ++i) + column->insertDefault(); +} + +void joinInequalsLeft(const Block & left_block, MutableColumns & left_columns, MutableColumns & right_columns, + size_t start, size_t end, bool copy_left) +{ + if (end <= start) + return; + + size_t rows_to_add = end - start; + if (copy_left) + copyLeftRange(left_block, left_columns, start, rows_to_add); + appendNulls(right_columns, rows_to_add); +} + +} + + MergeJoin::MergeJoin(std::shared_ptr table_join_, const Block & right_sample_block) : table_join(table_join_) , nullable_right_side(table_join->forceNullabelRight()) + , is_all(table_join->strictness() == ASTTableJoin::Strictness::All) + , is_inner(isInner(table_join->kind())) + , is_left(isLeft(table_join->kind())) { if (!isLeft(table_join->kind()) && !isInner(table_join->kind())) throw Exception("Partial merge supported for LEFT and INNER JOINs only", ErrorCodes::NOT_IMPLEMENTED); @@ -149,6 +233,9 @@ void MergeJoin::mergeRightBlocks() { const size_t max_merged_block_size = 128 * 1024 * 1024; + if (right_blocks.empty()) + return; + Blocks unsorted_blocks; unsorted_blocks.reserve(right_blocks.size()); for (const auto & block : right_blocks) @@ -183,26 +270,27 @@ void MergeJoin::joinBlock(Block & block) std::shared_lock lock(rwlock); - if (isLeft(table_join->kind())) - { - MutableColumns right_columns = makeMutableColumns(right_columns_to_add); + MutableColumns left_columns = makeMutableColumns(block); + MutableColumns right_columns = makeMutableColumns(right_columns_to_add); + MergeJoinCursor left_cursor(block, left_merge_description); - MergeJoinCursor left_cursor(block, left_merge_description); + if (is_left) + { for (auto it = right_blocks.begin(); it != right_blocks.end(); ++it) { if (left_cursor.atEnd()) break; - leftJoin(left_cursor, *it, right_columns); + leftJoin(left_cursor, block, *it, left_columns, right_columns); } - appendRightColumns(block, std::move(right_columns)); - } - else if (isInner(table_join->kind())) - { - MutableColumns left_columns = makeMutableColumns(block); - MutableColumns right_columns = makeMutableColumns(right_columns_to_add); + joinInequalsLeft(block, left_columns, right_columns, left_cursor.position(), left_cursor.end(), is_all); + //left_cursor.nextN(left_cursor.end() - left_cursor.position()); - MergeJoinCursor left_cursor(block, left_merge_description); + changeLeftColumns(block, std::move(left_columns)); + addRightColumns(block, std::move(right_columns)); + } + else if (is_inner) + { for (auto it = right_blocks.begin(); it != right_blocks.end(); ++it) { if (left_cursor.atEnd()) @@ -210,32 +298,36 @@ void MergeJoin::joinBlock(Block & block) innerJoin(left_cursor, block, *it, left_columns, right_columns); } - block.clear(); - appendRightColumns(block, std::move(left_columns)); - 
appendRightColumns(block, std::move(right_columns)); + changeLeftColumns(block, std::move(left_columns)); + addRightColumns(block, std::move(right_columns)); } } -void MergeJoin::leftJoin(MergeJoinCursor & left_cursor, const Block & right_block, MutableColumns & right_columns) +void MergeJoin::leftJoin(MergeJoinCursor & left_cursor, const Block & left_block, const Block & right_block, + MutableColumns & left_columns, MutableColumns & right_columns) { MergeJoinCursor right_cursor(right_block, right_merge_description); while (!left_cursor.atEnd() && !right_cursor.atEnd()) { - size_t left_position = left_cursor.position(); + size_t left_position = left_cursor.position(); /// save inequal position Range range = left_cursor.getNextEqualRange(right_cursor); - if (left_position < range.left_start) - appendRightNulls(right_columns, range.left_start - left_position); + joinInequalsLeft(left_block, left_columns, right_columns, left_position, range.left_start, is_all); if (range.empty()) break; - leftJoinEquals(right_block, right_columns, range); + if (is_all) + joinEquals(left_block, right_block, right_columns_to_add, left_columns, right_columns, range, is_all); + else + joinEqualsAnyLeft(right_block, right_columns_to_add, right_columns, range); + right_cursor.nextN(range.right_length); - /// TODO: Do not run over last left keys for ALL JOIN (cause of possible duplicates in next right block) - //if (!right_cursor.atEnd()) + /// Do not run over last left keys for ALL JOIN (cause of possible duplicates in next right block) + if (is_all && right_cursor.atEnd()) + break; left_cursor.nextN(range.left_length); } } @@ -251,26 +343,24 @@ void MergeJoin::innerJoin(MergeJoinCursor & left_cursor, const Block & left_bloc if (range.empty()) break; - innerJoinEquals(left_block, right_block, left_columns, right_columns, range); + joinEquals(left_block, right_block, right_columns_to_add, left_columns, right_columns, range, is_all); right_cursor.nextN(range.right_length); - /// TODO: Do not run over last left keys for ALL JOIN (cause of possible duplicates in next right block) - //if (!right_cursor.atEnd()) + /// Do not run over last left keys for ALL JOIN (cause of possible duplicates in next right block) + if (is_all && right_cursor.atEnd()) + break; left_cursor.nextN(range.left_length); } } -MutableColumns MergeJoin::makeMutableColumns(const Block & block) +void MergeJoin::changeLeftColumns(Block & block, MutableColumns && columns) { - MutableColumns columns; - columns.reserve(block.columns()); - - for (const auto & src_column : block) - columns.push_back(src_column.column->cloneEmpty()); - return columns; + if (is_left && !is_all) + return; + block.setColumns(std::move(columns)); } -void MergeJoin::appendRightColumns(Block & block, MutableColumns && right_columns) +void MergeJoin::addRightColumns(Block & block, MutableColumns && right_columns) { for (size_t i = 0; i < right_columns_to_add.columns(); ++i) { @@ -279,64 +369,4 @@ void MergeJoin::appendRightColumns(Block & block, MutableColumns && right_column } } -void MergeJoin::appendRightNulls(MutableColumns & right_columns, size_t rows_to_add) -{ - for (auto & column : right_columns) - for (size_t i = 0; i < rows_to_add; ++i) - column->insertDefault(); -} - -void MergeJoin::leftJoinEquals(const Block & right_block, MutableColumns & right_columns, const Range & range) -{ - bool any = table_join->strictness() == ASTTableJoin::Strictness::Any; - - size_t left_rows_to_insert = range.left_length; - size_t right_rows_to_insert = any ? 
1 : range.right_length; - - size_t row_position = range.right_start; - for (size_t right_row = 0; right_row < right_rows_to_insert; ++right_row, ++row_position) - { - for (size_t i = 0; i < right_columns_to_add.columns(); ++i) - { - const auto & src_column = right_block.getByName(right_columns_to_add.getByPosition(i).name); - auto & dst_column = right_columns[i]; - - for (size_t left_row = 0; left_row < left_rows_to_insert; ++left_row) - dst_column->insertFrom(*src_column.column, row_position); - } - } -} - -void MergeJoin::innerJoinEquals(const Block & left_block, const Block & right_block, - MutableColumns & left_columns, MutableColumns & right_columns, const Range & range) -{ - bool any = table_join->strictness() == ASTTableJoin::Strictness::Any; - - size_t left_rows_to_insert = range.left_length; - size_t right_rows_to_insert = any ? 1 : range.right_length; - - size_t row_position = range.right_start; - for (size_t right_row = 0; right_row < right_rows_to_insert; ++right_row, ++row_position) - { - for (size_t i = 0; i < left_block.columns(); ++i) - { - const auto & src_column = left_block.getByPosition(i); - auto & dst_column = left_columns[i]; - - size_t row_pos = range.left_start; - for (size_t row = 0; row < left_rows_to_insert; ++row, ++row_pos) - dst_column->insertFrom(*src_column.column, row_pos); - } - - for (size_t i = 0; i < right_columns_to_add.columns(); ++i) - { - const auto & src_column = right_block.getByName(right_columns_to_add.getByPosition(i).name); - auto & dst_column = right_columns[i]; - - for (size_t row = 0; row < left_rows_to_insert; ++row) - dst_column->insertFrom(*src_column.column, row_position); - } - } -} - } diff --git a/dbms/src/Interpreters/MergeJoin.h b/dbms/src/Interpreters/MergeJoin.h index 393f850a45e..dbc9191b70d 100644 --- a/dbms/src/Interpreters/MergeJoin.h +++ b/dbms/src/Interpreters/MergeJoin.h @@ -37,22 +37,21 @@ private: Block right_columns_to_add; BlocksList right_blocks; Block totals; - bool nullable_right_side; size_t right_blocks_row_count = 0; size_t right_blocks_bytes = 0; + const bool nullable_right_side; + const bool is_all; + const bool is_inner; + const bool is_left; - MutableColumns makeMutableColumns(const Block & block); - void appendRightColumns(Block & block, MutableColumns && right_columns); + void changeLeftColumns(Block & block, MutableColumns && columns); + void addRightColumns(Block & block, MutableColumns && columns); void mergeRightBlocks(); - void leftJoin(MergeJoinCursor & left_cursor, const Block & right_block, MutableColumns & right_columns); + void leftJoin(MergeJoinCursor & left_cursor, const Block & left_block, const Block & right_block, + MutableColumns & left_columns, MutableColumns & right_columns); void innerJoin(MergeJoinCursor & left_cursor, const Block & left_block, const Block & right_block, MutableColumns & left_columns, MutableColumns & right_columns); - - void appendRightNulls(MutableColumns & right_columns, size_t rows_to_add); - void leftJoinEquals(const Block & right_block, MutableColumns & right_columns, const MergeJoinEqualRange & range); - void innerJoinEquals(const Block & left_block, const Block & right_block, - MutableColumns & left_columns, MutableColumns & right_columns, const MergeJoinEqualRange & range); }; } diff --git a/dbms/tests/queries/0_stateless/01010_partial_merge_join.reference b/dbms/tests/queries/0_stateless/01010_partial_merge_join.reference new file mode 100644 index 00000000000..ee418e73b65 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01010_partial_merge_join.reference @@ 
-0,0 +1,124 @@ +t join none using +0 0 0 +- +0 0 0 +- +- +t join none on +0 0 0 0 +- +0 0 0 0 +- +- +none join t using +none join t on +/none +t join none using +0 0 \N +- +0 0 \N +- +- +t join none on +0 0 \N \N +- +0 0 \N \N +- +- +none join t using +none join t on +/none +any left +0 0 0 +1 10 0 +2 20 2 +3 30 0 +4 40 4 +- +0 0 0 +1 10 0 +2 20 0 +3 30 0 +4 40 0 +- +0 0 0 +1 10 0 +2 20 2 +3 30 0 +4 40 4 +- +0 0 0 +1 10 0 +2 20 0 +3 30 0 +4 40 0 +all left +0 0 0 0 +1 10 0 0 +2 20 2 21 +2 20 2 22 +3 30 0 0 +4 40 4 41 +4 40 4 42 +- +0 0 0 0 +1 10 0 0 +2 20 0 0 +3 30 0 0 +4 40 0 0 +- +0 0 0 0 +1 10 0 0 +2 20 0 0 +3 30 0 0 +4 40 0 0 +- +0 0 0 0 +1 10 0 0 +2 20 2 21 +2 20 2 22 +3 30 0 0 +4 40 4 41 +4 40 4 42 +- +0 0 0 0 +1 10 0 0 +2 20 2 21 +2 20 2 22 +3 30 0 0 +4 40 4 41 +4 40 4 42 +any inner +0 0 0 +2 20 2 +4 40 4 +- +0 0 0 +- +0 0 0 +2 20 2 +4 40 4 +- +0 0 0 +all inner +0 0 0 0 +2 20 2 21 +2 20 2 22 +4 40 4 41 +4 40 4 42 +- +0 0 0 0 +- +0 0 0 0 +- +0 0 0 0 +2 20 2 21 +2 20 2 22 +4 40 4 41 +4 40 4 42 +- +0 0 0 0 +2 20 2 21 +2 20 2 22 +4 40 4 41 +4 40 4 42 diff --git a/dbms/tests/queries/0_stateless/01010_partial_merge_join.sql b/dbms/tests/queries/0_stateless/01010_partial_merge_join.sql new file mode 100644 index 00000000000..6db701f6197 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01010_partial_merge_join.sql @@ -0,0 +1,124 @@ +DROP TABLE IF EXISTS t0; +DROP TABLE IF EXISTS t1; +DROP TABLE IF EXISTS t2; + +CREATE TABLE t0 (x UInt32, y UInt64) engine = MergeTree ORDER BY (x,y); +CREATE TABLE t1 (x UInt32, y UInt64) engine = MergeTree ORDER BY (x,y); +CREATE TABLE t2 (x UInt32, y UInt64) engine = MergeTree ORDER BY (x,y); + +INSERT INTO t1 (x, y) VALUES (0, 0); + +SET partial_merge_join = 1; +SET any_join_distinct_right_table_keys = 1; + +SELECT 't join none using'; +SELECT * FROM t1 ANY LEFT JOIN t0 USING (x) ORDER BY x; +SELECT '-'; +SELECT * FROM t1 LEFT JOIN t0 USING (x) ORDER BY x; +SELECT '-'; +SELECT * FROM t1 ANY INNER JOIN t0 USING (x) ORDER BY x; +SELECT '-'; +SELECT * FROM t1 INNER JOIN t0 USING (x) ORDER BY x; +SELECT 't join none on'; +SELECT * FROM t1 ANY LEFT JOIN t0 ON t1.x = t0.x ORDER BY x; +SELECT '-'; +SELECT * FROM t1 LEFT JOIN t0 ON t1.x = t0.x ORDER BY x; +SELECT '-'; +SELECT * FROM t1 ANY INNER JOIN t0 ON t1.x = t0.x ORDER BY x; +SELECT '-'; +SELECT * FROM t1 INNER JOIN t0 ON t1.x = t0.x ORDER BY x; +SELECT 'none join t using'; +SELECT * FROM t0 ANY LEFT JOIN t1 USING (x); +SELECT * FROM t0 LEFT JOIN t1 USING (x); +SELECT * FROM t0 ANY INNER JOIN t1 USING (x); +SELECT * FROM t0 INNER JOIN t1 USING (x); +SELECT 'none join t on'; +SELECT * FROM t0 ANY LEFT JOIN t1 ON t1.x = t0.x; +SELECT * FROM t0 LEFT JOIN t1 ON t1.x = t0.x; +SELECT * FROM t0 ANY INNER JOIN t1 ON t1.x = t0.x; +SELECT * FROM t0 INNER JOIN t1 ON t1.x = t0.x; +SELECT '/none'; + +SET join_use_nulls = 1; + +SELECT 't join none using'; +SELECT * FROM t1 ANY LEFT JOIN t0 USING (x) ORDER BY x; +SELECT '-'; +SELECT * FROM t1 LEFT JOIN t0 USING (x) ORDER BY x; +SELECT '-'; +SELECT * FROM t1 ANY INNER JOIN t0 USING (x) ORDER BY x; +SELECT '-'; +SELECT * FROM t1 INNER JOIN t0 USING (x) ORDER BY x; +SELECT 't join none on'; +SELECT * FROM t1 ANY LEFT JOIN t0 ON t1.x = t0.x ORDER BY x; +SELECT '-'; +SELECT * FROM t1 LEFT JOIN t0 ON t1.x = t0.x ORDER BY x; +SELECT '-'; +SELECT * FROM t1 ANY INNER JOIN t0 ON t1.x = t0.x ORDER BY x; +SELECT '-'; +SELECT * FROM t1 INNER JOIN t0 ON t1.x = t0.x ORDER BY x; +SELECT 'none join t using'; +SELECT * FROM t0 ANY LEFT JOIN t1 USING (x); +SELECT * FROM t0 LEFT JOIN t1 USING 
(x); +SELECT * FROM t0 ANY INNER JOIN t1 USING (x); +SELECT * FROM t0 INNER JOIN t1 USING (x); +SELECT 'none join t on'; +SELECT * FROM t0 ANY LEFT JOIN t1 ON t1.x = t0.x; +SELECT * FROM t0 LEFT JOIN t1 ON t1.x = t0.x; +SELECT * FROM t0 ANY INNER JOIN t1 ON t1.x = t0.x; +SELECT * FROM t0 INNER JOIN t1 ON t1.x = t0.x; +SELECT '/none'; + +INSERT INTO t1 (x, y) VALUES (1, 10) (2, 20); +INSERT INTO t1 (x, y) VALUES (4, 40) (3, 30); + +INSERT INTO t2 (x, y) VALUES (4, 41) (2, 21) (2, 22); +INSERT INTO t2 (x, y) VALUES (0, 0) (5, 50) (4, 42); + +SET join_use_nulls = 0; + +SELECT 'any left'; +SELECT t1.*, t2.x FROM t1 ANY LEFT JOIN t2 USING (x) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY LEFT JOIN t2 USING (x,y) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY LEFT JOIN t2 USING (x) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY LEFT JOIN t2 USING (x,y) ORDER BY x; + +SELECT 'all left'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.x = t2.x ORDER BY x, t2.y; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.y = t2.y ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.x = t2.x AND t1.y = t2.y ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.x = t2.x AND toUInt32(intDiv(t1.y,10)) = t2.x ORDER BY x, t2.y; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.x = t2.x AND toUInt64(t1.x) = intDiv(t2.y,10) ORDER BY x, t2.y; + +SELECT 'any inner'; +SELECT t1.*, t2.x FROM t1 ANY INNER JOIN t2 USING (x) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY INNER JOIN t2 USING (x,y) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY INNER JOIN t2 USING (x) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY INNER JOIN t2 USING (x,y) ORDER BY x; + +SELECT 'all inner'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x ORDER BY x, t2.y; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.y = t2.y ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x AND t1.y = t2.y ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x AND toUInt32(intDiv(t1.y,10)) = t2.x ORDER BY x, t2.y; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x AND toUInt64(t1.x) = intDiv(t2.y,10) ORDER BY x, t2.y; + +-- TODO: SET join_use_nulls = 1; + +DROP TABLE t0; +DROP TABLE t1; +DROP TABLE t2; From 2252ee5a9012b39891a2d09e2e21fef6d75a8a67 Mon Sep 17 00:00:00 2001 From: Dmitry Rubashkin Date: Tue, 17 Sep 2019 11:17:46 +0300 Subject: [PATCH 051/102] Branch refactoring. 
--- dbms/src/Interpreters/convertFieldToType.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dbms/src/Interpreters/convertFieldToType.cpp b/dbms/src/Interpreters/convertFieldToType.cpp index 3104d612e43..8bc6ee5b9d5 100644 --- a/dbms/src/Interpreters/convertFieldToType.cpp +++ b/dbms/src/Interpreters/convertFieldToType.cpp @@ -187,6 +187,12 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID return dynamic_cast(type).castToValue(src); } + if (which_type.isDateOrDateTime() && src.getType() == Field::Types::UInt64) + { + /// We don't need any conversion UInt64 is under type of Date and DateTime + return src; + } + if (src.getType() == Field::Types::String) { if (which_type.isDate()) From 33c3f7a6a29e6cb7cf2b3217dfdd9c06f3190927 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 17 Sep 2019 12:49:27 +0300 Subject: [PATCH 052/102] Add gdb-index for gold linker with gdb-add-index tool --- CMakeLists.txt | 29 ++++++++++++------------ dbms/programs/CMakeLists.txt | 3 +++ dbms/programs/odbc-bridge/CMakeLists.txt | 4 ++++ 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e20b1fa7ab..e7c8baac1a9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -156,9 +156,22 @@ endif () # Make sure the final executable has symbols exported set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -rdynamic") +option (ADD_GDB_INDEX_FOR_GOLD "Set to add .gdb-index to resulting binaries for gold linker. NOOP if lld is used." 0) if (NOT CMAKE_BUILD_TYPE_UC STREQUAL "RELEASE") - set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--gdb-index") - set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--gdb-index") + if (LINKER_NAME STREQUAL "lld") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--gdb-index") + set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--gdb-index") + message (STATUS "Adding .gdb-index via --gdb-index linker option.") + elseif (LINKER_NAME STREQUAL "gold" AND ADD_GDB_INDEX_FOR_GOLD) + find_program (GDB_ADD_INDEX_EXE NAMES "gdb-add-index" DOC "Path to gdb-add-index executable") + if (NOT GDB_ADD_INDEX_EXE) + set (USE_GDB_ADD_INDEX 0) + message (WARNING "Cannot add gdb index to binaries, because gold linker is used, but gdb-add-index executable not found.") + else() + set (USE_GDB_ADD_INDEX 1) + message (STATUS "gdb-add-index found: ${GDB_ADD_INDEX_EXE}") + endif() + endif () endif() cmake_host_system_information(RESULT AVAILABLE_PHYSICAL_MEMORY QUERY AVAILABLE_PHYSICAL_MEMORY) # Not available under freebsd @@ -295,18 +308,6 @@ else () set (CLICKHOUSE_ETC_DIR "${CMAKE_INSTALL_PREFIX}/etc") endif () -option (UNBUNDLED "Try find all libraries in system. We recommend to avoid this mode for production builds, because we cannot guarantee exact versions and variants of libraries your system has installed. This mode exists for enthusiastic developers who search for trouble. Also it is useful for maintainers of OS packages." OFF) -if (UNBUNDLED) - set(NOT_UNBUNDLED 0) -else () - set(NOT_UNBUNDLED 1) -endif () - -# Using system libs can cause lot of warnings in includes. 
-if (UNBUNDLED OR NOT (OS_LINUX OR APPLE) OR ARCH_32) - option (NO_WERROR "Disable -Werror compiler option" ON) -endif () - message (STATUS "Building for: ${CMAKE_SYSTEM} ${CMAKE_SYSTEM_PROCESSOR} ${CMAKE_LIBRARY_ARCHITECTURE} ; USE_STATIC_LIBRARIES=${USE_STATIC_LIBRARIES} MAKE_STATIC_LIBRARIES=${MAKE_STATIC_LIBRARIES} SPLIT_SHARED=${SPLIT_SHARED_LIBRARIES} UNBUNDLED=${UNBUNDLED} CCACHE=${CCACHE_FOUND} ${CCACHE_VERSION}") include(GNUInstallDirs) diff --git a/dbms/programs/CMakeLists.txt b/dbms/programs/CMakeLists.txt index 1967aee2c09..bac3269468e 100644 --- a/dbms/programs/CMakeLists.txt +++ b/dbms/programs/CMakeLists.txt @@ -205,6 +205,9 @@ else () add_custom_target (clickhouse-bundle ALL DEPENDS ${CLICKHOUSE_BUNDLE}) + if (USE_GDB_ADD_INDEX) + add_custom_command(TARGET clickhouse POST_BUILD COMMAND ${GDB_ADD_INDEX_EXE} clickhouse COMMENT "Adding .gdb-index to clickhouse" VERBATIM) + endif() endif () if (TARGET clickhouse-server AND TARGET copy-headers) diff --git a/dbms/programs/odbc-bridge/CMakeLists.txt b/dbms/programs/odbc-bridge/CMakeLists.txt index 73574f8dc2e..c7a739ac145 100644 --- a/dbms/programs/odbc-bridge/CMakeLists.txt +++ b/dbms/programs/odbc-bridge/CMakeLists.txt @@ -42,6 +42,10 @@ set_target_properties(clickhouse-odbc-bridge PROPERTIES RUNTIME_OUTPUT_DIRECTORY clickhouse_program_link_split_binary(odbc-bridge) +if (USE_GDB_ADD_INDEX) + add_custom_command(TARGET clickhouse-odbc-bridge POST_BUILD COMMAND ${GDB_ADD_INDEX_EXE} ../clickhouse-odbc-bridge COMMENT "Adding .gdb-index to clickhouse-odbc-bridge" VERBATIM) +endif() + install(TARGETS clickhouse-odbc-bridge RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) if(ENABLE_TESTS) From 95f523ba4dbdb31e49a42b4b89db10654d1fe7b7 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 17 Sep 2019 12:53:12 +0300 Subject: [PATCH 053/102] Add comment --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index e7c8baac1a9..53635ed3a05 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -162,6 +162,8 @@ if (NOT CMAKE_BUILD_TYPE_UC STREQUAL "RELEASE") set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--gdb-index") set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--gdb-index") message (STATUS "Adding .gdb-index via --gdb-index linker option.") + # we use another tool for gdb-index, because gold linker removes section .debug_aranges, which used inside clickhouse stacktraces + # http://sourceware-org.1504.n7.nabble.com/gold-No-debug-aranges-section-when-linking-with-gdb-index-td540965.html#a556932 elseif (LINKER_NAME STREQUAL "gold" AND ADD_GDB_INDEX_FOR_GOLD) find_program (GDB_ADD_INDEX_EXE NAMES "gdb-add-index" DOC "Path to gdb-add-index executable") if (NOT GDB_ADD_INDEX_EXE) From f1d86d55e8c3e1d4a5e4b25d74367a83992da4cb Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 17 Sep 2019 13:16:48 +0300 Subject: [PATCH 054/102] Enable gdb-index for gold by default --- docker/packager/packager | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/packager/packager b/docker/packager/packager index d0d46fd5f51..733cbff71d1 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -105,7 +105,7 @@ def run_vagrant_box_with_env(image_path, output_dir, ch_root): def parse_env_variables(build_type, compiler, sanitizer, package_type, cache, distcc_hosts, unbundled, split_binary, version, author, official, alien_pkgs, with_coverage): result = [] - cmake_flags = ['$CMAKE_FLAGS'] + cmake_flags = ['$CMAKE_FLAGS', 
'-DADD_GDB_INDEX_FOR_GOLD=1'] cc = compiler cxx = cc.replace('gcc', 'g++').replace('clang', 'clang++') From 5d08a7b353ba07630f80519580981690bb902344 Mon Sep 17 00:00:00 2001 From: proller Date: Tue, 17 Sep 2019 13:37:35 +0300 Subject: [PATCH 055/102] Remove gcc-7 from build scripts --- contrib/libunwind-cmake/CMakeLists.txt | 1 + debian/.pbuilderrc | 7 +++---- debian/changelog | 4 ++-- debian/control | 2 +- debian/pbuilder-hooks/C99kill-make | 2 +- debian/rules | 3 +++ libs/libglibc-compatibility/CMakeLists.txt | 4 ++-- release | 6 +++--- utils/build/build_debian.sh | 4 +--- utils/build/build_macos.sh | 2 +- 10 files changed, 18 insertions(+), 17 deletions(-) diff --git a/contrib/libunwind-cmake/CMakeLists.txt b/contrib/libunwind-cmake/CMakeLists.txt index f09d0979692..c3a9d20b16f 100644 --- a/contrib/libunwind-cmake/CMakeLists.txt +++ b/contrib/libunwind-cmake/CMakeLists.txt @@ -4,6 +4,7 @@ set(LIBUNWIND_CXX_SOURCES ${LIBUNWIND_SOURCE_DIR}/src/libunwind.cpp ${LIBUNWIND_SOURCE_DIR}/src/Unwind-EHABI.cpp ${LIBUNWIND_SOURCE_DIR}/src/Unwind-seh.cpp) + if (APPLE) set(LIBUNWIND_CXX_SOURCES ${LIBUNWIND_CXX_SOURCES} ${LIBUNWIND_SOURCE_DIR}/src/Unwind_AppleExtras.cpp) endif () diff --git a/debian/.pbuilderrc b/debian/.pbuilderrc index 11c733f1056..9449be7c7d4 100644 --- a/debian/.pbuilderrc +++ b/debian/.pbuilderrc @@ -11,11 +11,10 @@ # sudo ln -s gutsy /usr/share/debootstrap/scripts/bionic # sudo ln -s sid /usr/share/debootstrap/scripts/buster # build ubuntu: -# sudo DIST=trusty pbuilder create --configfile debian/.pbuilderrc && DIST=trusty pdebuild --configfile debian/.pbuilderrc -# sudo DIST=xenial pbuilder create --configfile debian/.pbuilderrc && DIST=xenial pdebuild --configfile debian/.pbuilderrc -# sudo DIST=zesty pbuilder create --configfile debian/.pbuilderrc && DIST=zesty pdebuild --configfile debian/.pbuilderrc -# sudo DIST=artful pbuilder create --configfile debian/.pbuilderrc && DIST=artful pdebuild --configfile debian/.pbuilderrc # sudo DIST=bionic pbuilder create --configfile debian/.pbuilderrc && DIST=bionic pdebuild --configfile debian/.pbuilderrc +# sudo DIST=cosmic pbuilder create --configfile debian/.pbuilderrc && DIST=cosmic pdebuild --configfile debian/.pbuilderrc +# sudo DIST=disco pbuilder create --configfile debian/.pbuilderrc && DIST=disco pdebuild --configfile debian/.pbuilderrc +# sudo DIST=eoan pbuilder create --configfile debian/.pbuilderrc && DIST=eoan pdebuild --configfile debian/.pbuilderrc # sudo DIST=devel pbuilder create --configfile debian/.pbuilderrc && DIST=devel pdebuild --configfile debian/.pbuilderrc # build debian: # sudo DIST=stable pbuilder create --configfile debian/.pbuilderrc && DIST=stable pdebuild --configfile debian/.pbuilderrc diff --git a/debian/changelog b/debian/changelog index 563be7c48eb..f562432c4af 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,5 +1,5 @@ -clickhouse (19.15.1.1) unstable; urgency=low +clickhouse (19.15.1) unstable; urgency=low * Modified source code - -- clickhouse-release Fri, 06 Sep 2019 17:58:30 +0300 + -- proller Wed, 11 Sep 2019 15:54:44 +0300 diff --git a/debian/control b/debian/control index ae1a8d498a6..3ce12b504c2 100644 --- a/debian/control +++ b/debian/control @@ -5,7 +5,7 @@ Maintainer: Alexey Milovidov Build-Depends: debhelper (>= 9), cmake | cmake3, ninja-build, - gcc-7 [amd64 i386] | gcc-8 [amd64 i386] | gcc-9 [amd64 i386], g++-7 [amd64 i386] | g++-8 [amd64 i386] | g++-9 [amd64 i386], + gcc-9 [amd64 i386] | gcc-8 [amd64 i386], g++-9 [amd64 i386] | g++-8 [amd64 i386], clang-8 [arm64 armhf] | 
clang-7 [arm64 armhf] | clang-6.0 [arm64 armhf], libc6-dev, libicu-dev, diff --git a/debian/pbuilder-hooks/C99kill-make b/debian/pbuilder-hooks/C99kill-make index 60be8e0d402..2068e75dc40 100755 --- a/debian/pbuilder-hooks/C99kill-make +++ b/debian/pbuilder-hooks/C99kill-make @@ -2,4 +2,4 @@ # Try stop parallel build after timeout -killall make gcc gcc-7 g++-7 gcc-8 g++-8 clang clang-5.0 clang++-5.0 clang-6.0 clang++-6.0 clang-7 clang++-7 ||: +killall make gcc gcc-8 g++-8 gcc-9 g++-9 clang clang-6.0 clang++-6.0 clang-7 clang++-7 ||: diff --git a/debian/rules b/debian/rules index c21f0999bbc..ee5a3ffa384 100755 --- a/debian/rules +++ b/debian/rules @@ -32,6 +32,9 @@ endif CMAKE_FLAGS += -DENABLE_UTILS=0 +DEB_CC ?= $(shell which gcc-9 gcc-8 gcc | head -n1) +DEB_CXX ?= $(shell which g++-9 g++-8 g++ | head -n1) + ifdef DEB_CXX DEB_BUILD_GNU_TYPE := $(shell dpkg-architecture -qDEB_BUILD_GNU_TYPE) DEB_HOST_GNU_TYPE := $(shell dpkg-architecture -qDEB_HOST_GNU_TYPE) diff --git a/libs/libglibc-compatibility/CMakeLists.txt b/libs/libglibc-compatibility/CMakeLists.txt index a62f5e75e17..9ceec6a8dee 100644 --- a/libs/libglibc-compatibility/CMakeLists.txt +++ b/libs/libglibc-compatibility/CMakeLists.txt @@ -27,8 +27,8 @@ if (GLIBC_COMPATIBILITY) target_include_directories(glibc-compatibility PRIVATE libcxxabi) - if (USE_STATIC_LIBRARIES=0 AND MAKE_STATIC_LIBRARIES=OFF) - target_compile_options(PRIVATE -fPIC) + if (NOT USE_STATIC_LIBRARIES AND NOT MAKE_STATIC_LIBRARIES) + target_compile_options(glibc-compatibility PRIVATE -fPIC) endif () target_link_libraries(global-libs INTERFACE glibc-compatibility) diff --git a/release b/release index 758e346acc9..270c16f4c36 100755 --- a/release +++ b/release @@ -66,7 +66,7 @@ do shift elif [[ $1 == '--fast' ]]; then # Wrong but fast pbuilder mode: create base package with all depends - EXTRAPACKAGES="$EXTRAPACKAGES debhelper cmake ninja-build gcc-7 g++-7 libc6-dev libicu-dev libreadline-dev psmisc bash expect python python-lxml python-termcolor python-requests curl perl sudo openssl netcat-openbsd" + EXTRAPACKAGES="$EXTRAPACKAGES debhelper cmake ninja-build gcc-8 g++-8 libc6-dev libicu-dev libreadline-dev psmisc bash expect python python-lxml python-termcolor python-requests curl perl sudo openssl netcat-openbsd" shift elif [[ $1 == '--rpm' ]]; then MAKE_RPM=1 @@ -116,8 +116,8 @@ echo -e "\nCurrent version is $VERSION_STRING" if [ -z "$NO_BUILD" ] ; then gen_changelog "$VERSION_STRING" "" "$AUTHOR" "" if [ -z "$USE_PBUILDER" ] ; then - DEB_CC=${DEB_CC:=`which gcc-7 gcc-8 gcc | head -n1`} - DEB_CXX=${DEB_CXX:=`which g++-7 g++-8 g++ | head -n1`} + DEB_CC=${DEB_CC:=`which gcc-9 gcc-8 gcc | head -n1`} + DEB_CXX=${DEB_CXX:=`which g++-9 g++-8 g++ | head -n1`} # Build (only binary packages). 
debuild --preserve-env -e PATH \ -e DEB_CC=$DEB_CC -e DEB_CXX=$DEB_CXX -e CMAKE_FLAGS="$CMAKE_FLAGS" \ diff --git a/utils/build/build_debian.sh b/utils/build/build_debian.sh index 24156377bbe..26997b46d78 100755 --- a/utils/build/build_debian.sh +++ b/utils/build/build_debian.sh @@ -6,9 +6,7 @@ # curl https://raw.githubusercontent.com/yandex/ClickHouse/master/utils/build/build_debian.sh | sh # install compiler and libs -sudo apt install -y git bash cmake ninja-build libicu-dev libreadline-dev gperf -sudo apt install -y gcc-9 g++-9 ||: -[ -z `which g++-9` ] && sudo apt install -y gcc-8 g++-8 ||: +sudo apt install -y git bash cmake ninja-build gcc-8 g++-8 libicu-dev libreadline-dev gperf # for -DUNBUNDLED=1 mode: #sudo apt install -y libboost-program-options-dev libboost-system-dev libboost-filesystem-dev libboost-thread-dev zlib1g-dev liblz4-dev libdouble-conversion-dev libzstd-dev libre2-dev libsparsehash-dev librdkafka-dev libcapnp-dev libpoco-dev libsparsehash-dev libgoogle-perftools-dev libunwind-dev googletest libcctz-dev diff --git a/utils/build/build_macos.sh b/utils/build/build_macos.sh index aa1b1a039b0..0e9bed37aa2 100755 --- a/utils/build/build_macos.sh +++ b/utils/build/build_macos.sh @@ -37,7 +37,7 @@ fi mkdir build cd build -cmake .. -DCMAKE_CXX_COMPILER=`which g++-9 g++-8 g++-7` -DCMAKE_C_COMPILER=`which gcc-9 gcc-8 gcc-7` +cmake .. -DCMAKE_CXX_COMPILER=`which g++-9 g++-8` -DCMAKE_C_COMPILER=`which gcc-9 gcc-8` cmake --build . cd .. From e7f069c78cc859fca9d9ab696ed7820afae8d9ea Mon Sep 17 00:00:00 2001 From: proller Date: Tue, 17 Sep 2019 12:33:46 +0000 Subject: [PATCH 056/102] fixes --- contrib/libunwind-cmake/CMakeLists.txt | 1 - dbms/programs/client/CMakeLists.txt | 6 +++++- debian/changelog | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/contrib/libunwind-cmake/CMakeLists.txt b/contrib/libunwind-cmake/CMakeLists.txt index c3a9d20b16f..f09d0979692 100644 --- a/contrib/libunwind-cmake/CMakeLists.txt +++ b/contrib/libunwind-cmake/CMakeLists.txt @@ -4,7 +4,6 @@ set(LIBUNWIND_CXX_SOURCES ${LIBUNWIND_SOURCE_DIR}/src/libunwind.cpp ${LIBUNWIND_SOURCE_DIR}/src/Unwind-EHABI.cpp ${LIBUNWIND_SOURCE_DIR}/src/Unwind-seh.cpp) - if (APPLE) set(LIBUNWIND_CXX_SOURCES ${LIBUNWIND_CXX_SOURCES} ${LIBUNWIND_SOURCE_DIR}/src/Unwind_AppleExtras.cpp) endif () diff --git a/dbms/programs/client/CMakeLists.txt b/dbms/programs/client/CMakeLists.txt index 88e23a094d1..c9996de1e9b 100644 --- a/dbms/programs/client/CMakeLists.txt +++ b/dbms/programs/client/CMakeLists.txt @@ -4,7 +4,11 @@ set(CLICKHOUSE_CLIENT_SOURCES ) set(CLICKHOUSE_CLIENT_LINK PRIVATE clickhouse_common_config clickhouse_functions clickhouse_aggregate_functions clickhouse_common_io clickhouse_parsers string_utils ${LINE_EDITING_LIBS} ${Boost_PROGRAM_OPTIONS_LIBRARY}) -set(CLICKHOUSE_CLIENT_INCLUDE SYSTEM PRIVATE ${READLINE_INCLUDE_DIR} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/include) +set(CLICKHOUSE_CLIENT_INCLUDE PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/include) + +if (READLINE_INCLUDE_DIR) + set(CLICKHOUSE_CLIENT_INCLUDE ${CLICKHOUSE_CLIENT_INCLUDE} SYSTEM PRIVATE ${READLINE_INCLUDE_DIR}) +endif () include(CheckSymbolExists) check_symbol_exists(readpassphrase readpassphrase.h HAVE_READPASSPHRASE) diff --git a/debian/changelog b/debian/changelog index f562432c4af..563be7c48eb 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,5 +1,5 @@ -clickhouse (19.15.1) unstable; urgency=low +clickhouse (19.15.1.1) unstable; urgency=low * Modified source code - -- proller Wed, 11 Sep 2019 15:54:44 +0300 + -- clickhouse-release 
Fri, 06 Sep 2019 17:58:30 +0300 From 592b132b1a593793d51828fc4a110e3a4ca675af Mon Sep 17 00:00:00 2001 From: Ivan Blinkov Date: Tue, 17 Sep 2019 14:38:28 +0200 Subject: [PATCH 057/102] Add two more upcoming meetups --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index ab96a293aeb..961f35e6ea8 100644 --- a/README.md +++ b/README.md @@ -19,3 +19,6 @@ ClickHouse is an open-source column-oriented database management system that all * [ClickHouse Meetup in Shenzhen](https://www.huodongxing.com/event/3483759917300) on October 20. * [ClickHouse Meetup in Shanghai](https://www.huodongxing.com/event/4483760336000) on October 27. * [ClickHouse Meetup in Tokyo](https://clickhouse.connpass.com/event/147001/) on November 14. +* [ClickHouse Meetup in Istanbul](https://www.eventbrite.com/e/clickhouse-meetup-istanbul-create-blazing-fast-experiences-w-clickhouse-tickets-73101120419) on November 19. +* [ClickHouse Meetup in Ankara](https://www.eventbrite.com/e/clickhouse-meetup-ankara-create-blazing-fast-experiences-w-clickhouse-tickets-73100530655) on November 21. + From aab95fd5175a513413c7395a73a82044bdafb906 Mon Sep 17 00:00:00 2001 From: malkfilipp Date: Tue, 17 Sep 2019 15:41:25 +0300 Subject: [PATCH 058/102] Bump the test number --- ...w_tables_limit.reference => 01012_show_tables_limit.reference} | 0 .../{01011_show_tables_limit.sql => 01012_show_tables_limit.sql} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename dbms/tests/queries/0_stateless/{01011_show_tables_limit.reference => 01012_show_tables_limit.reference} (100%) rename dbms/tests/queries/0_stateless/{01011_show_tables_limit.sql => 01012_show_tables_limit.sql} (100%) diff --git a/dbms/tests/queries/0_stateless/01011_show_tables_limit.reference b/dbms/tests/queries/0_stateless/01012_show_tables_limit.reference similarity index 100% rename from dbms/tests/queries/0_stateless/01011_show_tables_limit.reference rename to dbms/tests/queries/0_stateless/01012_show_tables_limit.reference diff --git a/dbms/tests/queries/0_stateless/01011_show_tables_limit.sql b/dbms/tests/queries/0_stateless/01012_show_tables_limit.sql similarity index 100% rename from dbms/tests/queries/0_stateless/01011_show_tables_limit.sql rename to dbms/tests/queries/0_stateless/01012_show_tables_limit.sql From e9b7bd7ed749e2fc863fe939f03a46b47aee0a9b Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 17 Sep 2019 17:24:45 +0300 Subject: [PATCH 059/102] Add gdb to build images --- docker/packager/binary/Dockerfile | 3 ++- docker/packager/deb/Dockerfile | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/docker/packager/binary/Dockerfile b/docker/packager/binary/Dockerfile index 5cb3808fab9..78f113d94b4 100644 --- a/docker/packager/binary/Dockerfile +++ b/docker/packager/binary/Dockerfile @@ -55,7 +55,8 @@ RUN apt-get update -y \ git \ tzdata \ gperf \ - cmake + cmake \ + gdb COPY build.sh / CMD ["/bin/bash", "/build.sh"] diff --git a/docker/packager/deb/Dockerfile b/docker/packager/deb/Dockerfile index a48fdbd8895..c55c1f71800 100644 --- a/docker/packager/deb/Dockerfile +++ b/docker/packager/deb/Dockerfile @@ -78,8 +78,10 @@ RUN apt-get --allow-unauthenticated update -y \ gperf \ alien \ libcapnp-dev \ - cmake + cmake \ + gdb COPY build.sh / + CMD ["/bin/bash", "/build.sh"] From dccac1993f7e0a45b31a82b9af48c8b7fa177380 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 17 Sep 2019 19:42:18 +0300 Subject: [PATCH 060/102] Add custom dpkg to deb build image --- docker/packager/deb/Dockerfile | 10 +++++++++- 1 
file changed, 9 insertions(+), 1 deletion(-) diff --git a/docker/packager/deb/Dockerfile b/docker/packager/deb/Dockerfile index c55c1f71800..6a41d355453 100644 --- a/docker/packager/deb/Dockerfile +++ b/docker/packager/deb/Dockerfile @@ -79,9 +79,17 @@ RUN apt-get --allow-unauthenticated update -y \ alien \ libcapnp-dev \ cmake \ - gdb + gdb \ + pigz +# Special dpkg-deb (https://github.com/alesapin/dpkg) version which is able +# to compress files using pigz (https://zlib.net/pigz/) instead of gzip. +# Significantly increase deb packaging speed and compatible with old systems +RUN curl -O https://clickhouse-builds.s3.yandex.net/utils/dpkg-deb +RUN chmod +x dpkg-deb +RUN cp dpkg-deb /usr/bin + COPY build.sh / CMD ["/bin/bash", "/build.sh"] From 58fad789806d1eec2bb67f4dc3721c83b4b4f0a6 Mon Sep 17 00:00:00 2001 From: chertus Date: Tue, 17 Sep 2019 19:55:11 +0300 Subject: [PATCH 061/102] support MergeJoin nullable convertion (right side) --- dbms/src/Columns/ColumnNullable.cpp | 12 +++ dbms/src/Columns/ColumnNullable.h | 3 + dbms/src/Interpreters/MergeJoin.cpp | 28 +++--- .../01010_partial_merge_join.reference | 94 +++++++++++++++++++ .../0_stateless/01010_partial_merge_join.sql | 42 ++++++++- 5 files changed, 166 insertions(+), 13 deletions(-) diff --git a/dbms/src/Columns/ColumnNullable.cpp b/dbms/src/Columns/ColumnNullable.cpp index 716b583f9a2..e9655310452 100644 --- a/dbms/src/Columns/ColumnNullable.cpp +++ b/dbms/src/Columns/ColumnNullable.cpp @@ -156,6 +156,18 @@ void ColumnNullable::insertFrom(const IColumn & src, size_t n) getNullMapData().push_back(src_concrete.getNullMapData()[n]); } +void ColumnNullable::insertFromNotNullable(const IColumn & src, size_t n) +{ + getNestedColumn().insertFrom(src, n); + getNullMapData().push_back(0); +} + +void ColumnNullable::insertRangeFromNotNullable(const IColumn & src, size_t start, size_t length) +{ + getNestedColumn().insertRangeFrom(src, start, length); + getNullMapData().resize_fill(getNullMapData().size() + length, 0); +} + void ColumnNullable::popBack(size_t n) { getNestedColumn().popBack(n); diff --git a/dbms/src/Columns/ColumnNullable.h b/dbms/src/Columns/ColumnNullable.h index f93a227abae..083d6f64052 100644 --- a/dbms/src/Columns/ColumnNullable.h +++ b/dbms/src/Columns/ColumnNullable.h @@ -61,6 +61,9 @@ public: void insert(const Field & x) override; void insertFrom(const IColumn & src, size_t n) override; + void insertFromNotNullable(const IColumn & src, size_t n); + void insertRangeFromNotNullable(const IColumn & src, size_t start, size_t length); + void insertDefault() override { getNestedColumn().insertDefault(); diff --git a/dbms/src/Interpreters/MergeJoin.cpp b/dbms/src/Interpreters/MergeJoin.cpp index 12c782af205..6d6577d626e 100644 --- a/dbms/src/Interpreters/MergeJoin.cpp +++ b/dbms/src/Interpreters/MergeJoin.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -104,13 +105,16 @@ private: namespace { -MutableColumns makeMutableColumns(const Block & block) +MutableColumns makeMutableColumns(const Block & block, size_t rows_to_reserve = 0) { MutableColumns columns; columns.reserve(block.columns()); for (const auto & src_column : block) + { columns.push_back(src_column.column->cloneEmpty()); + columns.back()->reserve(rows_to_reserve); + } return columns; } @@ -133,12 +137,8 @@ void copyLeftRange(const Block & block, MutableColumns & columns, size_t start, { for (size_t i = 0; i < block.columns(); ++i) { - const auto & src_column = block.getByPosition(i); - auto & dst_column = columns[i]; - - size_t row_pos = 
start; - for (size_t row = 0; row < rows_to_add; ++row, ++row_pos) - dst_column->insertFrom(*src_column.column, row_pos); + const auto & src_column = block.getByPosition(i).column; + columns[i]->insertRangeFrom(*src_column, start, rows_to_add); } } @@ -147,11 +147,14 @@ void copyRightRange(const Block & right_block, const Block & right_columns_to_ad { for (size_t i = 0; i < right_columns_to_add.columns(); ++i) { - const auto & src_column = right_block.getByName(right_columns_to_add.getByPosition(i).name); + const auto & src_column = right_block.getByName(right_columns_to_add.getByPosition(i).name).column; auto & dst_column = columns[i]; + auto * dst_nullable = typeid_cast(dst_column.get()); - for (size_t row = 0; row < rows_to_add; ++row) - dst_column->insertFrom(*src_column.column, row_position); + if (dst_nullable && !isColumnNullable(*src_column)) + dst_nullable->insertRangeFromNotNullable(*src_column, row_position, rows_to_add); + else + dst_column->insertRangeFrom(*src_column, row_position, rows_to_add); } } @@ -270,8 +273,9 @@ void MergeJoin::joinBlock(Block & block) std::shared_lock lock(rwlock); - MutableColumns left_columns = makeMutableColumns(block); - MutableColumns right_columns = makeMutableColumns(right_columns_to_add); + size_t rows_to_reserve = is_left ? block.rows() : 0; + MutableColumns left_columns = makeMutableColumns(block, (is_all ? rows_to_reserve : 0)); + MutableColumns right_columns = makeMutableColumns(right_columns_to_add, rows_to_reserve); MergeJoinCursor left_cursor(block, left_merge_description); if (is_left) diff --git a/dbms/tests/queries/0_stateless/01010_partial_merge_join.reference b/dbms/tests/queries/0_stateless/01010_partial_merge_join.reference index ee418e73b65..5e629a2eb43 100644 --- a/dbms/tests/queries/0_stateless/01010_partial_merge_join.reference +++ b/dbms/tests/queries/0_stateless/01010_partial_merge_join.reference @@ -122,3 +122,97 @@ all inner 2 20 2 22 4 40 4 41 4 40 4 42 +any left +0 0 0 +1 10 \N +2 20 2 +3 30 \N +4 40 4 +- +0 0 0 +1 10 \N +2 20 \N +3 30 \N +4 40 \N +- +0 0 0 +1 10 \N +2 20 2 +3 30 \N +4 40 4 +- +0 0 0 +1 10 \N +2 20 \N +3 30 \N +4 40 \N +all left +0 0 0 0 +1 10 \N \N +2 20 2 21 +2 20 2 22 +3 30 \N \N +4 40 4 41 +4 40 4 42 +- +0 0 0 0 +1 10 \N \N +2 20 \N \N +3 30 \N \N +4 40 \N \N +- +0 0 0 0 +1 10 \N \N +2 20 \N \N +3 30 \N \N +4 40 \N \N +- +0 0 0 0 +1 10 \N \N +2 20 2 21 +2 20 2 22 +3 30 \N \N +4 40 4 41 +4 40 4 42 +- +0 0 0 0 +1 10 \N \N +2 20 2 21 +2 20 2 22 +3 30 \N \N +4 40 4 41 +4 40 4 42 +any inner +0 0 0 +2 20 2 +4 40 4 +- +0 0 0 +- +0 0 0 +2 20 2 +4 40 4 +- +0 0 0 +all inner +0 0 0 0 +2 20 2 21 +2 20 2 22 +4 40 4 41 +4 40 4 42 +- +0 0 0 0 +- +0 0 0 0 +- +0 0 0 0 +2 20 2 21 +2 20 2 22 +4 40 4 41 +4 40 4 42 +- +0 0 0 0 +2 20 2 21 +2 20 2 22 +4 40 4 41 +4 40 4 42 diff --git a/dbms/tests/queries/0_stateless/01010_partial_merge_join.sql b/dbms/tests/queries/0_stateless/01010_partial_merge_join.sql index 6db701f6197..99bba62b48e 100644 --- a/dbms/tests/queries/0_stateless/01010_partial_merge_join.sql +++ b/dbms/tests/queries/0_stateless/01010_partial_merge_join.sql @@ -117,7 +117,47 @@ SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x AND toUInt32(intDiv(t1.y, SELECT '-'; SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x AND toUInt64(t1.x) = intDiv(t2.y,10) ORDER BY x, t2.y; --- TODO: SET join_use_nulls = 1; +SET join_use_nulls = 1; + +SELECT 'any left'; +SELECT t1.*, t2.x FROM t1 ANY LEFT JOIN t2 USING (x) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY LEFT JOIN t2 USING (x,y) ORDER BY x; +SELECT 
'-'; +SELECT t1.*, t2.x FROM t1 ANY LEFT JOIN t2 USING (x) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY LEFT JOIN t2 USING (x,y) ORDER BY x; + +SELECT 'all left'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.x = t2.x ORDER BY x, t2.y; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.y = t2.y ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.x = t2.x AND t1.y = t2.y ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.x = t2.x AND toUInt32(intDiv(t1.y,10)) = t2.x ORDER BY x, t2.y; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.x = t2.x AND toUInt64(t1.x) = intDiv(t2.y,10) ORDER BY x, t2.y; + +SELECT 'any inner'; +SELECT t1.*, t2.x FROM t1 ANY INNER JOIN t2 USING (x) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY INNER JOIN t2 USING (x,y) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY INNER JOIN t2 USING (x) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY INNER JOIN t2 USING (x,y) ORDER BY x; + +SELECT 'all inner'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x ORDER BY x, t2.y; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.y = t2.y ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x AND t1.y = t2.y ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x AND toUInt32(intDiv(t1.y,10)) = t2.x ORDER BY x, t2.y; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x AND toUInt64(t1.x) = intDiv(t2.y,10) ORDER BY x, t2.y; DROP TABLE t0; DROP TABLE t1; From fb1b8093887630a6324684270b6352d17fb13c0b Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 17 Sep 2019 20:07:35 +0300 Subject: [PATCH 062/102] Original repo --- docker/packager/deb/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/packager/deb/Dockerfile b/docker/packager/deb/Dockerfile index 6a41d355453..7ce98ba8fcc 100644 --- a/docker/packager/deb/Dockerfile +++ b/docker/packager/deb/Dockerfile @@ -83,7 +83,7 @@ RUN apt-get --allow-unauthenticated update -y \ pigz -# Special dpkg-deb (https://github.com/alesapin/dpkg) version which is able +# Special dpkg-deb (https://github.com/ClickHouse-Extras/dpkg) version which is able # to compress files using pigz (https://zlib.net/pigz/) instead of gzip. # Significantly increase deb packaging speed and compatible with old systems RUN curl -O https://clickhouse-builds.s3.yandex.net/utils/dpkg-deb From 0a97359f9e3a24f3049c84f089634e543eae27be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabian=20St=C3=A4ber?= Date: Tue, 17 Sep 2019 20:00:17 +0200 Subject: [PATCH 063/102] fix typo arbitary -> arbitrary I hereby agree to the terms of the CLA available at: I hereby agree to the terms of the CLA available at: --- docs/en/development/architecture.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/development/architecture.md b/docs/en/development/architecture.md index e851eef47d1..3f98136e68c 100644 --- a/docs/en/development/architecture.md +++ b/docs/en/development/architecture.md @@ -163,7 +163,7 @@ There is no global query plan for distributed query execution. Each node has its ## Merge Tree -`MergeTree` is a family of storage engines that supports indexing by primary key. The primary key can be an arbitary tuple of columns or expressions. Data in a `MergeTree` table is stored in "parts". Each part stores data in the primary key order (data is ordered lexicographically by the primary key tuple). 
All the table columns are stored in separate `column.bin` files in these parts. The files consist of compressed blocks. Each block is usually from 64 KB to 1 MB of uncompressed data, depending on the average value size. The blocks consist of column values placed contiguously one after the other. Column values are in the same order for each column (the order is defined by the primary key), so when you iterate by many columns, you get values for the corresponding rows. +`MergeTree` is a family of storage engines that supports indexing by primary key. The primary key can be an arbitrary tuple of columns or expressions. Data in a `MergeTree` table is stored in "parts". Each part stores data in the primary key order (data is ordered lexicographically by the primary key tuple). All the table columns are stored in separate `column.bin` files in these parts. The files consist of compressed blocks. Each block is usually from 64 KB to 1 MB of uncompressed data, depending on the average value size. The blocks consist of column values placed contiguously one after the other. Column values are in the same order for each column (the order is defined by the primary key), so when you iterate by many columns, you get values for the corresponding rows. The primary key itself is "sparse". It doesn't address each single row, but only some ranges of data. A separate `primary.idx` file has the value of the primary key for each N-th row, where N is called `index_granularity` (usually, N = 8192). Also, for each column, we have `column.mrk` files with "marks," which are offsets to each N-th row in the data file. Each mark is a pair: the offset in the file to the beginning of the compressed block, and the offset in the decompressed block to the beginning of data. Usually compressed blocks are aligned by marks, and the offset in the decompressed block is zero. Data for `primary.idx` always resides in memory and data for `column.mrk` files is cached. From ab075290a169d0848654576da5eda81b50076ed7 Mon Sep 17 00:00:00 2001 From: Ivan Blinkov Date: Tue, 17 Sep 2019 20:29:50 +0200 Subject: [PATCH 064/102] Remove link to (almost) past meetup --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 961f35e6ea8..c9a9ed39696 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,6 @@ ClickHouse is an open-source column-oriented database management system that all * You can also [fill this form](https://forms.yandex.com/surveys/meet-yandex-clickhouse-team/) to meet Yandex ClickHouse team in person. ## Upcoming Events -* [ClickHouse Meetup in Munich](https://www.meetup.com/ClickHouse-Meetup-Munich/events/264185199/) on September 17. * [ClickHouse Meetup in Paris](https://www.eventbrite.com/e/clickhouse-paris-meetup-2019-registration-68493270215) on October 3. * [ClickHouse Meetup in Hong Kong](https://www.meetup.com/Hong-Kong-Machine-Learning-Meetup/events/263580542/) on October 17. * [ClickHouse Meetup in Shenzhen](https://www.huodongxing.com/event/3483759917300) on October 20. 
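The `architecture.md` paragraph in the doc fix above describes how MergeTree addresses data only at `index_granularity` boundaries: `primary.idx` keeps one key per granule, and each `column.mrk` mark is a pair of offsets (into the compressed file and into the decompressed block). The following standalone C++ sketch illustrates that idea; the type names, the single-`UInt64` key, and the lookup helper are simplifications assumed for illustration, not the actual ClickHouse classes.

// Illustrative sketch: a sparse primary index selects whole granules to read,
// and marks say where each granule starts inside a column file.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

struct Mark
{
    uint64_t offset_in_compressed_file;    /// where the compressed block begins in column.bin
    uint64_t offset_in_decompressed_block; /// where the granule's first row begins inside that block
};

struct SparsePrimaryIndex
{
    size_t index_granularity = 8192;       /// one index entry per this many rows
    std::vector<uint64_t> keys;            /// keys[i] = primary key of row i * index_granularity (data sorted by key)

    /// Half-open range of granules [first, last) that may contain keys in [lo, hi].
    std::pair<size_t, size_t> granuleRange(uint64_t lo, uint64_t hi) const
    {
        auto begin = std::lower_bound(keys.begin(), keys.end(), lo);
        size_t first = (begin == keys.begin()) ? 0 : static_cast<size_t>(begin - keys.begin()) - 1;
        size_t last = static_cast<size_t>(std::upper_bound(keys.begin(), keys.end(), hi) - keys.begin());
        return {first, last};
    }
};

int main()
{
    SparsePrimaryIndex index;
    index.keys = {0, 100, 200, 300};                    /// four granules
    auto [first, last] = index.granuleRange(150, 250);  /// e.g. WHERE key BETWEEN 150 AND 250
    std::cout << "read granules [" << first << ", " << last << ")\n";  /// prints: read granules [1, 3)
    /// A real engine would now seek with marks[first..last) in every column file it reads.
}

The point of the sketch is that the index never addresses individual rows: a key range is resolved to a range of granules, and only those granules are decompressed and scanned.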
From 2b3a4b06507a5a776ed5dad0adeeeee11fe5447f Mon Sep 17 00:00:00 2001 From: chertus Date: Tue, 17 Sep 2019 21:53:52 +0300 Subject: [PATCH 065/102] support const and LC columns in partial merge-join --- dbms/src/Interpreters/IJoin.cpp | 10 ++++++++ dbms/src/Interpreters/IJoin.h | 1 + dbms/src/Interpreters/MergeJoin.cpp | 25 +++++++++++++++---- dbms/src/Interpreters/MergeJoin.h | 4 +-- ..._partial_merge_join_const_and_lc.reference | 5 ++++ .../01010_partial_merge_join_const_and_lc.sql | 7 ++++++ 6 files changed, 45 insertions(+), 7 deletions(-) create mode 100644 dbms/tests/queries/0_stateless/01010_partial_merge_join_const_and_lc.reference create mode 100644 dbms/tests/queries/0_stateless/01010_partial_merge_join_const_and_lc.sql diff --git a/dbms/src/Interpreters/IJoin.cpp b/dbms/src/Interpreters/IJoin.cpp index 84c014efe22..9e4ded0fb90 100644 --- a/dbms/src/Interpreters/IJoin.cpp +++ b/dbms/src/Interpreters/IJoin.cpp @@ -48,6 +48,16 @@ ColumnRawPtrs temporaryMaterializeColumns(const Block & block, const Names & nam return ptrs; } +void removeLowCardinalityInplace(Block & block) +{ + for (size_t i = 0; i < block.columns(); ++i) + { + auto & col = block.getByPosition(i); + col.column = recursiveRemoveLowCardinality(col.column); + col.type = recursiveRemoveLowCardinality(col.type); + } +} + ColumnRawPtrs extractKeysForJoin(const Names & key_names_right, const Block & right_sample_block, Block & sample_block_with_keys, Block & sample_block_with_columns_to_add) { diff --git a/dbms/src/Interpreters/IJoin.h b/dbms/src/Interpreters/IJoin.h index 990e88a51f6..5e16e25c58e 100644 --- a/dbms/src/Interpreters/IJoin.h +++ b/dbms/src/Interpreters/IJoin.h @@ -46,6 +46,7 @@ namespace JoinCommon void convertColumnToNullable(ColumnWithTypeAndName & column); void convertColumnsToNullable(Block & block, size_t starting_pos = 0); ColumnRawPtrs temporaryMaterializeColumns(const Block & block, const Names & names, Columns & materialized); +void removeLowCardinalityInplace(Block & block); /// Split key and other columns by keys name list ColumnRawPtrs extractKeysForJoin(const Names & key_names_right, const Block & right_sample_block, diff --git a/dbms/src/Interpreters/MergeJoin.cpp b/dbms/src/Interpreters/MergeJoin.cpp index 6d6577d626e..93a8cf2e4c5 100644 --- a/dbms/src/Interpreters/MergeJoin.cpp +++ b/dbms/src/Interpreters/MergeJoin.cpp @@ -217,6 +217,7 @@ MergeJoin::MergeJoin(std::shared_ptr table_join_, const Block & ri if (required_right_keys.count(column.name)) right_columns_to_add.insert(ColumnWithTypeAndName{nullptr, column.type, column.name}); + JoinCommon::removeLowCardinalityInplace(right_columns_to_add); JoinCommon::createMissedColumns(right_columns_to_add); if (nullable_right_side) @@ -254,7 +255,9 @@ void MergeJoin::mergeRightBlocks() bool MergeJoin::addJoinedBlock(const Block & src_block) { - Block block = src_block; + Block block = materializeBlock(src_block); + JoinCommon::removeLowCardinalityInplace(block); + sortBlock(block, right_sort_description); std::unique_lock lock(rwlock); @@ -269,6 +272,9 @@ bool MergeJoin::addJoinedBlock(const Block & src_block) void MergeJoin::joinBlock(Block & block) { JoinCommon::checkTypesOfKeys(block, table_join->keyNamesLeft(), right_table_keys, table_join->keyNamesRight()); + materializeBlockInplace(block); + JoinCommon::removeLowCardinalityInplace(block); + sortBlock(block, left_sort_description); std::shared_lock lock(rwlock); @@ -277,6 +283,7 @@ void MergeJoin::joinBlock(Block & block) MutableColumns left_columns = makeMutableColumns(block, (is_all ? 
rows_to_reserve : 0)); MutableColumns right_columns = makeMutableColumns(right_columns_to_add, rows_to_reserve); MergeJoinCursor left_cursor(block, left_merge_description); + size_t left_key_tail = 0; if (is_left) { @@ -284,9 +291,10 @@ void MergeJoin::joinBlock(Block & block) { if (left_cursor.atEnd()) break; - leftJoin(left_cursor, block, *it, left_columns, right_columns); + leftJoin(left_cursor, block, *it, left_columns, right_columns, left_key_tail); } + left_cursor.nextN(left_key_tail); joinInequalsLeft(block, left_columns, right_columns, left_cursor.position(), left_cursor.end(), is_all); //left_cursor.nextN(left_cursor.end() - left_cursor.position()); @@ -299,16 +307,17 @@ void MergeJoin::joinBlock(Block & block) { if (left_cursor.atEnd()) break; - innerJoin(left_cursor, block, *it, left_columns, right_columns); + innerJoin(left_cursor, block, *it, left_columns, right_columns, left_key_tail); } + left_cursor.nextN(left_key_tail); changeLeftColumns(block, std::move(left_columns)); addRightColumns(block, std::move(right_columns)); } } void MergeJoin::leftJoin(MergeJoinCursor & left_cursor, const Block & left_block, const Block & right_block, - MutableColumns & left_columns, MutableColumns & right_columns) + MutableColumns & left_columns, MutableColumns & right_columns, size_t & left_key_tail) { MergeJoinCursor right_cursor(right_block, right_merge_description); @@ -331,13 +340,16 @@ void MergeJoin::leftJoin(MergeJoinCursor & left_cursor, const Block & left_block /// Do not run over last left keys for ALL JOIN (cause of possible duplicates in next right block) if (is_all && right_cursor.atEnd()) + { + left_key_tail = range.left_length; break; + } left_cursor.nextN(range.left_length); } } void MergeJoin::innerJoin(MergeJoinCursor & left_cursor, const Block & left_block, const Block & right_block, - MutableColumns & left_columns, MutableColumns & right_columns) + MutableColumns & left_columns, MutableColumns & right_columns, size_t & left_key_tail) { MergeJoinCursor right_cursor(right_block, right_merge_description); @@ -352,7 +364,10 @@ void MergeJoin::innerJoin(MergeJoinCursor & left_cursor, const Block & left_bloc /// Do not run over last left keys for ALL JOIN (cause of possible duplicates in next right block) if (is_all && right_cursor.atEnd()) + { + left_key_tail = range.left_length; break; + } left_cursor.nextN(range.left_length); } } diff --git a/dbms/src/Interpreters/MergeJoin.h b/dbms/src/Interpreters/MergeJoin.h index dbc9191b70d..127fc20feca 100644 --- a/dbms/src/Interpreters/MergeJoin.h +++ b/dbms/src/Interpreters/MergeJoin.h @@ -49,9 +49,9 @@ private: void mergeRightBlocks(); void leftJoin(MergeJoinCursor & left_cursor, const Block & left_block, const Block & right_block, - MutableColumns & left_columns, MutableColumns & right_columns); + MutableColumns & left_columns, MutableColumns & right_columns, size_t & left_key_tail); void innerJoin(MergeJoinCursor & left_cursor, const Block & left_block, const Block & right_block, - MutableColumns & left_columns, MutableColumns & right_columns); + MutableColumns & left_columns, MutableColumns & right_columns, size_t & left_key_tail); }; } diff --git a/dbms/tests/queries/0_stateless/01010_partial_merge_join_const_and_lc.reference b/dbms/tests/queries/0_stateless/01010_partial_merge_join_const_and_lc.reference new file mode 100644 index 00000000000..43974985672 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01010_partial_merge_join_const_and_lc.reference @@ -0,0 +1,5 @@ +1 1 +2 +3 +4 +5 diff --git 
a/dbms/tests/queries/0_stateless/01010_partial_merge_join_const_and_lc.sql b/dbms/tests/queries/0_stateless/01010_partial_merge_join_const_and_lc.sql new file mode 100644 index 00000000000..9f45f14a44b --- /dev/null +++ b/dbms/tests/queries/0_stateless/01010_partial_merge_join_const_and_lc.sql @@ -0,0 +1,7 @@ +set partial_merge_join = 1; + +select s1.x, s2.x from (select 1 as x) s1 left join (select 1 as x) s2 using x; +select * from (select materialize(2) as x) s1 left join (select 2 as x) s2 using x; +select * from (select 3 as x) s1 left join (select materialize(3) as x) s2 using x; +select * from (select toLowCardinality(4) as x) s1 left join (select 4 as x) s2 using x; +select * from (select 5 as x) s1 left join (select toLowCardinality(5) as x) s2 using x; From f4b7cae998f721740f2247b90c33f3507e9c43fd Mon Sep 17 00:00:00 2001 From: root Date: Wed, 18 Sep 2019 02:55:59 +0800 Subject: [PATCH 066/102] optimize limit 1,0 and test. --- dbms/src/Interpreters/InterpreterSelectQuery.cpp | 2 +- dbms/tests/queries/0_stateless/01012_select_limit_x_0.reference | 0 dbms/tests/queries/0_stateless/01012_select_limit_x_0.sql | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 dbms/tests/queries/0_stateless/01012_select_limit_x_0.reference create mode 100644 dbms/tests/queries/0_stateless/01012_select_limit_x_0.sql diff --git a/dbms/src/Interpreters/InterpreterSelectQuery.cpp b/dbms/src/Interpreters/InterpreterSelectQuery.cpp index 39a1976d2d4..1fb853810c3 100644 --- a/dbms/src/Interpreters/InterpreterSelectQuery.cpp +++ b/dbms/src/Interpreters/InterpreterSelectQuery.cpp @@ -790,7 +790,7 @@ static std::pair getLimitLengthAndOffset(const ASTSelectQuery & if (query.limitLength()) { length = getLimitUIntValue(query.limitLength(), context); - if (query.limitOffset()) + if (query.limitOffset() && length) offset = getLimitUIntValue(query.limitOffset(), context); } diff --git a/dbms/tests/queries/0_stateless/01012_select_limit_x_0.reference b/dbms/tests/queries/0_stateless/01012_select_limit_x_0.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/dbms/tests/queries/0_stateless/01012_select_limit_x_0.sql b/dbms/tests/queries/0_stateless/01012_select_limit_x_0.sql new file mode 100644 index 00000000000..5a0549dea15 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01012_select_limit_x_0.sql @@ -0,0 +1 @@ +SELECT count() FROM system.numbers LIMIT 1, 0; From f682cbc31d59f361f89a5aacb796d19dea242141 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 17 Sep 2019 22:26:24 +0300 Subject: [PATCH 067/102] Add time logging to build script --- docker/packager/deb/Dockerfile | 4 +++- docker/packager/deb/build.sh | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docker/packager/deb/Dockerfile b/docker/packager/deb/Dockerfile index 7ce98ba8fcc..b7aeac8cf07 100644 --- a/docker/packager/deb/Dockerfile +++ b/docker/packager/deb/Dockerfile @@ -80,7 +80,9 @@ RUN apt-get --allow-unauthenticated update -y \ libcapnp-dev \ cmake \ gdb \ - pigz + pigz \ + moreutils + # Special dpkg-deb (https://github.com/ClickHouse-Extras/dpkg) version which is able diff --git a/docker/packager/deb/build.sh b/docker/packager/deb/build.sh index 033e2c26464..6d5144266ae 100755 --- a/docker/packager/deb/build.sh +++ b/docker/packager/deb/build.sh @@ -4,7 +4,7 @@ set -x -e ccache --show-stats ||: ccache --zero-stats ||: -build/release --no-pbuilder $ALIEN_PKGS +build/release --no-pbuilder $ALIEN_PKGS | ts '%Y-%m-%d %H:%M:%S' mv /*.deb /output mv *.changes /output mv *.buildinfo /output 
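For the `LIMIT` change above, the new `&& length` guard means the offset expression is not evaluated when the requested length is zero, so a query such as `SELECT count() FROM system.numbers LIMIT 1, 0` resolves to a zero length and a zero offset up front. Below is a standalone sketch of that behaviour; the simplified signature is an assumption for the example and not the real `InterpreterSelectQuery` helper.

// Sketch of the length/offset resolution with the added guard.
#include <cstdint>
#include <iostream>
#include <optional>
#include <utility>

using UInt64 = std::uint64_t;

std::pair<UInt64, UInt64> getLimitLengthAndOffset(std::optional<UInt64> limit_length,
                                                  std::optional<UInt64> limit_offset)
{
    UInt64 length = 0;
    UInt64 offset = 0;
    if (limit_length)
    {
        length = *limit_length;
        if (limit_offset && length)   /// skip the offset entirely when a zero-length LIMIT was asked for
            offset = *limit_offset;
    }
    return {length, offset};
}

int main()
{
    /// SQL `LIMIT 1, 0` means offset = 1, length = 0.
    auto [length, offset] = getLimitLengthAndOffset(/*length*/ 0, /*offset*/ 1);
    std::cout << "length=" << length << " offset=" << offset << "\n";   /// prints: length=0 offset=0
}

With both values resolved to zero, the later limit stage has nothing to skip or return, which is what the accompanying `01012_select_limit_x_0` test exercises.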
From 780341060f193f6198b712e4566bdb5234f1461c Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 18 Sep 2019 00:08:20 +0300 Subject: [PATCH 068/102] Replace libsparsehash with sparsehash-c11 - use sparsehash-c11 over libsparsehash - fix typos in find_sparsehash and users of the vars (s/SPARCE/SPARSE/) - drop libsparsehash-dev from docker images (but keep for unbunlded build) - use ::google over GOOGLE_NAMESPACE --- .gitmodules | 3 + cmake/find_sparsehash.cmake | 14 +- contrib/libsparsehash/AUTHORS | 2 - contrib/libsparsehash/COPYING | 28 - contrib/libsparsehash/NEWS | 188 -- contrib/libsparsehash/README | 143 -- .../libsparsehash/sparsehash/dense_hash_map | 369 ---- .../libsparsehash/sparsehash/dense_hash_set | 338 --- .../sparsehash/internal/densehashtable.h | 1319 ------------ .../sparsehash/internal/hashtable-common.h | 381 ---- .../internal/libc_allocator_with_realloc.h | 119 -- .../sparsehash/internal/sparseconfig.h | 46 - .../sparsehash/internal/sparsehashtable.h | 1247 ----------- .../libsparsehash/sparsehash/sparse_hash_map | 363 ---- .../libsparsehash/sparsehash/sparse_hash_set | 338 --- contrib/libsparsehash/sparsehash/sparsetable | 1820 ----------------- .../libsparsehash/sparsehash/template_util.h | 134 -- .../libsparsehash/sparsehash/type_traits.h | 342 ---- contrib/sparsehash-c11 | 1 + dbms/CMakeLists.txt | 3 +- dbms/src/Common/tests/CMakeLists.txt | 2 +- .../tests/integer_hash_tables_and_hashes.cpp | 4 +- dbms/src/Core/NamesAndTypes.cpp | 2 +- dbms/src/Core/tests/CMakeLists.txt | 2 +- dbms/src/Core/tests/string_pool.cpp | 4 +- dbms/src/Interpreters/tests/CMakeLists.txt | 10 +- dbms/src/Interpreters/tests/hash_map.cpp | 8 +- dbms/src/Storages/IStorage.cpp | 4 +- docker/packager/deb/Dockerfile | 2 - utils/build/build_debian.sh | 2 +- 30 files changed, 32 insertions(+), 7206 deletions(-) delete mode 100644 contrib/libsparsehash/AUTHORS delete mode 100644 contrib/libsparsehash/COPYING delete mode 100644 contrib/libsparsehash/NEWS delete mode 100644 contrib/libsparsehash/README delete mode 100644 contrib/libsparsehash/sparsehash/dense_hash_map delete mode 100644 contrib/libsparsehash/sparsehash/dense_hash_set delete mode 100644 contrib/libsparsehash/sparsehash/internal/densehashtable.h delete mode 100644 contrib/libsparsehash/sparsehash/internal/hashtable-common.h delete mode 100644 contrib/libsparsehash/sparsehash/internal/libc_allocator_with_realloc.h delete mode 100644 contrib/libsparsehash/sparsehash/internal/sparseconfig.h delete mode 100644 contrib/libsparsehash/sparsehash/internal/sparsehashtable.h delete mode 100644 contrib/libsparsehash/sparsehash/sparse_hash_map delete mode 100644 contrib/libsparsehash/sparsehash/sparse_hash_set delete mode 100644 contrib/libsparsehash/sparsehash/sparsetable delete mode 100644 contrib/libsparsehash/sparsehash/template_util.h delete mode 100644 contrib/libsparsehash/sparsehash/type_traits.h create mode 160000 contrib/sparsehash-c11 diff --git a/.gitmodules b/.gitmodules index e5be5438cc7..0b80743cadb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -103,3 +103,6 @@ [submodule "contrib/orc"] path = contrib/orc url = https://github.com/apache/orc +[submodule "contrib/sparsehash-c11"] + path = contrib/sparsehash-c11 + url = https://github.com/sparsehash/sparsehash-c11.git diff --git a/cmake/find_sparsehash.cmake b/cmake/find_sparsehash.cmake index 187401a27a9..d34ed8e048b 100644 --- a/cmake/find_sparsehash.cmake +++ b/cmake/find_sparsehash.cmake @@ -1,13 +1,13 @@ -option (USE_INTERNAL_SPARCEHASH_LIBRARY "Set to FALSE to use system 
sparsehash library instead of bundled" ${NOT_UNBUNDLED}) +option (USE_INTERNAL_SPARSEHASH_LIBRARY "Set to FALSE to use system sparsehash library instead of bundled" ${NOT_UNBUNDLED}) -if (NOT USE_INTERNAL_SPARCEHASH_LIBRARY) - find_path (SPARCEHASH_INCLUDE_DIR NAMES sparsehash/sparse_hash_map PATHS ${SPARCEHASH_INCLUDE_PATHS}) +if (NOT USE_INTERNAL_SPARSEHASH_LIBRARY) + find_path (SPARSEHASH_INCLUDE_DIR NAMES sparsehash/sparse_hash_map PATHS ${SPARSEHASH_INCLUDE_PATHS}) endif () -if (SPARCEHASH_INCLUDE_DIR) +if (SPARSEHASH_INCLUDE_DIR) else () - set (USE_INTERNAL_SPARCEHASH_LIBRARY 1) - set (SPARCEHASH_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/libsparsehash") + set (USE_INTERNAL_SPARSEHASH_LIBRARY 1) + set (SPARSEHASH_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/sparsehash-c11") endif () -message (STATUS "Using sparsehash: ${SPARCEHASH_INCLUDE_DIR}") +message (STATUS "Using sparsehash: ${SPARSEHASH_INCLUDE_DIR}") diff --git a/contrib/libsparsehash/AUTHORS b/contrib/libsparsehash/AUTHORS deleted file mode 100644 index d8c24c64cad..00000000000 --- a/contrib/libsparsehash/AUTHORS +++ /dev/null @@ -1,2 +0,0 @@ -google-sparsehash@googlegroups.com - diff --git a/contrib/libsparsehash/COPYING b/contrib/libsparsehash/COPYING deleted file mode 100644 index e4956cfd9fd..00000000000 --- a/contrib/libsparsehash/COPYING +++ /dev/null @@ -1,28 +0,0 @@ -Copyright (c) 2005, Google Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/contrib/libsparsehash/NEWS b/contrib/libsparsehash/NEWS deleted file mode 100644 index 589c7090160..00000000000 --- a/contrib/libsparsehash/NEWS +++ /dev/null @@ -1,188 +0,0 @@ -== 23 Ferbruary 2012 == - -A backwards incompatibility arose from flattening the include headers -structure for the folder. - -This is now fixed in 2.0.2. You only need to upgrade if you had previously -included files from the folder. - -== 1 February 2012 == - -A minor bug related to the namespace switch from google to sparsehash -stopped the build from working when perftools is also installed. - -This is now fixed in 2.0.1. You only need to upgrade if you have perftools -installed. 
- -== 31 January 2012 == - -I've just released sparsehash 2.0. - -The `google-sparsehash` project has been renamed to `sparsehash`. I -(csilvers) am stepping down as maintainer, to be replaced by the team -of Donovan Hide and Geoff Pike. Welcome to the team, Donovan and -Geoff! Donovan has been an active contributor to sparsehash bug -reports and discussions in the past, and Geoff has been closely -involved with sparsehash inside Google (in addition to writing the -[http://code.google.com/p/cityhash CityHash hash function]). The two -of them together should be a formidable force. For good. - -I bumped the major version number up to 2 to reflect the new community -ownership of the project. All the -[http://sparsehash.googlecode.com/svn/tags/sparsehash-2.0/ChangeLog changes] -are related to the renaming. - -The only functional change from sparsehash 1.12 is that I've renamed -the `google/` include-directory to be `sparsehash/` instead. New code -should `#include `/etc. I've kept the old -names around as forwarding headers to the new, so `#include -` will continue to work. - -Note that the classes and functions remain in the `google` C++ -namespace (I didn't change that to `sparsehash` as well); I think -that's a trickier transition, and can happen in a future release. - - -=== 18 January 2011 === - -The `google-sparsehash` Google Code page has been renamed to -`sparsehash`, in preparation for the project being renamed to -`sparsehash`. In the coming weeks, I'll be stepping down as -maintainer for the sparsehash project, and as part of that Google is -relinquishing ownership of the project; it will now be entirely -community run. The name change reflects that shift. - - -=== 20 December 2011 === - -I've just released sparsehash 1.12. This release features improved -I/O (serialization) support. Support is finally added to serialize -and unserialize `dense_hash_map`/`set`, paralleling the existing code -for `sparse_hash_map`/`set`. In addition, the serialization API has -gotten simpler, with a single `serialize()` method to write to disk, -and an `unserialize()` method to read from disk. Finally, support has -gotten more generic, with built-in support for both C `FILE*`s and C++ -streams, and an extension mechanism to support arbitrary sources and -sinks. - -There are also more minor changes, including minor bugfixes, an -improved deleted-key test, and a minor addition to the `sparsetable` -API. See the [http://google-sparsehash.googlecode.com/svn/tags/sparsehash-1.12/ChangeLog ChangeLog] -for full details. - -=== 23 June 2011 === - -I've just released sparsehash 1.11. The major user-visible change is -that the default behavior is improved -- using the hash_map/set is -faster -- for hashtables where the key is a pointer. We now notice -that case and ignore the low 2-3 bits (which are almost always 0 for -pointers) when hashing. - -Another user-visible change is we've removed the tests for whether the -STL (vector, pair, etc) is defined in the 'std' namespace. gcc 2.95 -is the most recent compiler I know of to put STL types and functions -in the global namespace. If you need to use such an old compiler, do -not update to the latest sparsehash release. - -We've also changed the internal tools we use to integrate -Googler-supplied patches to sparsehash into the opensource release. -These new tools should result in more frequent updates with better -change descriptions. They will also result in future ChangeLog -entries being much more verbose (for better or for worse). 
- -A full list of changes is described in -[http://google-sparsehash.googlecode.com/svn/tags/sparsehash-1.11/ChangeLog ChangeLog]. - -=== 21 January 2011 === - -I've just released sparsehash 1.10. This fixes a performance -regression in sparsehash 1.8, where sparse_hash_map would copy -hashtable keys by value even when the key was explicitly a reference. -It also fixes compiler warnings from MSVC 10, which uses some c++0x -features that did not interact well with sparsehash. - -There is no reason to upgrade unless you use references for your -hashtable keys, or compile with MSVC 10. A full list of changes is -described in -[http://google-sparsehash.googlecode.com/svn/tags/sparsehash-1.10/ChangeLog ChangeLog]. - - -=== 24 September 2010 === - -I've just released sparsehash 1.9. This fixes a size regression in -sparsehash 1.8, where the new allocator would take up space in -`sparse_hash_map`, doubling the sparse_hash_map overhead (from 1-2 -bits per bucket to 3 or so). All users are encouraged to upgrade. - -This change also marks enums as being Plain Old Data, which can speed -up hashtables with enum keys and/or values. A full list of changes is -described in -[http://google-sparsehash.googlecode.com/svn/tags/sparsehash-1.9/ChangeLog ChangeLog]. - -=== 29 July 2010 === - -I've just released sparsehash 1.8. This includes improved support for -`Allocator`, including supporting the allocator constructor arg and -`get_allocator()` access method. - -To work around a bug in gcc 4.0.x, I've renamed the static variables -`HT_OCCUPANCY_FLT` and `HT_SHRINK_FLT` to `HT_OCCUPANCY_PCT` and -`HT_SHRINK_PCT`, and changed their type from float to int. This -should not be a user-visible change, since these variables are only -used in the internal hashtable classes (sparsehash clients should use -`max_load_factor()` and `min_load_factor()` instead of modifying these -static variables), but if you do access these constants, you will need -to change your code. - -Internally, the biggest change is a revamp of the test suite. It now -has more complete coverage, and a more capable timing tester. There -are other, more minor changes as well. A full list of changes is -described in the -[http://google-sparsehash.googlecode.com/svn/tags/sparsehash-1.8/ChangeLog ChangeLog]. - -=== 31 March 2010 === - -I've just released sparsehash 1.7. The major news here is the -addition of `Allocator` support. Previously, these hashtable classes -would just ignore the `Allocator` template parameter. They now -respect it, and even inherit `size_type`, `pointer`, etc. from the -allocator class. By default, they use a special allocator we provide -that uses libc `malloc` and `free` to allocate. The hash classes -notice when this special allocator is being used, and use `realloc` -when it can. This means that the default allocator is significantly -faster than custom allocators are likely to be (since realloc-like -functionality is not supported by STL allocators). - -There are a few more minor changes as well. A full list of changes is -described in the -[http://google-sparsehash.googlecode.com/svn/tags/sparsehash-1.7/ChangeLog ChangeLog]. - -=== 11 January 2010 === - -I've just released sparsehash 1.6. The API has widened a bit with the -addition of `deleted_key()` and `empty_key()`, which let you query -what values these keys have. A few rather obscure bugs have been -fixed (such as an error when copying one hashtable into another when -the empty_keys differ). 
A full list of changes is described in the -[http://google-sparsehash.googlecode.com/svn/tags/sparsehash-1.6/ChangeLog ChangeLog]. - -=== 9 May 2009 === - -I've just released sparsehash 1.5.1. Hot on the heels of sparsehash -1.5, this release fixes a longstanding bug in the sparsehash code, -where `equal_range` would always return an empty range. It now works -as documented. All sparsehash users are encouraged to upgrade. - -=== 7 May 2009 === - -I've just released sparsehash 1.5. This release introduces tr1 -compatibility: I've added `rehash`, `begin(i)`, and other methods that -are expected to be part of the `unordered_map` API once `tr1` in -introduced. This allows `sparse_hash_map`, `dense_hash_map`, -`sparse_hash_set`, and `dense_hash_set` to be (almost) drop-in -replacements for `unordered_map` and `unordered_set`. - -There is no need to upgrade unless you need this functionality, or -need one of the other, more minor, changes described in the -[http://google-sparsehash.googlecode.com/svn/tags/sparsehash-1.5/ChangeLog ChangeLog]. - diff --git a/contrib/libsparsehash/README b/contrib/libsparsehash/README deleted file mode 100644 index 527cfa1d304..00000000000 --- a/contrib/libsparsehash/README +++ /dev/null @@ -1,143 +0,0 @@ -This directory contains several hash-map implementations, similar in -API to SGI's hash_map class, but with different performance -characteristics. sparse_hash_map uses very little space overhead, 1-2 -bits per entry. dense_hash_map is very fast, particularly on lookup. -(sparse_hash_set and dense_hash_set are the set versions of these -routines.) On the other hand, these classes have requirements that -may not make them appropriate for all applications. - -All these implementation use a hashtable with internal quadratic -probing. This method is space-efficient -- there is no pointer -overhead -- and time-efficient for good hash functions. - -COMPILING ---------- -To compile test applications with these classes, run ./configure -followed by make. To install these header files on your system, run -'make install'. (On Windows, the instructions are different; see -README_windows.txt.) See INSTALL for more details. - -This code should work on any modern C++ system. It has been tested on -Linux (Ubuntu, Fedora, RedHat, Debian), Solaris 10 x86, FreeBSD 6.0, -OS X 10.3 and 10.4, and Windows under both VC++7 and VC++8. - -USING ------ -See the html files in the doc directory for small example programs -that use these classes. It's enough to just include the header file: - - #include // or sparse_hash_set, dense_hash_map, ... - google::sparse_hash_set number_mapper; - -and use the class the way you would other hash-map implementations. -(Though see "API" below for caveats.) - -By default (you can change it via a flag to ./configure), these hash -implementations are defined in the google namespace. - -API ---- -The API for sparse_hash_map, dense_hash_map, sparse_hash_set, and -dense_hash_set, are a superset of the API of SGI's hash_map class. -See doc/sparse_hash_map.html, et al., for more information about the -API. - -The usage of these classes differ from SGI's hash_map, and other -hashtable implementations, in the following major ways: - -1) dense_hash_map requires you to set aside one key value as the - 'empty bucket' value, set via the set_empty_key() method. This - *MUST* be called before you can use the dense_hash_map. It is - illegal to insert any elements into a dense_hash_map whose key is - equal to the empty-key. 
- -2) For both dense_hash_map and sparse_hash_map, if you wish to delete - elements from the hashtable, you must set aside a key value as the - 'deleted bucket' value, set via the set_deleted_key() method. If - your hash-map is insert-only, there is no need to call this - method. If you call set_deleted_key(), it is illegal to insert any - elements into a dense_hash_map or sparse_hash_map whose key is - equal to the deleted-key. - -3) These hash-map implementation support I/O. See below. - -There are also some smaller differences: - -1) The constructor takes an optional argument that specifies the - number of elements you expect to insert into the hashtable. This - differs from SGI's hash_map implementation, which takes an optional - number of buckets. - -2) erase() does not immediately reclaim memory. As a consequence, - erase() does not invalidate any iterators, making loops like this - correct: - for (it = ht.begin(); it != ht.end(); ++it) - if (...) ht.erase(it); - As another consequence, a series of erase() calls can leave your - hashtable using more memory than it needs to. The hashtable will - automatically compact at the next call to insert(), but to - manually compact a hashtable, you can call - ht.resize(0) - -I/O ---- -In addition to the normal hash-map operations, sparse_hash_map can -read and write hashtables to disk. (dense_hash_map also has the API, -but it has not yet been implemented, and writes will always fail.) - -In the simplest case, writing a hashtable is as easy as calling two -methods on the hashtable: - ht.write_metadata(fp); - ht.write_nopointer_data(fp); - -Reading in this data is equally simple: - google::sparse_hash_map<...> ht; - ht.read_metadata(fp); - ht.read_nopointer_data(fp); - -The above is sufficient if the key and value do not contain any -pointers: they are basic C types or agglomorations of basic C types. -If the key and/or value do contain pointers, you can still store the -hashtable by replacing write_nopointer_data() with a custom writing -routine. See sparse_hash_map.html et al. for more information. - -SPARSETABLE ------------ -In addition to the hash-map and hash-set classes, this package also -provides sparsetable.h, an array implementation that uses space -proportional to the number of elements in the array, rather than the -maximum element index. It uses very little space overhead: 1 bit per -entry. See doc/sparsetable.html for the API. - -RESOURCE USAGE --------------- -* sparse_hash_map has memory overhead of about 2 bits per hash-map - entry. -* dense_hash_map has a factor of 2-3 memory overhead: if your - hashtable data takes X bytes, dense_hash_map will use 3X-4X memory - total. - -Hashtables tend to double in size when resizing, creating an -additional 50% space overhead. dense_hash_map does in fact have a -significant "high water mark" memory use requirement. -sparse_hash_map, however, is written to need very little space -overhead when resizing: only a few bits per hashtable entry. - -PERFORMANCE ------------ -You can compile and run the included file time_hash_map.cc to examine -the performance of sparse_hash_map, dense_hash_map, and your native -hash_map implementation on your system. One test against the -SGI hash_map implementation gave the following timing information for -a simple find() call: - SGI hash_map: 22 ns - dense_hash_map: 13 ns - sparse_hash_map: 117 ns - SGI map: 113 ns - -See doc/performance.html for more detailed charts on resource usage -and performance data. 
- ---- -16 March 2005 -(Last updated: 12 September 2010) diff --git a/contrib/libsparsehash/sparsehash/dense_hash_map b/contrib/libsparsehash/sparsehash/dense_hash_map deleted file mode 100644 index 05fd580e644..00000000000 --- a/contrib/libsparsehash/sparsehash/dense_hash_map +++ /dev/null @@ -1,369 +0,0 @@ -// Copyright (c) 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// ---- -// -// This is just a very thin wrapper over densehashtable.h, just -// like sgi stl's stl_hash_map is a very thin wrapper over -// stl_hashtable. The major thing we define is operator[], because -// we have a concept of a data_type which stl_hashtable doesn't -// (it only has a key and a value). -// -// NOTE: this is exactly like sparse_hash_map.h, with the word -// "sparse" replaced by "dense", except for the addition of -// set_empty_key(). -// -// YOU MUST CALL SET_EMPTY_KEY() IMMEDIATELY AFTER CONSTRUCTION. -// -// Otherwise your program will die in mysterious ways. (Note if you -// use the constructor that takes an InputIterator range, you pass in -// the empty key in the constructor, rather than after. As a result, -// this constructor differs from the standard STL version.) -// -// In other respects, we adhere mostly to the STL semantics for -// hash-map. One important exception is that insert() may invalidate -// iterators entirely -- STL semantics are that insert() may reorder -// iterators, but they all still refer to something valid in the -// hashtable. Not so for us. Likewise, insert() may invalidate -// pointers into the hashtable. (Whether insert invalidates iterators -// and pointers depends on whether it results in a hashtable resize). -// On the plus side, delete() doesn't invalidate iterators or pointers -// at all, or even change the ordering of elements. -// -// Here are a few "power user" tips: -// -// 1) set_deleted_key(): -// If you want to use erase() you *must* call set_deleted_key(), -// in addition to set_empty_key(), after construction. -// The deleted and empty keys must differ. 
-// -// 2) resize(0): -// When an item is deleted, its memory isn't freed right -// away. This allows you to iterate over a hashtable, -// and call erase(), without invalidating the iterator. -// To force the memory to be freed, call resize(0). -// For tr1 compatibility, this can also be called as rehash(0). -// -// 3) min_load_factor(0.0) -// Setting the minimum load factor to 0.0 guarantees that -// the hash table will never shrink. -// -// Roughly speaking: -// (1) dense_hash_map: fastest, uses the most memory unless entries are small -// (2) sparse_hash_map: slowest, uses the least memory -// (3) hash_map / unordered_map (STL): in the middle -// -// Typically I use sparse_hash_map when I care about space and/or when -// I need to save the hashtable on disk. I use hash_map otherwise. I -// don't personally use dense_hash_set ever; some people use it for -// small sets with lots of lookups. -// -// - dense_hash_map has, typically, about 78% memory overhead (if your -// data takes up X bytes, the hash_map uses .78X more bytes in overhead). -// - sparse_hash_map has about 4 bits overhead per entry. -// - sparse_hash_map can be 3-7 times slower than the others for lookup and, -// especially, inserts. See time_hash_map.cc for details. -// -// See /usr/(local/)?doc/sparsehash-*/dense_hash_map.html -// for information about how to use this class. - -#ifndef _DENSE_HASH_MAP_H_ -#define _DENSE_HASH_MAP_H_ - -#include -#include // needed by stl_alloc -#include // for equal_to<>, select1st<>, etc -#include // for alloc -#include // for pair<> -#include // IWYU pragma: export -#include -#include HASH_FUN_H // for hash<> -_START_GOOGLE_NAMESPACE_ - -template , // defined in sparseconfig.h - class EqualKey = std::equal_to, - class Alloc = libc_allocator_with_realloc > > -class dense_hash_map { - private: - // Apparently select1st is not stl-standard, so we define our own - struct SelectKey { - typedef const Key& result_type; - const Key& operator()(const std::pair& p) const { - return p.first; - } - }; - struct SetKey { - void operator()(std::pair* value, const Key& new_key) const { - *const_cast(&value->first) = new_key; - // It would be nice to clear the rest of value here as well, in - // case it's taking up a lot of memory. We do this by clearing - // the value. This assumes T has a zero-arg constructor! - value->second = T(); - } - }; - // For operator[]. 
- struct DefaultValue { - std::pair operator()(const Key& key) { - return std::make_pair(key, T()); - } - }; - - // The actual data - typedef dense_hashtable, Key, HashFcn, SelectKey, - SetKey, EqualKey, Alloc> ht; - ht rep; - - public: - typedef typename ht::key_type key_type; - typedef T data_type; - typedef T mapped_type; - typedef typename ht::value_type value_type; - typedef typename ht::hasher hasher; - typedef typename ht::key_equal key_equal; - typedef Alloc allocator_type; - - typedef typename ht::size_type size_type; - typedef typename ht::difference_type difference_type; - typedef typename ht::pointer pointer; - typedef typename ht::const_pointer const_pointer; - typedef typename ht::reference reference; - typedef typename ht::const_reference const_reference; - - typedef typename ht::iterator iterator; - typedef typename ht::const_iterator const_iterator; - typedef typename ht::local_iterator local_iterator; - typedef typename ht::const_local_iterator const_local_iterator; - - // Iterator functions - iterator begin() { return rep.begin(); } - iterator end() { return rep.end(); } - const_iterator begin() const { return rep.begin(); } - const_iterator end() const { return rep.end(); } - - - // These come from tr1's unordered_map. For us, a bucket has 0 or 1 elements. - local_iterator begin(size_type i) { return rep.begin(i); } - local_iterator end(size_type i) { return rep.end(i); } - const_local_iterator begin(size_type i) const { return rep.begin(i); } - const_local_iterator end(size_type i) const { return rep.end(i); } - - // Accessor functions - allocator_type get_allocator() const { return rep.get_allocator(); } - hasher hash_funct() const { return rep.hash_funct(); } - hasher hash_function() const { return hash_funct(); } - key_equal key_eq() const { return rep.key_eq(); } - - - // Constructors - explicit dense_hash_map(size_type expected_max_items_in_table = 0, - const hasher& hf = hasher(), - const key_equal& eql = key_equal(), - const allocator_type& alloc = allocator_type()) - : rep(expected_max_items_in_table, hf, eql, SelectKey(), SetKey(), alloc) { - } - - template - dense_hash_map(InputIterator f, InputIterator l, - const key_type& empty_key_val, - size_type expected_max_items_in_table = 0, - const hasher& hf = hasher(), - const key_equal& eql = key_equal(), - const allocator_type& alloc = allocator_type()) - : rep(expected_max_items_in_table, hf, eql, SelectKey(), SetKey(), alloc) { - set_empty_key(empty_key_val); - rep.insert(f, l); - } - // We use the default copy constructor - // We use the default operator=() - // We use the default destructor - - void clear() { rep.clear(); } - // This clears the hash map without resizing it down to the minimum - // bucket count, but rather keeps the number of buckets constant - void clear_no_resize() { rep.clear_no_resize(); } - void swap(dense_hash_map& hs) { rep.swap(hs.rep); } - - - // Functions concerning size - size_type size() const { return rep.size(); } - size_type max_size() const { return rep.max_size(); } - bool empty() const { return rep.empty(); } - size_type bucket_count() const { return rep.bucket_count(); } - size_type max_bucket_count() const { return rep.max_bucket_count(); } - - // These are tr1 methods. bucket() is the bucket the key is or would be in. 
- size_type bucket_size(size_type i) const { return rep.bucket_size(i); } - size_type bucket(const key_type& key) const { return rep.bucket(key); } - float load_factor() const { - return size() * 1.0f / bucket_count(); - } - float max_load_factor() const { - float shrink, grow; - rep.get_resizing_parameters(&shrink, &grow); - return grow; - } - void max_load_factor(float new_grow) { - float shrink, grow; - rep.get_resizing_parameters(&shrink, &grow); - rep.set_resizing_parameters(shrink, new_grow); - } - // These aren't tr1 methods but perhaps ought to be. - float min_load_factor() const { - float shrink, grow; - rep.get_resizing_parameters(&shrink, &grow); - return shrink; - } - void min_load_factor(float new_shrink) { - float shrink, grow; - rep.get_resizing_parameters(&shrink, &grow); - rep.set_resizing_parameters(new_shrink, grow); - } - // Deprecated; use min_load_factor() or max_load_factor() instead. - void set_resizing_parameters(float shrink, float grow) { - rep.set_resizing_parameters(shrink, grow); - } - - void resize(size_type hint) { rep.resize(hint); } - void rehash(size_type hint) { resize(hint); } // the tr1 name - - // Lookup routines - iterator find(const key_type& key) { return rep.find(key); } - const_iterator find(const key_type& key) const { return rep.find(key); } - - data_type& operator[](const key_type& key) { // This is our value-add! - // If key is in the hashtable, returns find(key)->second, - // otherwise returns insert(value_type(key, T()).first->second. - // Note it does not create an empty T unless the find fails. - return rep.template find_or_insert(key).second; - } - - size_type count(const key_type& key) const { return rep.count(key); } - - std::pair equal_range(const key_type& key) { - return rep.equal_range(key); - } - std::pair equal_range(const key_type& key) - const { - return rep.equal_range(key); - } - - - // Insertion routines - std::pair insert(const value_type& obj) { - return rep.insert(obj); - } - template void insert(InputIterator f, InputIterator l) { - rep.insert(f, l); - } - void insert(const_iterator f, const_iterator l) { - rep.insert(f, l); - } - // Required for std::insert_iterator; the passed-in iterator is ignored. - iterator insert(iterator, const value_type& obj) { - return insert(obj).first; - } - - // Deletion and empty routines - // THESE ARE NON-STANDARD! I make you specify an "impossible" key - // value to identify deleted and empty buckets. You can change the - // deleted key as time goes on, or get rid of it entirely to be insert-only. - void set_empty_key(const key_type& key) { // YOU MUST CALL THIS! - rep.set_empty_key(value_type(key, data_type())); // rep wants a value - } - key_type empty_key() const { - return rep.empty_key().first; // rep returns a value - } - - void set_deleted_key(const key_type& key) { rep.set_deleted_key(key); } - void clear_deleted_key() { rep.clear_deleted_key(); } - key_type deleted_key() const { return rep.deleted_key(); } - - // These are standard - size_type erase(const key_type& key) { return rep.erase(key); } - void erase(iterator it) { rep.erase(it); } - void erase(iterator f, iterator l) { rep.erase(f, l); } - - - // Comparison - bool operator==(const dense_hash_map& hs) const { return rep == hs.rep; } - bool operator!=(const dense_hash_map& hs) const { return rep != hs.rep; } - - - // I/O -- this is an add-on for writing hash map to disk - // - // For maximum flexibility, this does not assume a particular - // file type (though it will probably be a FILE *). 
We just pass - // the fp through to rep. - - // If your keys and values are simple enough, you can pass this - // serializer to serialize()/unserialize(). "Simple enough" means - // value_type is a POD type that contains no pointers. Note, - // however, we don't try to normalize endianness. - typedef typename ht::NopointerSerializer NopointerSerializer; - - // serializer: a class providing operator()(OUTPUT*, const value_type&) - // (writing value_type to OUTPUT). You can specify a - // NopointerSerializer object if appropriate (see above). - // fp: either a FILE*, OR an ostream*/subclass_of_ostream*, OR a - // pointer to a class providing size_t Write(const void*, size_t), - // which writes a buffer into a stream (which fp presumably - // owns) and returns the number of bytes successfully written. - // Note basic_ostream is not currently supported. - template - bool serialize(ValueSerializer serializer, OUTPUT* fp) { - return rep.serialize(serializer, fp); - } - - // serializer: a functor providing operator()(INPUT*, value_type*) - // (reading from INPUT and into value_type). You can specify a - // NopointerSerializer object if appropriate (see above). - // fp: either a FILE*, OR an istream*/subclass_of_istream*, OR a - // pointer to a class providing size_t Read(void*, size_t), - // which reads into a buffer from a stream (which fp presumably - // owns) and returns the number of bytes successfully read. - // Note basic_istream is not currently supported. - // NOTE: Since value_type is std::pair, ValueSerializer - // may need to do a const cast in order to fill in the key. - template - bool unserialize(ValueSerializer serializer, INPUT* fp) { - return rep.unserialize(serializer, fp); - } -}; - -// We need a global swap as well -template -inline void swap(dense_hash_map& hm1, - dense_hash_map& hm2) { - hm1.swap(hm2); -} - -_END_GOOGLE_NAMESPACE_ - -#endif /* _DENSE_HASH_MAP_H_ */ diff --git a/contrib/libsparsehash/sparsehash/dense_hash_set b/contrib/libsparsehash/sparsehash/dense_hash_set deleted file mode 100644 index 050b15d1d5d..00000000000 --- a/contrib/libsparsehash/sparsehash/dense_hash_set +++ /dev/null @@ -1,338 +0,0 @@ -// Copyright (c) 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// --- -// -// This is just a very thin wrapper over densehashtable.h, just -// like sgi stl's stl_hash_set is a very thin wrapper over -// stl_hashtable. The major thing we define is operator[], because -// we have a concept of a data_type which stl_hashtable doesn't -// (it only has a key and a value). -// -// This is more different from dense_hash_map than you might think, -// because all iterators for sets are const (you obviously can't -// change the key, and for sets there is no value). -// -// NOTE: this is exactly like sparse_hash_set.h, with the word -// "sparse" replaced by "dense", except for the addition of -// set_empty_key(). -// -// YOU MUST CALL SET_EMPTY_KEY() IMMEDIATELY AFTER CONSTRUCTION. -// -// Otherwise your program will die in mysterious ways. (Note if you -// use the constructor that takes an InputIterator range, you pass in -// the empty key in the constructor, rather than after. As a result, -// this constructor differs from the standard STL version.) -// -// In other respects, we adhere mostly to the STL semantics for -// hash-map. One important exception is that insert() may invalidate -// iterators entirely -- STL semantics are that insert() may reorder -// iterators, but they all still refer to something valid in the -// hashtable. Not so for us. Likewise, insert() may invalidate -// pointers into the hashtable. (Whether insert invalidates iterators -// and pointers depends on whether it results in a hashtable resize). -// On the plus side, delete() doesn't invalidate iterators or pointers -// at all, or even change the ordering of elements. -// -// Here are a few "power user" tips: -// -// 1) set_deleted_key(): -// If you want to use erase() you must call set_deleted_key(), -// in addition to set_empty_key(), after construction. -// The deleted and empty keys must differ. -// -// 2) resize(0): -// When an item is deleted, its memory isn't freed right -// away. This allows you to iterate over a hashtable, -// and call erase(), without invalidating the iterator. -// To force the memory to be freed, call resize(0). -// For tr1 compatibility, this can also be called as rehash(0). -// -// 3) min_load_factor(0.0) -// Setting the minimum load factor to 0.0 guarantees that -// the hash table will never shrink. -// -// Roughly speaking: -// (1) dense_hash_set: fastest, uses the most memory unless entries are small -// (2) sparse_hash_set: slowest, uses the least memory -// (3) hash_set / unordered_set (STL): in the middle -// -// Typically I use sparse_hash_set when I care about space and/or when -// I need to save the hashtable on disk. I use hash_set otherwise. I -// don't personally use dense_hash_set ever; some people use it for -// small sets with lots of lookups. -// -// - dense_hash_set has, typically, about 78% memory overhead (if your -// data takes up X bytes, the hash_set uses .78X more bytes in overhead). -// - sparse_hash_set has about 4 bits overhead per entry. 
-// - sparse_hash_set can be 3-7 times slower than the others for lookup and, -// especially, inserts. See time_hash_map.cc for details. -// -// See /usr/(local/)?doc/sparsehash-*/dense_hash_set.html -// for information about how to use this class. - -#ifndef _DENSE_HASH_SET_H_ -#define _DENSE_HASH_SET_H_ - -#include -#include // needed by stl_alloc -#include // for equal_to<>, select1st<>, etc -#include // for alloc -#include // for pair<> -#include // IWYU pragma: export -#include -#include HASH_FUN_H // for hash<> -_START_GOOGLE_NAMESPACE_ - -template , // defined in sparseconfig.h - class EqualKey = std::equal_to, - class Alloc = libc_allocator_with_realloc > -class dense_hash_set { - private: - // Apparently identity is not stl-standard, so we define our own - struct Identity { - typedef const Value& result_type; - const Value& operator()(const Value& v) const { return v; } - }; - struct SetKey { - void operator()(Value* value, const Value& new_key) const { - *value = new_key; - } - }; - - // The actual data - typedef dense_hashtable ht; - ht rep; - - public: - typedef typename ht::key_type key_type; - typedef typename ht::value_type value_type; - typedef typename ht::hasher hasher; - typedef typename ht::key_equal key_equal; - typedef Alloc allocator_type; - - typedef typename ht::size_type size_type; - typedef typename ht::difference_type difference_type; - typedef typename ht::const_pointer pointer; - typedef typename ht::const_pointer const_pointer; - typedef typename ht::const_reference reference; - typedef typename ht::const_reference const_reference; - - typedef typename ht::const_iterator iterator; - typedef typename ht::const_iterator const_iterator; - typedef typename ht::const_local_iterator local_iterator; - typedef typename ht::const_local_iterator const_local_iterator; - - - // Iterator functions -- recall all iterators are const - iterator begin() const { return rep.begin(); } - iterator end() const { return rep.end(); } - - // These come from tr1's unordered_set. For us, a bucket has 0 or 1 elements. 
- local_iterator begin(size_type i) const { return rep.begin(i); } - local_iterator end(size_type i) const { return rep.end(i); } - - - // Accessor functions - allocator_type get_allocator() const { return rep.get_allocator(); } - hasher hash_funct() const { return rep.hash_funct(); } - hasher hash_function() const { return hash_funct(); } // tr1 name - key_equal key_eq() const { return rep.key_eq(); } - - - // Constructors - explicit dense_hash_set(size_type expected_max_items_in_table = 0, - const hasher& hf = hasher(), - const key_equal& eql = key_equal(), - const allocator_type& alloc = allocator_type()) - : rep(expected_max_items_in_table, hf, eql, Identity(), SetKey(), alloc) { - } - - template - dense_hash_set(InputIterator f, InputIterator l, - const key_type& empty_key_val, - size_type expected_max_items_in_table = 0, - const hasher& hf = hasher(), - const key_equal& eql = key_equal(), - const allocator_type& alloc = allocator_type()) - : rep(expected_max_items_in_table, hf, eql, Identity(), SetKey(), alloc) { - set_empty_key(empty_key_val); - rep.insert(f, l); - } - // We use the default copy constructor - // We use the default operator=() - // We use the default destructor - - void clear() { rep.clear(); } - // This clears the hash set without resizing it down to the minimum - // bucket count, but rather keeps the number of buckets constant - void clear_no_resize() { rep.clear_no_resize(); } - void swap(dense_hash_set& hs) { rep.swap(hs.rep); } - - - // Functions concerning size - size_type size() const { return rep.size(); } - size_type max_size() const { return rep.max_size(); } - bool empty() const { return rep.empty(); } - size_type bucket_count() const { return rep.bucket_count(); } - size_type max_bucket_count() const { return rep.max_bucket_count(); } - - // These are tr1 methods. bucket() is the bucket the key is or would be in. - size_type bucket_size(size_type i) const { return rep.bucket_size(i); } - size_type bucket(const key_type& key) const { return rep.bucket(key); } - float load_factor() const { - return size() * 1.0f / bucket_count(); - } - float max_load_factor() const { - float shrink, grow; - rep.get_resizing_parameters(&shrink, &grow); - return grow; - } - void max_load_factor(float new_grow) { - float shrink, grow; - rep.get_resizing_parameters(&shrink, &grow); - rep.set_resizing_parameters(shrink, new_grow); - } - // These aren't tr1 methods but perhaps ought to be. - float min_load_factor() const { - float shrink, grow; - rep.get_resizing_parameters(&shrink, &grow); - return shrink; - } - void min_load_factor(float new_shrink) { - float shrink, grow; - rep.get_resizing_parameters(&shrink, &grow); - rep.set_resizing_parameters(new_shrink, grow); - } - // Deprecated; use min_load_factor() or max_load_factor() instead. 
- void set_resizing_parameters(float shrink, float grow) { - rep.set_resizing_parameters(shrink, grow); - } - - void resize(size_type hint) { rep.resize(hint); } - void rehash(size_type hint) { resize(hint); } // the tr1 name - - // Lookup routines - iterator find(const key_type& key) const { return rep.find(key); } - - size_type count(const key_type& key) const { return rep.count(key); } - - std::pair equal_range(const key_type& key) const { - return rep.equal_range(key); - } - - - // Insertion routines - std::pair insert(const value_type& obj) { - std::pair p = rep.insert(obj); - return std::pair(p.first, p.second); // const to non-const - } - template void insert(InputIterator f, InputIterator l) { - rep.insert(f, l); - } - void insert(const_iterator f, const_iterator l) { - rep.insert(f, l); - } - // Required for std::insert_iterator; the passed-in iterator is ignored. - iterator insert(iterator, const value_type& obj) { - return insert(obj).first; - } - - // Deletion and empty routines - // THESE ARE NON-STANDARD! I make you specify an "impossible" key - // value to identify deleted and empty buckets. You can change the - // deleted key as time goes on, or get rid of it entirely to be insert-only. - void set_empty_key(const key_type& key) { rep.set_empty_key(key); } - key_type empty_key() const { return rep.empty_key(); } - - void set_deleted_key(const key_type& key) { rep.set_deleted_key(key); } - void clear_deleted_key() { rep.clear_deleted_key(); } - key_type deleted_key() const { return rep.deleted_key(); } - - // These are standard - size_type erase(const key_type& key) { return rep.erase(key); } - void erase(iterator it) { rep.erase(it); } - void erase(iterator f, iterator l) { rep.erase(f, l); } - - - // Comparison - bool operator==(const dense_hash_set& hs) const { return rep == hs.rep; } - bool operator!=(const dense_hash_set& hs) const { return rep != hs.rep; } - - - // I/O -- this is an add-on for writing metainformation to disk - // - // For maximum flexibility, this does not assume a particular - // file type (though it will probably be a FILE *). We just pass - // the fp through to rep. - - // If your keys and values are simple enough, you can pass this - // serializer to serialize()/unserialize(). "Simple enough" means - // value_type is a POD type that contains no pointers. Note, - // however, we don't try to normalize endianness. - typedef typename ht::NopointerSerializer NopointerSerializer; - - // serializer: a class providing operator()(OUTPUT*, const value_type&) - // (writing value_type to OUTPUT). You can specify a - // NopointerSerializer object if appropriate (see above). - // fp: either a FILE*, OR an ostream*/subclass_of_ostream*, OR a - // pointer to a class providing size_t Write(const void*, size_t), - // which writes a buffer into a stream (which fp presumably - // owns) and returns the number of bytes successfully written. - // Note basic_ostream is not currently supported. - template - bool serialize(ValueSerializer serializer, OUTPUT* fp) { - return rep.serialize(serializer, fp); - } - - // serializer: a functor providing operator()(INPUT*, value_type*) - // (reading from INPUT and into value_type). You can specify a - // NopointerSerializer object if appropriate (see above). - // fp: either a FILE*, OR an istream*/subclass_of_istream*, OR a - // pointer to a class providing size_t Read(void*, size_t), - // which reads into a buffer from a stream (which fp presumably - // owns) and returns the number of bytes successfully read. 
- // Note basic_istream is not currently supported. - template - bool unserialize(ValueSerializer serializer, INPUT* fp) { - return rep.unserialize(serializer, fp); - } -}; - -template -inline void swap(dense_hash_set& hs1, - dense_hash_set& hs2) { - hs1.swap(hs2); -} - -_END_GOOGLE_NAMESPACE_ - -#endif /* _DENSE_HASH_SET_H_ */ diff --git a/contrib/libsparsehash/sparsehash/internal/densehashtable.h b/contrib/libsparsehash/sparsehash/internal/densehashtable.h deleted file mode 100644 index 1f0c943afbe..00000000000 --- a/contrib/libsparsehash/sparsehash/internal/densehashtable.h +++ /dev/null @@ -1,1319 +0,0 @@ -// Copyright (c) 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// --- -// -// A dense hashtable is a particular implementation of -// a hashtable: one that is meant to minimize memory allocation. -// It does this by using an array to store all the data. We -// steal a value from the key space to indicate "empty" array -// elements (ie indices where no item lives) and another to indicate -// "deleted" elements. -// -// (Note it is possible to change the value of the delete key -// on the fly; you can even remove it, though after that point -// the hashtable is insert_only until you set it again. The empty -// value however can't be changed.) -// -// To minimize allocation and pointer overhead, we use internal -// probing, in which the hashtable is a single table, and collisions -// are resolved by trying to insert again in another bucket. The -// most cache-efficient internal probing schemes are linear probing -// (which suffers, alas, from clumping) and quadratic probing, which -// is what we implement by default. -// -// Type requirements: value_type is required to be Copy Constructible -// and Default Constructible. It is not required to be (and commonly -// isn't) Assignable. -// -// You probably shouldn't use this code directly. Use dense_hash_map<> -// or dense_hash_set<> instead. 
- -// You can change the following below: -// HT_OCCUPANCY_PCT -- how full before we double size -// HT_EMPTY_PCT -- how empty before we halve size -// HT_MIN_BUCKETS -- default smallest bucket size -// -// You can also change enlarge_factor (which defaults to -// HT_OCCUPANCY_PCT), and shrink_factor (which defaults to -// HT_EMPTY_PCT) with set_resizing_parameters(). -// -// How to decide what values to use? -// shrink_factor's default of .4 * OCCUPANCY_PCT, is probably good. -// HT_MIN_BUCKETS is probably unnecessary since you can specify -// (indirectly) the starting number of buckets at construct-time. -// For enlarge_factor, you can use this chart to try to trade-off -// expected lookup time to the space taken up. By default, this -// code uses quadratic probing, though you can change it to linear -// via JUMP_ below if you really want to. -// -// From http://www.augustana.ca/~mohrj/courses/1999.fall/csc210/lecture_notes/hashing.html -// NUMBER OF PROBES / LOOKUP Successful Unsuccessful -// Quadratic collision resolution 1 - ln(1-L) - L/2 1/(1-L) - L - ln(1-L) -// Linear collision resolution [1+1/(1-L)]/2 [1+1/(1-L)2]/2 -// -// -- enlarge_factor -- 0.10 0.50 0.60 0.75 0.80 0.90 0.99 -// QUADRATIC COLLISION RES. -// probes/successful lookup 1.05 1.44 1.62 2.01 2.21 2.85 5.11 -// probes/unsuccessful lookup 1.11 2.19 2.82 4.64 5.81 11.4 103.6 -// LINEAR COLLISION RES. -// probes/successful lookup 1.06 1.5 1.75 2.5 3.0 5.5 50.5 -// probes/unsuccessful lookup 1.12 2.5 3.6 8.5 13.0 50.0 5000.0 - -#ifndef _DENSEHASHTABLE_H_ -#define _DENSEHASHTABLE_H_ - -#include -#include -#include // for FILE, fwrite, fread -#include // For swap(), eg -#include // For iterator tags -#include // for numeric_limits -#include // For uninitialized_fill -#include // for pair -#include -#include -#include -#include // For length_error - -_START_GOOGLE_NAMESPACE_ - -namespace base { // just to make google->opensource transition easier -using GOOGLE_NAMESPACE::true_type; -using GOOGLE_NAMESPACE::false_type; -using GOOGLE_NAMESPACE::integral_constant; -using GOOGLE_NAMESPACE::is_same; -using GOOGLE_NAMESPACE::remove_const; -} - -// The probing method -// Linear probing -// #define JUMP_(key, num_probes) ( 1 ) -// Quadratic probing -#define JUMP_(key, num_probes) ( num_probes ) - -// Hashtable class, used to implement the hashed associative containers -// hash_set and hash_map. - -// Value: what is stored in the table (each bucket is a Value). -// Key: something in a 1-to-1 correspondence to a Value, that can be used -// to search for a Value in the table (find() takes a Key). -// HashFcn: Takes a Key and returns an integer, the more unique the better. -// ExtractKey: given a Value, returns the unique Key associated with it. -// Must inherit from unary_function, or at least have a -// result_type enum indicating the return type of operator(). -// SetKey: given a Value* and a Key, modifies the value such that -// ExtractKey(value) == key. We guarantee this is only called -// with key == deleted_key or key == empty_key. -// EqualKey: Given two Keys, says whether they are the same (that is, -// if they are both associated with the same Value). -// Alloc: STL allocator to use to allocate memory. 
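The comments above describe the default collision resolution: internal quadratic probing over a power-of-two bucket count, with JUMP_(key, num_probes) expanding to num_probes. A minimal sketch of the resulting probe sequence (the helper names are illustrative, not taken from this header):

    #include <cstddef>

    // Home bucket for a key, given a power-of-two bucket count.
    std::size_t home_bucket(std::size_t hash_value, std::size_t bucket_count)
    {
        return hash_value & (bucket_count - 1);
    }

    // One probing step: on the i-th collision the index advances by i buckets,
    // so the cumulative offsets from the home bucket are 1, 3, 6, 10, ...
    // Defining JUMP_ as a constant 1 would give plain linear probing instead.
    std::size_t next_bucket(std::size_t bucknum, std::size_t num_probes, std::size_t bucket_count)
    {
        return (bucknum + num_probes) & (bucket_count - 1);
    }

The lookup and rehashing loops later in this file apply this step, starting from home_bucket(hash(key), bucket_count) with num_probes = 1, 2, 3, ..., until they reach an empty bucket or a matching key.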
- -template -class dense_hashtable; - -template -struct dense_hashtable_iterator; - -template -struct dense_hashtable_const_iterator; - -// We're just an array, but we need to skip over empty and deleted elements -template -struct dense_hashtable_iterator { - private: - typedef typename A::template rebind::other value_alloc_type; - - public: - typedef dense_hashtable_iterator iterator; - typedef dense_hashtable_const_iterator const_iterator; - - typedef std::forward_iterator_tag iterator_category; // very little defined! - typedef V value_type; - typedef typename value_alloc_type::difference_type difference_type; - typedef typename value_alloc_type::size_type size_type; - typedef typename value_alloc_type::reference reference; - typedef typename value_alloc_type::pointer pointer; - - // "Real" constructor and default constructor - dense_hashtable_iterator(const dense_hashtable *h, - pointer it, pointer it_end, bool advance) - : ht(h), pos(it), end(it_end) { - if (advance) advance_past_empty_and_deleted(); - } - dense_hashtable_iterator() { } - // The default destructor is fine; we don't define one - // The default operator= is fine; we don't define one - - // Happy dereferencer - reference operator*() const { return *pos; } - pointer operator->() const { return &(operator*()); } - - // Arithmetic. The only hard part is making sure that - // we're not on an empty or marked-deleted array element - void advance_past_empty_and_deleted() { - while ( pos != end && (ht->test_empty(*this) || ht->test_deleted(*this)) ) - ++pos; - } - iterator& operator++() { - assert(pos != end); ++pos; advance_past_empty_and_deleted(); return *this; - } - iterator operator++(int) { iterator tmp(*this); ++*this; return tmp; } - - // Comparison. - bool operator==(const iterator& it) const { return pos == it.pos; } - bool operator!=(const iterator& it) const { return pos != it.pos; } - - - // The actual data - const dense_hashtable *ht; - pointer pos, end; -}; - - -// Now do it all again, but with const-ness! -template -struct dense_hashtable_const_iterator { - private: - typedef typename A::template rebind::other value_alloc_type; - - public: - typedef dense_hashtable_iterator iterator; - typedef dense_hashtable_const_iterator const_iterator; - - typedef std::forward_iterator_tag iterator_category; // very little defined! - typedef V value_type; - typedef typename value_alloc_type::difference_type difference_type; - typedef typename value_alloc_type::size_type size_type; - typedef typename value_alloc_type::const_reference reference; - typedef typename value_alloc_type::const_pointer pointer; - - // "Real" constructor and default constructor - dense_hashtable_const_iterator( - const dense_hashtable *h, - pointer it, pointer it_end, bool advance) - : ht(h), pos(it), end(it_end) { - if (advance) advance_past_empty_and_deleted(); - } - dense_hashtable_const_iterator() - : ht(NULL), pos(pointer()), end(pointer()) { } - // This lets us convert regular iterators to const iterators - dense_hashtable_const_iterator(const iterator &it) - : ht(it.ht), pos(it.pos), end(it.end) { } - // The default destructor is fine; we don't define one - // The default operator= is fine; we don't define one - - // Happy dereferencer - reference operator*() const { return *pos; } - pointer operator->() const { return &(operator*()); } - - // Arithmetic. 
The only hard part is making sure that - // we're not on an empty or marked-deleted array element - void advance_past_empty_and_deleted() { - while ( pos != end && (ht->test_empty(*this) || ht->test_deleted(*this)) ) - ++pos; - } - const_iterator& operator++() { - assert(pos != end); ++pos; advance_past_empty_and_deleted(); return *this; - } - const_iterator operator++(int) { const_iterator tmp(*this); ++*this; return tmp; } - - // Comparison. - bool operator==(const const_iterator& it) const { return pos == it.pos; } - bool operator!=(const const_iterator& it) const { return pos != it.pos; } - - - // The actual data - const dense_hashtable *ht; - pointer pos, end; -}; - -template -class dense_hashtable { - private: - typedef typename Alloc::template rebind::other value_alloc_type; - - public: - typedef Key key_type; - typedef Value value_type; - typedef HashFcn hasher; - typedef EqualKey key_equal; - typedef Alloc allocator_type; - - typedef typename value_alloc_type::size_type size_type; - typedef typename value_alloc_type::difference_type difference_type; - typedef typename value_alloc_type::reference reference; - typedef typename value_alloc_type::const_reference const_reference; - typedef typename value_alloc_type::pointer pointer; - typedef typename value_alloc_type::const_pointer const_pointer; - typedef dense_hashtable_iterator - iterator; - - typedef dense_hashtable_const_iterator - const_iterator; - - // These come from tr1. For us they're the same as regular iterators. - typedef iterator local_iterator; - typedef const_iterator const_local_iterator; - - // How full we let the table get before we resize, by default. - // Knuth says .8 is good -- higher causes us to probe too much, - // though it saves memory. - static const int HT_OCCUPANCY_PCT; // defined at the bottom of this file - - // How empty we let the table get before we resize lower, by default. - // (0.0 means never resize lower.) - // It should be less than OCCUPANCY_PCT / 2 or we thrash resizing - static const int HT_EMPTY_PCT; // defined at the bottom of this file - - // Minimum size we're willing to let hashtables be. - // Must be a power of two, and at least 4. - // Note, however, that for a given hashtable, the initial size is a - // function of the first constructor arg, and may be >HT_MIN_BUCKETS. - static const size_type HT_MIN_BUCKETS = 4; - - // By default, if you don't specify a hashtable size at - // construction-time, we use this size. Must be a power of two, and - // at least HT_MIN_BUCKETS. - static const size_type HT_DEFAULT_STARTING_BUCKETS = 32; - - // ITERATOR FUNCTIONS - iterator begin() { return iterator(this, table, - table + num_buckets, true); } - iterator end() { return iterator(this, table + num_buckets, - table + num_buckets, true); } - const_iterator begin() const { return const_iterator(this, table, - table+num_buckets,true);} - const_iterator end() const { return const_iterator(this, table + num_buckets, - table+num_buckets,true);} - - // These come from tr1 unordered_map. They iterate over 'bucket' n. - // We'll just consider bucket n to be the n-th element of the table. 
- local_iterator begin(size_type i) { - return local_iterator(this, table + i, table + i+1, false); - } - local_iterator end(size_type i) { - local_iterator it = begin(i); - if (!test_empty(i) && !test_deleted(i)) - ++it; - return it; - } - const_local_iterator begin(size_type i) const { - return const_local_iterator(this, table + i, table + i+1, false); - } - const_local_iterator end(size_type i) const { - const_local_iterator it = begin(i); - if (!test_empty(i) && !test_deleted(i)) - ++it; - return it; - } - - // ACCESSOR FUNCTIONS for the things we templatize on, basically - hasher hash_funct() const { return settings; } - key_equal key_eq() const { return key_info; } - allocator_type get_allocator() const { - return allocator_type(val_info); - } - - // Accessor function for statistics gathering. - int num_table_copies() const { return settings.num_ht_copies(); } - - private: - // Annoyingly, we can't copy values around, because they might have - // const components (they're probably pair). We use - // explicit destructor invocation and placement new to get around - // this. Arg. - void set_value(pointer dst, const_reference src) { - dst->~value_type(); // delete the old value, if any - new(dst) value_type(src); - } - - void destroy_buckets(size_type first, size_type last) { - for ( ; first != last; ++first) - table[first].~value_type(); - } - - // DELETE HELPER FUNCTIONS - // This lets the user describe a key that will indicate deleted - // table entries. This key should be an "impossible" entry -- - // if you try to insert it for real, you won't be able to retrieve it! - // (NB: while you pass in an entire value, only the key part is looked - // at. This is just because I don't know how to assign just a key.) - private: - void squash_deleted() { // gets rid of any deleted entries we have - if ( num_deleted ) { // get rid of deleted before writing - dense_hashtable tmp(*this); // copying will get rid of deleted - swap(tmp); // now we are tmp - } - assert(num_deleted == 0); - } - - // Test if the given key is the deleted indicator. Requires - // num_deleted > 0, for correctness of read(), and because that - // guarantees that key_info.delkey is valid. - bool test_deleted_key(const key_type& key) const { - assert(num_deleted > 0); - return equals(key_info.delkey, key); - } - - public: - void set_deleted_key(const key_type &key) { - // the empty indicator (if specified) and the deleted indicator - // must be different - assert((!settings.use_empty() || !equals(key, get_key(val_info.emptyval))) - && "Passed the empty-key to set_deleted_key"); - // It's only safe to change what "deleted" means if we purge deleted guys - squash_deleted(); - settings.set_use_deleted(true); - key_info.delkey = key; - } - void clear_deleted_key() { - squash_deleted(); - settings.set_use_deleted(false); - } - key_type deleted_key() const { - assert(settings.use_deleted() - && "Must set deleted key before calling deleted_key"); - return key_info.delkey; - } - - // These are public so the iterators can use them - // True if the item at position bucknum is "deleted" marker - bool test_deleted(size_type bucknum) const { - // Invariant: !use_deleted() implies num_deleted is 0. - assert(settings.use_deleted() || num_deleted == 0); - return num_deleted > 0 && test_deleted_key(get_key(table[bucknum])); - } - bool test_deleted(const iterator &it) const { - // Invariant: !use_deleted() implies num_deleted is 0. 
- assert(settings.use_deleted() || num_deleted == 0); - return num_deleted > 0 && test_deleted_key(get_key(*it)); - } - bool test_deleted(const const_iterator &it) const { - // Invariant: !use_deleted() implies num_deleted is 0. - assert(settings.use_deleted() || num_deleted == 0); - return num_deleted > 0 && test_deleted_key(get_key(*it)); - } - - private: - void check_use_deleted(const char* caller) { - (void)caller; // could log it if the assert failed - assert(settings.use_deleted()); - } - - // Set it so test_deleted is true. true if object didn't used to be deleted. - bool set_deleted(iterator &it) { - check_use_deleted("set_deleted()"); - bool retval = !test_deleted(it); - // &* converts from iterator to value-type. - set_key(&(*it), key_info.delkey); - return retval; - } - // Set it so test_deleted is false. true if object used to be deleted. - bool clear_deleted(iterator &it) { - check_use_deleted("clear_deleted()"); - // Happens automatically when we assign something else in its place. - return test_deleted(it); - } - - // We also allow to set/clear the deleted bit on a const iterator. - // We allow a const_iterator for the same reason you can delete a - // const pointer: it's convenient, and semantically you can't use - // 'it' after it's been deleted anyway, so its const-ness doesn't - // really matter. - bool set_deleted(const_iterator &it) { - check_use_deleted("set_deleted()"); - bool retval = !test_deleted(it); - set_key(const_cast(&(*it)), key_info.delkey); - return retval; - } - // Set it so test_deleted is false. true if object used to be deleted. - bool clear_deleted(const_iterator &it) { - check_use_deleted("clear_deleted()"); - return test_deleted(it); - } - - // EMPTY HELPER FUNCTIONS - // This lets the user describe a key that will indicate empty (unused) - // table entries. This key should be an "impossible" entry -- - // if you try to insert it for real, you won't be able to retrieve it! - // (NB: while you pass in an entire value, only the key part is looked - // at. This is just because I don't know how to assign just a key.) - public: - // These are public so the iterators can use them - // True if the item at position bucknum is "empty" marker - bool test_empty(size_type bucknum) const { - assert(settings.use_empty()); // we always need to know what's empty! - return equals(get_key(val_info.emptyval), get_key(table[bucknum])); - } - bool test_empty(const iterator &it) const { - assert(settings.use_empty()); // we always need to know what's empty! - return equals(get_key(val_info.emptyval), get_key(*it)); - } - bool test_empty(const const_iterator &it) const { - assert(settings.use_empty()); // we always need to know what's empty! - return equals(get_key(val_info.emptyval), get_key(*it)); - } - - private: - void fill_range_with_empty(pointer table_start, pointer table_end) { - std::uninitialized_fill(table_start, table_end, val_info.emptyval); - } - - public: - // TODO(csilvers): change all callers of this to pass in a key instead, - // and take a const key_type instead of const value_type. - void set_empty_key(const_reference val) { - // Once you set the empty key, you can't change it - assert(!settings.use_empty() && "Calling set_empty_key multiple times"); - // The deleted indicator (if specified) and the empty indicator - // must be different. 
- assert((!settings.use_deleted() || !equals(get_key(val), key_info.delkey)) - && "Setting the empty key the same as the deleted key"); - settings.set_use_empty(true); - set_value(&val_info.emptyval, val); - - assert(!table); // must set before first use - // num_buckets was set in constructor even though table was NULL - table = val_info.allocate(num_buckets); - assert(table); - fill_range_with_empty(table, table + num_buckets); - } - // TODO(user): return a key_type rather than a value_type - value_type empty_key() const { - assert(settings.use_empty()); - return val_info.emptyval; - } - - // FUNCTIONS CONCERNING SIZE - public: - size_type size() const { return num_elements - num_deleted; } - size_type max_size() const { return val_info.max_size(); } - bool empty() const { return size() == 0; } - size_type bucket_count() const { return num_buckets; } - size_type max_bucket_count() const { return max_size(); } - size_type nonempty_bucket_count() const { return num_elements; } - // These are tr1 methods. Their idea of 'bucket' doesn't map well to - // what we do. We just say every bucket has 0 or 1 items in it. - size_type bucket_size(size_type i) const { - return begin(i) == end(i) ? 0 : 1; - } - - private: - // Because of the above, size_type(-1) is never legal; use it for errors - static const size_type ILLEGAL_BUCKET = size_type(-1); - - // Used after a string of deletes. Returns true if we actually shrunk. - // TODO(csilvers): take a delta so we can take into account inserts - // done after shrinking. Maybe make part of the Settings class? - bool maybe_shrink() { - assert(num_elements >= num_deleted); - assert((bucket_count() & (bucket_count()-1)) == 0); // is a power of two - assert(bucket_count() >= HT_MIN_BUCKETS); - bool retval = false; - - // If you construct a hashtable with < HT_DEFAULT_STARTING_BUCKETS, - // we'll never shrink until you get relatively big, and we'll never - // shrink below HT_DEFAULT_STARTING_BUCKETS. Otherwise, something - // like "dense_hash_set x; x.insert(4); x.erase(4);" will - // shrink us down to HT_MIN_BUCKETS buckets, which is too small. - const size_type num_remain = num_elements - num_deleted; - const size_type shrink_threshold = settings.shrink_threshold(); - if (shrink_threshold > 0 && num_remain < shrink_threshold && - bucket_count() > HT_DEFAULT_STARTING_BUCKETS) { - const float shrink_factor = settings.shrink_factor(); - size_type sz = bucket_count() / 2; // find how much we should shrink - while (sz > HT_DEFAULT_STARTING_BUCKETS && - num_remain < sz * shrink_factor) { - sz /= 2; // stay a power of 2 - } - dense_hashtable tmp(*this, sz); // Do the actual resizing - swap(tmp); // now we are tmp - retval = true; - } - settings.set_consider_shrink(false); // because we just considered it - return retval; - } - - // We'll let you resize a hashtable -- though this makes us copy all! - // When you resize, you say, "make it big enough for this many more elements" - // Returns true if we actually resized, false if size was already ok. 
- bool resize_delta(size_type delta) { - bool did_resize = false; - if ( settings.consider_shrink() ) { // see if lots of deletes happened - if ( maybe_shrink() ) - did_resize = true; - } - if (num_elements >= - (std::numeric_limits::max)() - delta) { - throw std::length_error("resize overflow"); - } - if ( bucket_count() >= HT_MIN_BUCKETS && - (num_elements + delta) <= settings.enlarge_threshold() ) - return did_resize; // we're ok as we are - - // Sometimes, we need to resize just to get rid of all the - // "deleted" buckets that are clogging up the hashtable. So when - // deciding whether to resize, count the deleted buckets (which - // are currently taking up room). But later, when we decide what - // size to resize to, *don't* count deleted buckets, since they - // get discarded during the resize. - const size_type needed_size = settings.min_buckets(num_elements + delta, 0); - if ( needed_size <= bucket_count() ) // we have enough buckets - return did_resize; - - size_type resize_to = - settings.min_buckets(num_elements - num_deleted + delta, bucket_count()); - - if (resize_to < needed_size && // may double resize_to - resize_to < (std::numeric_limits::max)() / 2) { - // This situation means that we have enough deleted elements, - // that once we purge them, we won't actually have needed to - // grow. But we may want to grow anyway: if we just purge one - // element, say, we'll have to grow anyway next time we - // insert. Might as well grow now, since we're already going - // through the trouble of copying (in order to purge the - // deleted elements). - const size_type target = - static_cast(settings.shrink_size(resize_to*2)); - if (num_elements - num_deleted + delta >= target) { - // Good, we won't be below the shrink threshhold even if we double. - resize_to *= 2; - } - } - dense_hashtable tmp(*this, resize_to); - swap(tmp); // now we are tmp - return true; - } - - // We require table be not-NULL and empty before calling this. 
- void resize_table(size_type /*old_size*/, size_type new_size, - base::true_type) { - table = val_info.realloc_or_die(table, new_size); - } - - void resize_table(size_type old_size, size_type new_size, base::false_type) { - val_info.deallocate(table, old_size); - table = val_info.allocate(new_size); - } - - // Used to actually do the rehashing when we grow/shrink a hashtable - void copy_from(const dense_hashtable &ht, size_type min_buckets_wanted) { - clear_to_size(settings.min_buckets(ht.size(), min_buckets_wanted)); - - // We use a normal iterator to get non-deleted bcks from ht - // We could use insert() here, but since we know there are - // no duplicates and no deleted items, we can be more efficient - assert((bucket_count() & (bucket_count()-1)) == 0); // a power of two - for ( const_iterator it = ht.begin(); it != ht.end(); ++it ) { - size_type num_probes = 0; // how many times we've probed - size_type bucknum; - const size_type bucket_count_minus_one = bucket_count() - 1; - for (bucknum = hash(get_key(*it)) & bucket_count_minus_one; - !test_empty(bucknum); // not empty - bucknum = (bucknum + JUMP_(key, num_probes)) & bucket_count_minus_one) { - ++num_probes; - assert(num_probes < bucket_count() - && "Hashtable is full: an error in key_equal<> or hash<>"); - } - set_value(&table[bucknum], *it); // copies the value to here - num_elements++; - } - settings.inc_num_ht_copies(); - } - - // Required by the spec for hashed associative container - public: - // Though the docs say this should be num_buckets, I think it's much - // more useful as num_elements. As a special feature, calling with - // req_elements==0 will cause us to shrink if we can, saving space. - void resize(size_type req_elements) { // resize to this or larger - if ( settings.consider_shrink() || req_elements == 0 ) - maybe_shrink(); - if ( req_elements > num_elements ) - resize_delta(req_elements - num_elements); - } - - // Get and change the value of shrink_factor and enlarge_factor. The - // description at the beginning of this file explains how to choose - // the values. Setting the shrink parameter to 0.0 ensures that the - // table never shrinks. - void get_resizing_parameters(float* shrink, float* grow) const { - *shrink = settings.shrink_factor(); - *grow = settings.enlarge_factor(); - } - void set_resizing_parameters(float shrink, float grow) { - settings.set_resizing_parameters(shrink, grow); - settings.reset_thresholds(bucket_count()); - } - - // CONSTRUCTORS -- as required by the specs, we take a size, - // but also let you specify a hashfunction, key comparator, - // and key extractor. We also define a copy constructor and =. - // DESTRUCTOR -- needs to free the table - explicit dense_hashtable(size_type expected_max_items_in_table = 0, - const HashFcn& hf = HashFcn(), - const EqualKey& eql = EqualKey(), - const ExtractKey& ext = ExtractKey(), - const SetKey& set = SetKey(), - const Alloc& alloc = Alloc()) - : settings(hf), - key_info(ext, set, eql), - num_deleted(0), - num_elements(0), - num_buckets(expected_max_items_in_table == 0 - ? HT_DEFAULT_STARTING_BUCKETS - : settings.min_buckets(expected_max_items_in_table, 0)), - val_info(alloc_impl(alloc)), - table(NULL) { - // table is NULL until emptyval is set. 
However, we set num_buckets - // here so we know how much space to allocate once emptyval is set - settings.reset_thresholds(bucket_count()); - } - - // As a convenience for resize(), we allow an optional second argument - // which lets you make this new hashtable a different size than ht - dense_hashtable(const dense_hashtable& ht, - size_type min_buckets_wanted = HT_DEFAULT_STARTING_BUCKETS) - : settings(ht.settings), - key_info(ht.key_info), - num_deleted(0), - num_elements(0), - num_buckets(0), - val_info(ht.val_info), - table(NULL) { - if (!ht.settings.use_empty()) { - // If use_empty isn't set, copy_from will crash, so we do our own copying. - assert(ht.empty()); - num_buckets = settings.min_buckets(ht.size(), min_buckets_wanted); - settings.reset_thresholds(bucket_count()); - return; - } - settings.reset_thresholds(bucket_count()); - copy_from(ht, min_buckets_wanted); // copy_from() ignores deleted entries - } - - dense_hashtable& operator= (const dense_hashtable& ht) { - if (&ht == this) return *this; // don't copy onto ourselves - if (!ht.settings.use_empty()) { - assert(ht.empty()); - dense_hashtable empty_table(ht); // empty table with ht's thresholds - this->swap(empty_table); - return *this; - } - settings = ht.settings; - key_info = ht.key_info; - set_value(&val_info.emptyval, ht.val_info.emptyval); - // copy_from() calls clear and sets num_deleted to 0 too - copy_from(ht, HT_MIN_BUCKETS); - // we purposefully don't copy the allocator, which may not be copyable - return *this; - } - - ~dense_hashtable() { - if (table) { - destroy_buckets(0, num_buckets); - val_info.deallocate(table, num_buckets); - } - } - - // Many STL algorithms use swap instead of copy constructors - void swap(dense_hashtable& ht) { - std::swap(settings, ht.settings); - std::swap(key_info, ht.key_info); - std::swap(num_deleted, ht.num_deleted); - std::swap(num_elements, ht.num_elements); - std::swap(num_buckets, ht.num_buckets); - { value_type tmp; // for annoying reasons, swap() doesn't work - set_value(&tmp, val_info.emptyval); - set_value(&val_info.emptyval, ht.val_info.emptyval); - set_value(&ht.val_info.emptyval, tmp); - } - std::swap(table, ht.table); - settings.reset_thresholds(bucket_count()); // also resets consider_shrink - ht.settings.reset_thresholds(ht.bucket_count()); - // we purposefully don't swap the allocator, which may not be swap-able - } - - private: - void clear_to_size(size_type new_num_buckets) { - if (!table) { - table = val_info.allocate(new_num_buckets); - } else { - destroy_buckets(0, num_buckets); - if (new_num_buckets != num_buckets) { // resize, if necessary - typedef base::integral_constant >::value> - realloc_ok; - resize_table(num_buckets, new_num_buckets, realloc_ok()); - } - } - assert(table); - fill_range_with_empty(table, table + new_num_buckets); - num_elements = 0; - num_deleted = 0; - num_buckets = new_num_buckets; // our new size - settings.reset_thresholds(bucket_count()); - } - - public: - // It's always nice to be able to clear a table without deallocating it - void clear() { - // If the table is already empty, and the number of buckets is - // already as we desire, there's nothing to do. - const size_type new_num_buckets = settings.min_buckets(0, 0); - if (num_elements == 0 && new_num_buckets == num_buckets) { - return; - } - clear_to_size(new_num_buckets); - } - - // Clear the table without resizing it. 
- // Mimicks the stl_hashtable's behaviour when clear()-ing in that it - // does not modify the bucket count - void clear_no_resize() { - if (num_elements > 0) { - assert(table); - destroy_buckets(0, num_buckets); - fill_range_with_empty(table, table + num_buckets); - } - // don't consider to shrink before another erase() - settings.reset_thresholds(bucket_count()); - num_elements = 0; - num_deleted = 0; - } - - // LOOKUP ROUTINES - private: - // Returns a pair of positions: 1st where the object is, 2nd where - // it would go if you wanted to insert it. 1st is ILLEGAL_BUCKET - // if object is not found; 2nd is ILLEGAL_BUCKET if it is. - // Note: because of deletions where-to-insert is not trivial: it's the - // first deleted bucket we see, as long as we don't find the key later - std::pair find_position(const key_type &key) const { - size_type num_probes = 0; // how many times we've probed - const size_type bucket_count_minus_one = bucket_count() - 1; - size_type bucknum = hash(key) & bucket_count_minus_one; - size_type insert_pos = ILLEGAL_BUCKET; // where we would insert - while ( 1 ) { // probe until something happens - if ( test_empty(bucknum) ) { // bucket is empty - if ( insert_pos == ILLEGAL_BUCKET ) // found no prior place to insert - return std::pair(ILLEGAL_BUCKET, bucknum); - else - return std::pair(ILLEGAL_BUCKET, insert_pos); - - } else if ( test_deleted(bucknum) ) {// keep searching, but mark to insert - if ( insert_pos == ILLEGAL_BUCKET ) - insert_pos = bucknum; - - } else if ( equals(key, get_key(table[bucknum])) ) { - return std::pair(bucknum, ILLEGAL_BUCKET); - } - ++num_probes; // we're doing another probe - bucknum = (bucknum + JUMP_(key, num_probes)) & bucket_count_minus_one; - assert(num_probes < bucket_count() - && "Hashtable is full: an error in key_equal<> or hash<>"); - } - } - - public: - - iterator find(const key_type& key) { - if ( size() == 0 ) return end(); - std::pair pos = find_position(key); - if ( pos.first == ILLEGAL_BUCKET ) // alas, not there - return end(); - else - return iterator(this, table + pos.first, table + num_buckets, false); - } - - const_iterator find(const key_type& key) const { - if ( size() == 0 ) return end(); - std::pair pos = find_position(key); - if ( pos.first == ILLEGAL_BUCKET ) // alas, not there - return end(); - else - return const_iterator(this, table + pos.first, table+num_buckets, false); - } - - // This is a tr1 method: the bucket a given key is in, or what bucket - // it would be put in, if it were to be inserted. Shrug. - size_type bucket(const key_type& key) const { - std::pair pos = find_position(key); - return pos.first == ILLEGAL_BUCKET ? pos.second : pos.first; - } - - // Counts how many elements have key key. For maps, it's either 0 or 1. - size_type count(const key_type &key) const { - std::pair pos = find_position(key); - return pos.first == ILLEGAL_BUCKET ? 0 : 1; - } - - // Likewise, equal_range doesn't really make sense for us. Oh well. 
- std::pair equal_range(const key_type& key) { - iterator pos = find(key); // either an iterator or end - if (pos == end()) { - return std::pair(pos, pos); - } else { - const iterator startpos = pos++; - return std::pair(startpos, pos); - } - } - std::pair equal_range(const key_type& key) - const { - const_iterator pos = find(key); // either an iterator or end - if (pos == end()) { - return std::pair(pos, pos); - } else { - const const_iterator startpos = pos++; - return std::pair(startpos, pos); - } - } - - - // INSERTION ROUTINES - private: - // Private method used by insert_noresize and find_or_insert. - iterator insert_at(const_reference obj, size_type pos) { - if (size() >= max_size()) { - throw std::length_error("insert overflow"); - } - if ( test_deleted(pos) ) { // just replace if it's been del. - // shrug: shouldn't need to be const. - const_iterator delpos(this, table + pos, table + num_buckets, false); - clear_deleted(delpos); - assert( num_deleted > 0); - --num_deleted; // used to be, now it isn't - } else { - ++num_elements; // replacing an empty bucket - } - set_value(&table[pos], obj); - return iterator(this, table + pos, table + num_buckets, false); - } - - // If you know *this is big enough to hold obj, use this routine - std::pair insert_noresize(const_reference obj) { - // First, double-check we're not inserting delkey or emptyval - assert((!settings.use_empty() || !equals(get_key(obj), - get_key(val_info.emptyval))) - && "Inserting the empty key"); - assert((!settings.use_deleted() || !equals(get_key(obj), key_info.delkey)) - && "Inserting the deleted key"); - const std::pair pos = find_position(get_key(obj)); - if ( pos.first != ILLEGAL_BUCKET) { // object was already there - return std::pair(iterator(this, table + pos.first, - table + num_buckets, false), - false); // false: we didn't insert - } else { // pos.second says where to put it - return std::pair(insert_at(obj, pos.second), true); - } - } - - // Specializations of insert(it, it) depending on the power of the iterator: - // (1) Iterator supports operator-, resize before inserting - template - void insert(ForwardIterator f, ForwardIterator l, std::forward_iterator_tag) { - size_t dist = std::distance(f, l); - if (dist >= (std::numeric_limits::max)()) { - throw std::length_error("insert-range overflow"); - } - resize_delta(static_cast(dist)); - for ( ; dist > 0; --dist, ++f) { - insert_noresize(*f); - } - } - - // (2) Arbitrary iterator, can't tell how much to resize - template - void insert(InputIterator f, InputIterator l, std::input_iterator_tag) { - for ( ; f != l; ++f) - insert(*f); - } - - public: - // This is the normal insert routine, used by the outside world - std::pair insert(const_reference obj) { - resize_delta(1); // adding an object, grow if need be - return insert_noresize(obj); - } - - // When inserting a lot at a time, we specialize on the type of iterator - template - void insert(InputIterator f, InputIterator l) { - // specializes on iterator type - insert(f, l, - typename std::iterator_traits::iterator_category()); - } - - // DefaultValue is a functor that takes a key and returns a value_type - // representing the default value to be inserted if none is found. 
- template - value_type& find_or_insert(const key_type& key) { - // First, double-check we're not inserting emptykey or delkey - assert((!settings.use_empty() || !equals(key, get_key(val_info.emptyval))) - && "Inserting the empty key"); - assert((!settings.use_deleted() || !equals(key, key_info.delkey)) - && "Inserting the deleted key"); - const std::pair pos = find_position(key); - DefaultValue default_value; - if ( pos.first != ILLEGAL_BUCKET) { // object was already there - return table[pos.first]; - } else if (resize_delta(1)) { // needed to rehash to make room - // Since we resized, we can't use pos, so recalculate where to insert. - return *insert_noresize(default_value(key)).first; - } else { // no need to rehash, insert right here - return *insert_at(default_value(key), pos.second); - } - } - - - // DELETION ROUTINES - size_type erase(const key_type& key) { - // First, double-check we're not trying to erase delkey or emptyval. - assert((!settings.use_empty() || !equals(key, get_key(val_info.emptyval))) - && "Erasing the empty key"); - assert((!settings.use_deleted() || !equals(key, key_info.delkey)) - && "Erasing the deleted key"); - const_iterator pos = find(key); // shrug: shouldn't need to be const - if ( pos != end() ) { - assert(!test_deleted(pos)); // or find() shouldn't have returned it - set_deleted(pos); - ++num_deleted; - settings.set_consider_shrink(true); // will think about shrink after next insert - return 1; // because we deleted one thing - } else { - return 0; // because we deleted nothing - } - } - - // We return the iterator past the deleted item. - void erase(iterator pos) { - if ( pos == end() ) return; // sanity check - if ( set_deleted(pos) ) { // true if object has been newly deleted - ++num_deleted; - settings.set_consider_shrink(true); // will think about shrink after next insert - } - } - - void erase(iterator f, iterator l) { - for ( ; f != l; ++f) { - if ( set_deleted(f) ) // should always be true - ++num_deleted; - } - settings.set_consider_shrink(true); // will think about shrink after next insert - } - - // We allow you to erase a const_iterator just like we allow you to - // erase an iterator. This is in parallel to 'delete': you can delete - // a const pointer just like a non-const pointer. The logic is that - // you can't use the object after it's erased anyway, so it doesn't matter - // if it's const or not. - void erase(const_iterator pos) { - if ( pos == end() ) return; // sanity check - if ( set_deleted(pos) ) { // true if object has been newly deleted - ++num_deleted; - settings.set_consider_shrink(true); // will think about shrink after next insert - } - } - void erase(const_iterator f, const_iterator l) { - for ( ; f != l; ++f) { - if ( set_deleted(f) ) // should always be true - ++num_deleted; - } - settings.set_consider_shrink(true); // will think about shrink after next insert - } - - - // COMPARISON - bool operator==(const dense_hashtable& ht) const { - if (size() != ht.size()) { - return false; - } else if (this == &ht) { - return true; - } else { - // Iterate through the elements in "this" and see if the - // corresponding element is in ht - for ( const_iterator it = begin(); it != end(); ++it ) { - const_iterator it2 = ht.find(get_key(*it)); - if ((it2 == ht.end()) || (*it != *it2)) { - return false; - } - } - return true; - } - } - bool operator!=(const dense_hashtable& ht) const { - return !(*this == ht); - } - - - // I/O - // We support reading and writing hashtables to disk. 
Alas, since - // I don't know how to write a hasher or key_equal, you have to make - // sure everything but the table is the same. We compact before writing. - private: - // Every time the disk format changes, this should probably change too - typedef unsigned long MagicNumberType; - static const MagicNumberType MAGIC_NUMBER = 0x13578642; - - public: - // I/O -- this is an add-on for writing hash table to disk - // - // INPUT and OUTPUT must be either a FILE, *or* a C++ stream - // (istream, ostream, etc) *or* a class providing - // Read(void*, size_t) and Write(const void*, size_t) - // (respectively), which writes a buffer into a stream - // (which the INPUT/OUTPUT instance presumably owns). - - typedef sparsehash_internal::pod_serializer NopointerSerializer; - - // ValueSerializer: a functor. operator()(OUTPUT*, const value_type&) - template - bool serialize(ValueSerializer serializer, OUTPUT *fp) { - squash_deleted(); // so we don't have to worry about delkey - if ( !sparsehash_internal::write_bigendian_number(fp, MAGIC_NUMBER, 4) ) - return false; - if ( !sparsehash_internal::write_bigendian_number(fp, num_buckets, 8) ) - return false; - if ( !sparsehash_internal::write_bigendian_number(fp, num_elements, 8) ) - return false; - // Now write a bitmap of non-empty buckets. - for ( size_type i = 0; i < num_buckets; i += 8 ) { - unsigned char bits = 0; - for ( int bit = 0; bit < 8; ++bit ) { - if ( i + bit < num_buckets && !test_empty(i + bit) ) - bits |= (1 << bit); - } - if ( !sparsehash_internal::write_data(fp, &bits, sizeof(bits)) ) - return false; - for ( int bit = 0; bit < 8; ++bit ) { - if ( bits & (1 << bit) ) { - if ( !serializer(fp, table[i + bit]) ) return false; - } - } - } - return true; - } - - // INPUT: anything we've written an overload of read_data() for. - // ValueSerializer: a functor. operator()(INPUT*, value_type*) - template - bool unserialize(ValueSerializer serializer, INPUT *fp) { - assert(settings.use_empty() && "empty_key not set for read"); - - clear(); // just to be consistent - MagicNumberType magic_read; - if ( !sparsehash_internal::read_bigendian_number(fp, &magic_read, 4) ) - return false; - if ( magic_read != MAGIC_NUMBER ) { - return false; - } - size_type new_num_buckets; - if ( !sparsehash_internal::read_bigendian_number(fp, &new_num_buckets, 8) ) - return false; - clear_to_size(new_num_buckets); - if ( !sparsehash_internal::read_bigendian_number(fp, &num_elements, 8) ) - return false; - - // Read the bitmap of non-empty buckets. - for (size_type i = 0; i < num_buckets; i += 8) { - unsigned char bits; - if ( !sparsehash_internal::read_data(fp, &bits, sizeof(bits)) ) - return false; - for ( int bit = 0; bit < 8; ++bit ) { - if ( i + bit < num_buckets && (bits & (1 << bit)) ) { // not empty - if ( !serializer(fp, &table[i + bit]) ) return false; - } - } - } - return true; - } - - private: - template - class alloc_impl : public A { - public: - typedef typename A::pointer pointer; - typedef typename A::size_type size_type; - - // Convert a normal allocator to one that has realloc_or_die() - alloc_impl(const A& a) : A(a) { } - - // realloc_or_die should only be used when using the default - // allocator (libc_allocator_with_realloc). - pointer realloc_or_die(pointer /*ptr*/, size_type /*n*/) { - fprintf(stderr, "realloc_or_die is only supported for " - "libc_allocator_with_realloc\n"); - exit(1); - return NULL; - } - }; - - // A template specialization of alloc_impl for - // libc_allocator_with_realloc that can handle realloc_or_die. 
- template - class alloc_impl > - : public libc_allocator_with_realloc { - public: - typedef typename libc_allocator_with_realloc::pointer pointer; - typedef typename libc_allocator_with_realloc::size_type size_type; - - alloc_impl(const libc_allocator_with_realloc& a) - : libc_allocator_with_realloc(a) { } - - pointer realloc_or_die(pointer ptr, size_type n) { - pointer retval = this->reallocate(ptr, n); - if (retval == NULL) { - fprintf(stderr, "sparsehash: FATAL ERROR: failed to reallocate " - "%lu elements for ptr %p", static_cast(n), ptr); - exit(1); - } - return retval; - } - }; - - // Package allocator with emptyval to eliminate memory needed for - // the zero-size allocator. - // If new fields are added to this class, we should add them to - // operator= and swap. - class ValInfo : public alloc_impl { - public: - typedef typename alloc_impl::value_type value_type; - - ValInfo(const alloc_impl& a) - : alloc_impl(a), emptyval() { } - ValInfo(const ValInfo& v) - : alloc_impl(v), emptyval(v.emptyval) { } - - value_type emptyval; // which key marks unused entries - }; - - - // Package functors with another class to eliminate memory needed for - // zero-size functors. Since ExtractKey and hasher's operator() might - // have the same function signature, they must be packaged in - // different classes. - struct Settings : - sparsehash_internal::sh_hashtable_settings { - explicit Settings(const hasher& hf) - : sparsehash_internal::sh_hashtable_settings( - hf, HT_OCCUPANCY_PCT / 100.0f, HT_EMPTY_PCT / 100.0f) {} - }; - - // Packages ExtractKey and SetKey functors. - class KeyInfo : public ExtractKey, public SetKey, public EqualKey { - public: - KeyInfo(const ExtractKey& ek, const SetKey& sk, const EqualKey& eq) - : ExtractKey(ek), - SetKey(sk), - EqualKey(eq) { - } - - // We want to return the exact same type as ExtractKey: Key or const Key& - typename ExtractKey::result_type get_key(const_reference v) const { - return ExtractKey::operator()(v); - } - void set_key(pointer v, const key_type& k) const { - SetKey::operator()(v, k); - } - bool equals(const key_type& a, const key_type& b) const { - return EqualKey::operator()(a, b); - } - - // Which key marks deleted entries. - // TODO(csilvers): make a pointer, and get rid of use_deleted (benchmark!) - typename base::remove_const::type delkey; - }; - - // Utility functions to access the templated operators - size_type hash(const key_type& v) const { - return settings.hash(v); - } - bool equals(const key_type& a, const key_type& b) const { - return key_info.equals(a, b); - } - typename ExtractKey::result_type get_key(const_reference v) const { - return key_info.get_key(v); - } - void set_key(pointer v, const key_type& k) const { - key_info.set_key(v, k); - } - - private: - // Actual data - Settings settings; - KeyInfo key_info; - - size_type num_deleted; // how many occupied buckets are marked deleted - size_type num_elements; - size_type num_buckets; - ValInfo val_info; // holds emptyval, and also the allocator - pointer table; -}; - - -// We need a global swap as well -template -inline void swap(dense_hashtable &x, - dense_hashtable &y) { - x.swap(y); -} - -#undef JUMP_ - -template -const typename dense_hashtable::size_type - dense_hashtable::ILLEGAL_BUCKET; - -// How full we let the table get before we resize. Knuth says .8 is -// good -- higher causes us to probe too much, though saves memory. -// However, we go with .5, getting better performance at the cost of -// more space (a trade-off densehashtable explicitly chooses to make). 
-// Feel free to play around with different values, though, via -// max_load_factor() and/or set_resizing_parameters(). -template -const int dense_hashtable::HT_OCCUPANCY_PCT = 50; - -// How empty we let the table get before we resize lower. -// It should be less than OCCUPANCY_PCT / 2 or we thrash resizing. -template -const int dense_hashtable::HT_EMPTY_PCT - = static_cast(0.4 * - dense_hashtable::HT_OCCUPANCY_PCT); - -_END_GOOGLE_NAMESPACE_ - -#endif /* _DENSEHASHTABLE_H_ */ diff --git a/contrib/libsparsehash/sparsehash/internal/hashtable-common.h b/contrib/libsparsehash/sparsehash/internal/hashtable-common.h deleted file mode 100644 index 4049925789c..00000000000 --- a/contrib/libsparsehash/sparsehash/internal/hashtable-common.h +++ /dev/null @@ -1,381 +0,0 @@ -// Copyright (c) 2010, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// --- -// -// Provides classes shared by both sparse and dense hashtable. -// -// sh_hashtable_settings has parameters for growing and shrinking -// a hashtable. It also packages zero-size functor (ie. hasher). -// -// Other functions and classes provide common code for serializing -// and deserializing hashtables to a stream (such as a FILE*). - -#ifndef UTIL_GTL_HASHTABLE_COMMON_H_ -#define UTIL_GTL_HASHTABLE_COMMON_H_ - -#include -#include -#include -#include // for size_t -#include -#include // For length_error - -_START_GOOGLE_NAMESPACE_ - -template struct SparsehashCompileAssert { }; -#define SPARSEHASH_COMPILE_ASSERT(expr, msg) \ - static_assert(expr, #msg) - -namespace sparsehash_internal { - -// Adaptor methods for reading/writing data from an INPUT or OUPTUT -// variable passed to serialize() or unserialize(). For now we -// have implemented INPUT/OUTPUT for FILE*, istream*/ostream* (note -// they are pointers, unlike typical use), or else a pointer to -// something that supports a Read()/Write() method. -// -// For technical reasons, we implement read_data/write_data in two -// stages. 
The actual work is done in *_data_internal, which takes -// the stream argument twice: once as a template type, and once with -// normal type information. (We only use the second version.) We do -// this because of how C++ picks what function overload to use. If we -// implemented this the naive way: -// bool read_data(istream* is, const void* data, size_t length); -// template read_data(T* fp, const void* data, size_t length); -// C++ would prefer the second version for every stream type except -// istream. However, we want C++ to prefer the first version for -// streams that are *subclasses* of istream, such as istringstream. -// This is not possible given the way template types are resolved. So -// we split the stream argument in two, one of which is templated and -// one of which is not. The specialized functions (like the istream -// version above) ignore the template arg and use the second, 'type' -// arg, getting subclass matching as normal. The 'catch-all' -// functions (the second version above) use the template arg to deduce -// the type, and use a second, void* arg to achieve the desired -// 'catch-all' semantics. - -// ----- low-level I/O for FILE* ---- - -template -inline bool read_data_internal(Ignored*, FILE* fp, - void* data, size_t length) { - return fread(data, length, 1, fp) == 1; -} - -template -inline bool write_data_internal(Ignored*, FILE* fp, - const void* data, size_t length) { - return fwrite(data, length, 1, fp) == 1; -} - -// ----- low-level I/O for iostream ---- - -// We want the caller to be responsible for #including , not -// us, because iostream is a big header! According to the standard, -// it's only legal to delay the instantiation the way we want to if -// the istream/ostream is a template type. So we jump through hoops. -template -inline bool read_data_internal_for_istream(ISTREAM* fp, - void* data, size_t length) { - return fp->read(reinterpret_cast(data), length).good(); -} -template -inline bool read_data_internal(Ignored*, std::istream* fp, - void* data, size_t length) { - return read_data_internal_for_istream(fp, data, length); -} - -template -inline bool write_data_internal_for_ostream(OSTREAM* fp, - const void* data, size_t length) { - return fp->write(reinterpret_cast(data), length).good(); -} -template -inline bool write_data_internal(Ignored*, std::ostream* fp, - const void* data, size_t length) { - return write_data_internal_for_ostream(fp, data, length); -} - -// ----- low-level I/O for custom streams ---- - -// The INPUT type needs to support a Read() method that takes a -// buffer and a length and returns the number of bytes read. -template -inline bool read_data_internal(INPUT* fp, void*, - void* data, size_t length) { - return static_cast(fp->Read(data, length)) == length; -} - -// The OUTPUT type needs to support a Write() operation that takes -// a buffer and a length and returns the number of bytes written. -template -inline bool write_data_internal(OUTPUT* fp, void*, - const void* data, size_t length) { - return static_cast(fp->Write(data, length)) == length; -} - -// ----- low-level I/O: the public API ---- - -template -inline bool read_data(INPUT* fp, void* data, size_t length) { - return read_data_internal(fp, fp, data, length); -} - -template -inline bool write_data(OUTPUT* fp, const void* data, size_t length) { - return write_data_internal(fp, fp, data, length); -} - -// Uses read_data() and write_data() to read/write an integer. 
-// length is the number of bytes to read/write (which may differ -// from sizeof(IntType), allowing us to save on a 32-bit system -// and load on a 64-bit system). Excess bytes are taken to be 0. -// INPUT and OUTPUT must match legal inputs to read/write_data (above). -template -bool read_bigendian_number(INPUT* fp, IntType* value, size_t length) { - *value = 0; - unsigned char byte; - // We require IntType to be unsigned or else the shifting gets all screwy. - SPARSEHASH_COMPILE_ASSERT(static_cast(-1) > static_cast(0), - serializing_int_requires_an_unsigned_type); - for (size_t i = 0; i < length; ++i) { - if (!read_data(fp, &byte, sizeof(byte))) return false; - *value |= static_cast(byte) << ((length - 1 - i) * 8); - } - return true; -} - -template -bool write_bigendian_number(OUTPUT* fp, IntType value, size_t length) { - unsigned char byte; - // We require IntType to be unsigned or else the shifting gets all screwy. - SPARSEHASH_COMPILE_ASSERT(static_cast(-1) > static_cast(0), - serializing_int_requires_an_unsigned_type); - for (size_t i = 0; i < length; ++i) { - byte = (sizeof(value) <= length-1 - i) - ? 0 : static_cast((value >> ((length-1 - i) * 8)) & 255); - if (!write_data(fp, &byte, sizeof(byte))) return false; - } - return true; -} - -// If your keys and values are simple enough, you can pass this -// serializer to serialize()/unserialize(). "Simple enough" means -// value_type is a POD type that contains no pointers. Note, -// however, we don't try to normalize endianness. -// This is the type used for NopointerSerializer. -template struct pod_serializer { - template - bool operator()(INPUT* fp, value_type* value) const { - return read_data(fp, value, sizeof(*value)); - } - - template - bool operator()(OUTPUT* fp, const value_type& value) const { - return write_data(fp, &value, sizeof(value)); - } -}; - - -// Settings contains parameters for growing and shrinking the table. -// It also packages zero-size functor (ie. hasher). -// -// It does some munging of the hash value in cases where we think -// (fear) the original hash function might not be very good. In -// particular, the default hash of pointers is the identity hash, -// so probably all the low bits are 0. We identify when we think -// we're hashing a pointer, and chop off the low bits. Note this -// isn't perfect: even when the key is a pointer, we can't tell -// for sure that the hash is the identity hash. If it's not, this -// is needless work (and possibly, though not likely, harmful). - -template -class sh_hashtable_settings : public HashFunc { - public: - typedef Key key_type; - typedef HashFunc hasher; - typedef SizeType size_type; - - public: - sh_hashtable_settings(const hasher& hf, - const float ht_occupancy_flt, - const float ht_empty_flt) - : hasher(hf), - enlarge_threshold_(0), - shrink_threshold_(0), - consider_shrink_(false), - use_empty_(false), - use_deleted_(false), - num_ht_copies_(0) { - set_enlarge_factor(ht_occupancy_flt); - set_shrink_factor(ht_empty_flt); - } - - size_type hash(const key_type& v) const { - // We munge the hash value when we don't trust hasher::operator(). 
- return hash_munger::MungedHash(hasher::operator()(v)); - } - - float enlarge_factor() const { - return enlarge_factor_; - } - void set_enlarge_factor(float f) { - enlarge_factor_ = f; - } - float shrink_factor() const { - return shrink_factor_; - } - void set_shrink_factor(float f) { - shrink_factor_ = f; - } - - size_type enlarge_threshold() const { - return enlarge_threshold_; - } - void set_enlarge_threshold(size_type t) { - enlarge_threshold_ = t; - } - size_type shrink_threshold() const { - return shrink_threshold_; - } - void set_shrink_threshold(size_type t) { - shrink_threshold_ = t; - } - - size_type enlarge_size(size_type x) const { - return static_cast(x * enlarge_factor_); - } - size_type shrink_size(size_type x) const { - return static_cast(x * shrink_factor_); - } - - bool consider_shrink() const { - return consider_shrink_; - } - void set_consider_shrink(bool t) { - consider_shrink_ = t; - } - - bool use_empty() const { - return use_empty_; - } - void set_use_empty(bool t) { - use_empty_ = t; - } - - bool use_deleted() const { - return use_deleted_; - } - void set_use_deleted(bool t) { - use_deleted_ = t; - } - - size_type num_ht_copies() const { - return static_cast(num_ht_copies_); - } - void inc_num_ht_copies() { - ++num_ht_copies_; - } - - // Reset the enlarge and shrink thresholds - void reset_thresholds(size_type num_buckets) { - set_enlarge_threshold(enlarge_size(num_buckets)); - set_shrink_threshold(shrink_size(num_buckets)); - // whatever caused us to reset already considered - set_consider_shrink(false); - } - - // Caller is resposible for calling reset_threshold right after - // set_resizing_parameters. - void set_resizing_parameters(float shrink, float grow) { - assert(shrink >= 0.0); - assert(grow <= 1.0); - if (shrink > grow/2.0f) - shrink = grow / 2.0f; // otherwise we thrash hashtable size - set_shrink_factor(shrink); - set_enlarge_factor(grow); - } - - // This is the smallest size a hashtable can be without being too crowded - // If you like, you can give a min #buckets as well as a min #elts - size_type min_buckets(size_type num_elts, size_type min_buckets_wanted) { - float enlarge = enlarge_factor(); - size_type sz = HT_MIN_BUCKETS; // min buckets allowed - while ( sz < min_buckets_wanted || - num_elts >= static_cast(sz * enlarge) ) { - // This just prevents overflowing size_type, since sz can exceed - // max_size() here. - if (static_cast(sz * 2) < sz) { - throw std::length_error("resize overflow"); // protect against overflow - } - sz *= 2; - } - return sz; - } - - private: - template class hash_munger { - public: - static size_t MungedHash(size_t hash) { - return hash; - } - }; - // This matches when the hashtable key is a pointer. - template class hash_munger { - public: - static size_t MungedHash(size_t hash) { - // TODO(csilvers): consider rotating instead: - // static const int shift = (sizeof(void *) == 4) ? 2 : 3; - // return (hash << (sizeof(hash) * 8) - shift)) | (hash >> shift); - // This matters if we ever change sparse/dense_hash_* to compare - // hashes before comparing actual values. It's speedy on x86. 
- return hash / sizeof(void*); // get rid of known-0 bits - } - }; - - size_type enlarge_threshold_; // table.size() * enlarge_factor - size_type shrink_threshold_; // table.size() * shrink_factor - float enlarge_factor_; // how full before resize - float shrink_factor_; // how empty before resize - // consider_shrink=true if we should try to shrink before next insert - bool consider_shrink_; - bool use_empty_; // used only by densehashtable, not sparsehashtable - bool use_deleted_; // false until delkey has been set - // num_ht_copies is a counter incremented every Copy/Move - unsigned int num_ht_copies_; -}; - -} // namespace sparsehash_internal - -#undef SPARSEHASH_COMPILE_ASSERT -_END_GOOGLE_NAMESPACE_ - -#endif // UTIL_GTL_HASHTABLE_COMMON_H_ diff --git a/contrib/libsparsehash/sparsehash/internal/libc_allocator_with_realloc.h b/contrib/libsparsehash/sparsehash/internal/libc_allocator_with_realloc.h deleted file mode 100644 index 0c1e03d7565..00000000000 --- a/contrib/libsparsehash/sparsehash/internal/libc_allocator_with_realloc.h +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (c) 2010, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -// --- - -#ifndef UTIL_GTL_LIBC_ALLOCATOR_WITH_REALLOC_H_ -#define UTIL_GTL_LIBC_ALLOCATOR_WITH_REALLOC_H_ - -#include -#include // for malloc/realloc/free -#include // for ptrdiff_t -#include // for placement new - -_START_GOOGLE_NAMESPACE_ - -template -class libc_allocator_with_realloc { - public: - typedef T value_type; - typedef size_t size_type; - typedef ptrdiff_t difference_type; - - typedef T* pointer; - typedef const T* const_pointer; - typedef T& reference; - typedef const T& const_reference; - - libc_allocator_with_realloc() {} - libc_allocator_with_realloc(const libc_allocator_with_realloc&) {} - ~libc_allocator_with_realloc() {} - - pointer address(reference r) const { return &r; } - const_pointer address(const_reference r) const { return &r; } - - pointer allocate(size_type n, const_pointer = 0) { - return static_cast(malloc(n * sizeof(value_type))); - } - void deallocate(pointer p, size_type) { - free(p); - } - pointer reallocate(pointer p, size_type n) { - return static_cast(realloc(p, n * sizeof(value_type))); - } - - size_type max_size() const { - return static_cast(-1) / sizeof(value_type); - } - - void construct(pointer p, const value_type& val) { - new(p) value_type(val); - } - void destroy(pointer p) { p->~value_type(); } - - template - libc_allocator_with_realloc(const libc_allocator_with_realloc&) {} - - template - struct rebind { - typedef libc_allocator_with_realloc other; - }; -}; - -// libc_allocator_with_realloc specialization. -template<> -class libc_allocator_with_realloc { - public: - typedef void value_type; - typedef size_t size_type; - typedef ptrdiff_t difference_type; - typedef void* pointer; - typedef const void* const_pointer; - - template - struct rebind { - typedef libc_allocator_with_realloc other; - }; -}; - -template -inline bool operator==(const libc_allocator_with_realloc&, - const libc_allocator_with_realloc&) { - return true; -} - -template -inline bool operator!=(const libc_allocator_with_realloc&, - const libc_allocator_with_realloc&) { - return false; -} - -_END_GOOGLE_NAMESPACE_ - -#endif // UTIL_GTL_LIBC_ALLOCATOR_WITH_REALLOC_H_ diff --git a/contrib/libsparsehash/sparsehash/internal/sparseconfig.h b/contrib/libsparsehash/sparsehash/internal/sparseconfig.h deleted file mode 100644 index 18f4b2dcc5d..00000000000 --- a/contrib/libsparsehash/sparsehash/internal/sparseconfig.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * NOTE: This file is for internal use only. - * Do not use these #defines in your own program! - */ - -/* Namespace for Google classes */ -#define GOOGLE_NAMESPACE ::google - -/* the location of the header defining hash functions */ -#define HASH_FUN_H - -/* the namespace of the hash<> function */ -#define HASH_NAMESPACE std - -/* Define to 1 if you have the header file. */ -#define HAVE_INTTYPES_H 1 - -/* Define to 1 if the system has the type `long long'. */ -#define HAVE_LONG_LONG 1 - -/* Define to 1 if you have the `memcpy' function. */ -#define HAVE_MEMCPY 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STDINT_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_TYPES_H 1 - -/* Define to 1 if the system has the type `uint16_t'. */ -#define HAVE_UINT16_T 1 - -/* Define to 1 if the system has the type `u_int16_t'. */ -#define HAVE_U_INT16_T 1 - -/* Define to 1 if the system has the type `__uint16'. */ -/* #undef HAVE___UINT16 */ - -/* The system-provided hash function including the namespace. 
*/ -#define SPARSEHASH_HASH HASH_NAMESPACE::hash - -/* Stops putting the code inside the Google namespace */ -#define _END_GOOGLE_NAMESPACE_ } - -/* Puts following code inside the Google namespace */ -#define _START_GOOGLE_NAMESPACE_ namespace google { diff --git a/contrib/libsparsehash/sparsehash/internal/sparsehashtable.h b/contrib/libsparsehash/sparsehash/internal/sparsehashtable.h deleted file mode 100644 index 7ee1391da1b..00000000000 --- a/contrib/libsparsehash/sparsehash/internal/sparsehashtable.h +++ /dev/null @@ -1,1247 +0,0 @@ -// Copyright (c) 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// --- -// -// A sparse hashtable is a particular implementation of -// a hashtable: one that is meant to minimize memory use. -// It does this by using a *sparse table* (cf sparsetable.h), -// which uses between 1 and 2 bits to store empty buckets -// (we may need another bit for hashtables that support deletion). -// -// When empty buckets are so cheap, an appealing hashtable -// implementation is internal probing, in which the hashtable -// is a single table, and collisions are resolved by trying -// to insert again in another bucket. The most cache-efficient -// internal probing schemes are linear probing (which suffers, -// alas, from clumping) and quadratic probing, which is what -// we implement by default. -// -// Deleted buckets are a bit of a pain. We have to somehow mark -// deleted buckets (the probing must distinguish them from empty -// buckets). The most principled way is to have another bitmap, -// but that's annoying and takes up space. Instead we let the -// user specify an "impossible" key. We set deleted buckets -// to have the impossible key. -// -// Note it is possible to change the value of the delete key -// on the fly; you can even remove it, though after that point -// the hashtable is insert_only until you set it again. -// -// You probably shouldn't use this code directly. Use -// sparse_hash_map<> or sparse_hash_set<> instead. 
-// -// You can modify the following, below: -// HT_OCCUPANCY_PCT -- how full before we double size -// HT_EMPTY_PCT -- how empty before we halve size -// HT_MIN_BUCKETS -- smallest bucket size -// HT_DEFAULT_STARTING_BUCKETS -- default bucket size at construct-time -// -// You can also change enlarge_factor (which defaults to -// HT_OCCUPANCY_PCT), and shrink_factor (which defaults to -// HT_EMPTY_PCT) with set_resizing_parameters(). -// -// How to decide what values to use? -// shrink_factor's default of .4 * OCCUPANCY_PCT, is probably good. -// HT_MIN_BUCKETS is probably unnecessary since you can specify -// (indirectly) the starting number of buckets at construct-time. -// For enlarge_factor, you can use this chart to try to trade-off -// expected lookup time to the space taken up. By default, this -// code uses quadratic probing, though you can change it to linear -// via _JUMP below if you really want to. -// -// From http://www.augustana.ca/~mohrj/courses/1999.fall/csc210/lecture_notes/hashing.html -// NUMBER OF PROBES / LOOKUP Successful Unsuccessful -// Quadratic collision resolution 1 - ln(1-L) - L/2 1/(1-L) - L - ln(1-L) -// Linear collision resolution [1+1/(1-L)]/2 [1+1/(1-L)2]/2 -// -// -- enlarge_factor -- 0.10 0.50 0.60 0.75 0.80 0.90 0.99 -// QUADRATIC COLLISION RES. -// probes/successful lookup 1.05 1.44 1.62 2.01 2.21 2.85 5.11 -// probes/unsuccessful lookup 1.11 2.19 2.82 4.64 5.81 11.4 103.6 -// LINEAR COLLISION RES. -// probes/successful lookup 1.06 1.5 1.75 2.5 3.0 5.5 50.5 -// probes/unsuccessful lookup 1.12 2.5 3.6 8.5 13.0 50.0 5000.0 -// -// The value type is required to be copy constructible and default -// constructible, but it need not be (and commonly isn't) assignable. - -#ifndef _SPARSEHASHTABLE_H_ -#define _SPARSEHASHTABLE_H_ - -#include -#include -#include // For swap(), eg -#include // for iterator tags -#include // for numeric_limits -#include // for pair -#include // for remove_const -#include -#include // IWYU pragma: export -#include // For length_error - -_START_GOOGLE_NAMESPACE_ - -namespace base { // just to make google->opensource transition easier -using GOOGLE_NAMESPACE::remove_const; -} - -#ifndef SPARSEHASH_STAT_UPDATE -#define SPARSEHASH_STAT_UPDATE(x) ((void) 0) -#endif - -// The probing method -// Linear probing -// #define JUMP_(key, num_probes) ( 1 ) -// Quadratic probing -#define JUMP_(key, num_probes) ( num_probes ) - -// The smaller this is, the faster lookup is (because the group bitmap is -// smaller) and the faster insert is, because there's less to move. -// On the other hand, there are more groups. Since group::size_type is -// a short, this number should be of the form 32*x + 16 to avoid waste. -static const u_int16_t DEFAULT_GROUP_SIZE = 48; // fits in 1.5 words - -// Hashtable class, used to implement the hashed associative containers -// hash_set and hash_map. -// -// Value: what is stored in the table (each bucket is a Value). -// Key: something in a 1-to-1 correspondence to a Value, that can be used -// to search for a Value in the table (find() takes a Key). -// HashFcn: Takes a Key and returns an integer, the more unique the better. -// ExtractKey: given a Value, returns the unique Key associated with it. -// Must inherit from unary_function, or at least have a -// result_type enum indicating the return type of operator(). -// SetKey: given a Value* and a Key, modifies the value such that -// ExtractKey(value) == key. We guarantee this is only called -// with key == deleted_key. 
-// EqualKey: Given two Keys, says whether they are the same (that is, -// if they are both associated with the same Value). -// Alloc: STL allocator to use to allocate memory. - -template -class sparse_hashtable; - -template -struct sparse_hashtable_iterator; - -template -struct sparse_hashtable_const_iterator; - -// As far as iterating, we're basically just a sparsetable -// that skips over deleted elements. -template -struct sparse_hashtable_iterator { - private: - typedef typename A::template rebind::other value_alloc_type; - - public: - typedef sparse_hashtable_iterator iterator; - typedef sparse_hashtable_const_iterator const_iterator; - typedef typename sparsetable::nonempty_iterator - st_iterator; - - typedef std::forward_iterator_tag iterator_category; // very little defined! - typedef V value_type; - typedef typename value_alloc_type::difference_type difference_type; - typedef typename value_alloc_type::size_type size_type; - typedef typename value_alloc_type::reference reference; - typedef typename value_alloc_type::pointer pointer; - - // "Real" constructor and default constructor - sparse_hashtable_iterator(const sparse_hashtable *h, - st_iterator it, st_iterator it_end) - : ht(h), pos(it), end(it_end) { advance_past_deleted(); } - sparse_hashtable_iterator() { } // not ever used internally - // The default destructor is fine; we don't define one - // The default operator= is fine; we don't define one - - // Happy dereferencer - reference operator*() const { return *pos; } - pointer operator->() const { return &(operator*()); } - - // Arithmetic. The only hard part is making sure that - // we're not on a marked-deleted array element - void advance_past_deleted() { - while ( pos != end && ht->test_deleted(*this) ) - ++pos; - } - iterator& operator++() { - assert(pos != end); ++pos; advance_past_deleted(); return *this; - } - iterator operator++(int) { iterator tmp(*this); ++*this; return tmp; } - - // Comparison. - bool operator==(const iterator& it) const { return pos == it.pos; } - bool operator!=(const iterator& it) const { return pos != it.pos; } - - - // The actual data - const sparse_hashtable *ht; - st_iterator pos, end; -}; - -// Now do it all again, but with const-ness! -template -struct sparse_hashtable_const_iterator { - private: - typedef typename A::template rebind::other value_alloc_type; - - public: - typedef sparse_hashtable_iterator iterator; - typedef sparse_hashtable_const_iterator const_iterator; - typedef typename sparsetable::const_nonempty_iterator - st_iterator; - - typedef std::forward_iterator_tag iterator_category; // very little defined! 
- typedef V value_type; - typedef typename value_alloc_type::difference_type difference_type; - typedef typename value_alloc_type::size_type size_type; - typedef typename value_alloc_type::const_reference reference; - typedef typename value_alloc_type::const_pointer pointer; - - // "Real" constructor and default constructor - sparse_hashtable_const_iterator(const sparse_hashtable *h, - st_iterator it, st_iterator it_end) - : ht(h), pos(it), end(it_end) { advance_past_deleted(); } - // This lets us convert regular iterators to const iterators - sparse_hashtable_const_iterator() { } // never used internally - sparse_hashtable_const_iterator(const iterator &it) - : ht(it.ht), pos(it.pos), end(it.end) { } - // The default destructor is fine; we don't define one - // The default operator= is fine; we don't define one - - // Happy dereferencer - reference operator*() const { return *pos; } - pointer operator->() const { return &(operator*()); } - - // Arithmetic. The only hard part is making sure that - // we're not on a marked-deleted array element - void advance_past_deleted() { - while ( pos != end && ht->test_deleted(*this) ) - ++pos; - } - const_iterator& operator++() { - assert(pos != end); ++pos; advance_past_deleted(); return *this; - } - const_iterator operator++(int) { const_iterator tmp(*this); ++*this; return tmp; } - - // Comparison. - bool operator==(const const_iterator& it) const { return pos == it.pos; } - bool operator!=(const const_iterator& it) const { return pos != it.pos; } - - - // The actual data - const sparse_hashtable *ht; - st_iterator pos, end; -}; - -// And once again, but this time freeing up memory as we iterate -template -struct sparse_hashtable_destructive_iterator { - private: - typedef typename A::template rebind::other value_alloc_type; - - public: - typedef sparse_hashtable_destructive_iterator iterator; - typedef typename sparsetable::destructive_iterator - st_iterator; - - typedef std::forward_iterator_tag iterator_category; // very little defined! - typedef V value_type; - typedef typename value_alloc_type::difference_type difference_type; - typedef typename value_alloc_type::size_type size_type; - typedef typename value_alloc_type::reference reference; - typedef typename value_alloc_type::pointer pointer; - - // "Real" constructor and default constructor - sparse_hashtable_destructive_iterator(const - sparse_hashtable *h, - st_iterator it, st_iterator it_end) - : ht(h), pos(it), end(it_end) { advance_past_deleted(); } - sparse_hashtable_destructive_iterator() { } // never used internally - // The default destructor is fine; we don't define one - // The default operator= is fine; we don't define one - - // Happy dereferencer - reference operator*() const { return *pos; } - pointer operator->() const { return &(operator*()); } - - // Arithmetic. The only hard part is making sure that - // we're not on a marked-deleted array element - void advance_past_deleted() { - while ( pos != end && ht->test_deleted(*this) ) - ++pos; - } - iterator& operator++() { - assert(pos != end); ++pos; advance_past_deleted(); return *this; - } - iterator operator++(int) { iterator tmp(*this); ++*this; return tmp; } - - // Comparison. 
- bool operator==(const iterator& it) const { return pos == it.pos; } - bool operator!=(const iterator& it) const { return pos != it.pos; } - - - // The actual data - const sparse_hashtable *ht; - st_iterator pos, end; -}; - - -template -class sparse_hashtable { - private: - typedef typename Alloc::template rebind::other value_alloc_type; - - public: - typedef Key key_type; - typedef Value value_type; - typedef HashFcn hasher; - typedef EqualKey key_equal; - typedef Alloc allocator_type; - - typedef typename value_alloc_type::size_type size_type; - typedef typename value_alloc_type::difference_type difference_type; - typedef typename value_alloc_type::reference reference; - typedef typename value_alloc_type::const_reference const_reference; - typedef typename value_alloc_type::pointer pointer; - typedef typename value_alloc_type::const_pointer const_pointer; - typedef sparse_hashtable_iterator - iterator; - - typedef sparse_hashtable_const_iterator - const_iterator; - - typedef sparse_hashtable_destructive_iterator - destructive_iterator; - - // These come from tr1. For us they're the same as regular iterators. - typedef iterator local_iterator; - typedef const_iterator const_local_iterator; - - // How full we let the table get before we resize, by default. - // Knuth says .8 is good -- higher causes us to probe too much, - // though it saves memory. - static const int HT_OCCUPANCY_PCT; // = 80 (out of 100); - - // How empty we let the table get before we resize lower, by default. - // (0.0 means never resize lower.) - // It should be less than OCCUPANCY_PCT / 2 or we thrash resizing - static const int HT_EMPTY_PCT; // = 0.4 * HT_OCCUPANCY_PCT; - - // Minimum size we're willing to let hashtables be. - // Must be a power of two, and at least 4. - // Note, however, that for a given hashtable, the initial size is a - // function of the first constructor arg, and may be >HT_MIN_BUCKETS. - static const size_type HT_MIN_BUCKETS = 4; - - // By default, if you don't specify a hashtable size at - // construction-time, we use this size. Must be a power of two, and - // at least HT_MIN_BUCKETS. - static const size_type HT_DEFAULT_STARTING_BUCKETS = 32; - - // ITERATOR FUNCTIONS - iterator begin() { return iterator(this, table.nonempty_begin(), - table.nonempty_end()); } - iterator end() { return iterator(this, table.nonempty_end(), - table.nonempty_end()); } - const_iterator begin() const { return const_iterator(this, - table.nonempty_begin(), - table.nonempty_end()); } - const_iterator end() const { return const_iterator(this, - table.nonempty_end(), - table.nonempty_end()); } - - // These come from tr1 unordered_map. They iterate over 'bucket' n. - // For sparsehashtable, we could consider each 'group' to be a bucket, - // I guess, but I don't really see the point. We'll just consider - // bucket n to be the n-th element of the sparsetable, if it's occupied, - // or some empty element, otherwise. 
- local_iterator begin(size_type i) { - if (table.test(i)) - return local_iterator(this, table.get_iter(i), table.nonempty_end()); - else - return local_iterator(this, table.nonempty_end(), table.nonempty_end()); - } - local_iterator end(size_type i) { - local_iterator it = begin(i); - if (table.test(i) && !test_deleted(i)) - ++it; - return it; - } - const_local_iterator begin(size_type i) const { - if (table.test(i)) - return const_local_iterator(this, table.get_iter(i), - table.nonempty_end()); - else - return const_local_iterator(this, table.nonempty_end(), - table.nonempty_end()); - } - const_local_iterator end(size_type i) const { - const_local_iterator it = begin(i); - if (table.test(i) && !test_deleted(i)) - ++it; - return it; - } - - // This is used when resizing - destructive_iterator destructive_begin() { - return destructive_iterator(this, table.destructive_begin(), - table.destructive_end()); - } - destructive_iterator destructive_end() { - return destructive_iterator(this, table.destructive_end(), - table.destructive_end()); - } - - - // ACCESSOR FUNCTIONS for the things we templatize on, basically - hasher hash_funct() const { return settings; } - key_equal key_eq() const { return key_info; } - allocator_type get_allocator() const { return table.get_allocator(); } - - // Accessor function for statistics gathering. - int num_table_copies() const { return settings.num_ht_copies(); } - - private: - // We need to copy values when we set the special marker for deleted - // elements, but, annoyingly, we can't just use the copy assignment - // operator because value_type might not be assignable (it's often - // pair). We use explicit destructor invocation and - // placement new to get around this. Arg. - void set_value(pointer dst, const_reference src) { - dst->~value_type(); // delete the old value, if any - new(dst) value_type(src); - } - - // This is used as a tag for the copy constructor, saying to destroy its - // arg We have two ways of destructively copying: with potentially growing - // the hashtable as we copy, and without. To make sure the outside world - // can't do a destructive copy, we make the typename private. - enum MoveDontCopyT {MoveDontCopy, MoveDontGrow}; - - // DELETE HELPER FUNCTIONS - // This lets the user describe a key that will indicate deleted - // table entries. This key should be an "impossible" entry -- - // if you try to insert it for real, you won't be able to retrieve it! - // (NB: while you pass in an entire value, only the key part is looked - // at. This is just because I don't know how to assign just a key.) - private: - void squash_deleted() { // gets rid of any deleted entries we have - if ( num_deleted ) { // get rid of deleted before writing - sparse_hashtable tmp(MoveDontGrow, *this); - swap(tmp); // now we are tmp - } - assert(num_deleted == 0); - } - - // Test if the given key is the deleted indicator. Requires - // num_deleted > 0, for correctness of read(), and because that - // guarantees that key_info.delkey is valid. 
- bool test_deleted_key(const key_type& key) const { - assert(num_deleted > 0); - return equals(key_info.delkey, key); - } - - public: - void set_deleted_key(const key_type &key) { - // It's only safe to change what "deleted" means if we purge deleted guys - squash_deleted(); - settings.set_use_deleted(true); - key_info.delkey = key; - } - void clear_deleted_key() { - squash_deleted(); - settings.set_use_deleted(false); - } - key_type deleted_key() const { - assert(settings.use_deleted() - && "Must set deleted key before calling deleted_key"); - return key_info.delkey; - } - - // These are public so the iterators can use them - // True if the item at position bucknum is "deleted" marker - bool test_deleted(size_type bucknum) const { - // Invariant: !use_deleted() implies num_deleted is 0. - assert(settings.use_deleted() || num_deleted == 0); - return num_deleted > 0 && table.test(bucknum) && - test_deleted_key(get_key(table.unsafe_get(bucknum))); - } - bool test_deleted(const iterator &it) const { - // Invariant: !use_deleted() implies num_deleted is 0. - assert(settings.use_deleted() || num_deleted == 0); - return num_deleted > 0 && test_deleted_key(get_key(*it)); - } - bool test_deleted(const const_iterator &it) const { - // Invariant: !use_deleted() implies num_deleted is 0. - assert(settings.use_deleted() || num_deleted == 0); - return num_deleted > 0 && test_deleted_key(get_key(*it)); - } - bool test_deleted(const destructive_iterator &it) const { - // Invariant: !use_deleted() implies num_deleted is 0. - assert(settings.use_deleted() || num_deleted == 0); - return num_deleted > 0 && test_deleted_key(get_key(*it)); - } - - private: - void check_use_deleted(const char* caller) { - (void)caller; // could log it if the assert failed - assert(settings.use_deleted()); - } - - // Set it so test_deleted is true. true if object didn't used to be deleted. - // TODO(csilvers): make these private (also in densehashtable.h) - bool set_deleted(iterator &it) { - check_use_deleted("set_deleted()"); - bool retval = !test_deleted(it); - // &* converts from iterator to value-type. - set_key(&(*it), key_info.delkey); - return retval; - } - // Set it so test_deleted is false. true if object used to be deleted. - bool clear_deleted(iterator &it) { - check_use_deleted("clear_deleted()"); - // Happens automatically when we assign something else in its place. - return test_deleted(it); - } - - // We also allow to set/clear the deleted bit on a const iterator. - // We allow a const_iterator for the same reason you can delete a - // const pointer: it's convenient, and semantically you can't use - // 'it' after it's been deleted anyway, so its const-ness doesn't - // really matter. - bool set_deleted(const_iterator &it) { - check_use_deleted("set_deleted()"); - bool retval = !test_deleted(it); - set_key(const_cast(&(*it)), key_info.delkey); - return retval; - } - // Set it so test_deleted is false. true if object used to be deleted. - bool clear_deleted(const_iterator &it) { - check_use_deleted("clear_deleted()"); - return test_deleted(it); - } - - // FUNCTIONS CONCERNING SIZE - public: - size_type size() const { return table.num_nonempty() - num_deleted; } - size_type max_size() const { return table.max_size(); } - bool empty() const { return size() == 0; } - size_type bucket_count() const { return table.size(); } - size_type max_bucket_count() const { return max_size(); } - // These are tr1 methods. Their idea of 'bucket' doesn't map well to - // what we do. We just say every bucket has 0 or 1 items in it. 
- size_type bucket_size(size_type i) const { - return begin(i) == end(i) ? 0 : 1; - } - - private: - // Because of the above, size_type(-1) is never legal; use it for errors - static const size_type ILLEGAL_BUCKET = size_type(-1); - - // Used after a string of deletes. Returns true if we actually shrunk. - // TODO(csilvers): take a delta so we can take into account inserts - // done after shrinking. Maybe make part of the Settings class? - bool maybe_shrink() { - assert(table.num_nonempty() >= num_deleted); - assert((bucket_count() & (bucket_count()-1)) == 0); // is a power of two - assert(bucket_count() >= HT_MIN_BUCKETS); - bool retval = false; - - // If you construct a hashtable with < HT_DEFAULT_STARTING_BUCKETS, - // we'll never shrink until you get relatively big, and we'll never - // shrink below HT_DEFAULT_STARTING_BUCKETS. Otherwise, something - // like "dense_hash_set x; x.insert(4); x.erase(4);" will - // shrink us down to HT_MIN_BUCKETS buckets, which is too small. - const size_type num_remain = table.num_nonempty() - num_deleted; - const size_type shrink_threshold = settings.shrink_threshold(); - if (shrink_threshold > 0 && num_remain < shrink_threshold && - bucket_count() > HT_DEFAULT_STARTING_BUCKETS) { - const float shrink_factor = settings.shrink_factor(); - size_type sz = bucket_count() / 2; // find how much we should shrink - while (sz > HT_DEFAULT_STARTING_BUCKETS && - num_remain < static_cast(sz * shrink_factor)) { - sz /= 2; // stay a power of 2 - } - sparse_hashtable tmp(MoveDontCopy, *this, sz); - swap(tmp); // now we are tmp - retval = true; - } - settings.set_consider_shrink(false); // because we just considered it - return retval; - } - - // We'll let you resize a hashtable -- though this makes us copy all! - // When you resize, you say, "make it big enough for this many more elements" - // Returns true if we actually resized, false if size was already ok. - bool resize_delta(size_type delta) { - bool did_resize = false; - if ( settings.consider_shrink() ) { // see if lots of deletes happened - if ( maybe_shrink() ) - did_resize = true; - } - if (table.num_nonempty() >= - (std::numeric_limits::max)() - delta) { - throw std::length_error("resize overflow"); - } - if ( bucket_count() >= HT_MIN_BUCKETS && - (table.num_nonempty() + delta) <= settings.enlarge_threshold() ) - return did_resize; // we're ok as we are - - // Sometimes, we need to resize just to get rid of all the - // "deleted" buckets that are clogging up the hashtable. So when - // deciding whether to resize, count the deleted buckets (which - // are currently taking up room). But later, when we decide what - // size to resize to, *don't* count deleted buckets, since they - // get discarded during the resize. - const size_type needed_size = - settings.min_buckets(table.num_nonempty() + delta, 0); - if ( needed_size <= bucket_count() ) // we have enough buckets - return did_resize; - - size_type resize_to = - settings.min_buckets(table.num_nonempty() - num_deleted + delta, - bucket_count()); - if (resize_to < needed_size && // may double resize_to - resize_to < (std::numeric_limits::max)() / 2) { - // This situation means that we have enough deleted elements, - // that once we purge them, we won't actually have needed to - // grow. But we may want to grow anyway: if we just purge one - // element, say, we'll have to grow anyway next time we - // insert. Might as well grow now, since we're already going - // through the trouble of copying (in order to purge the - // deleted elements). 
- const size_type target = - static_cast(settings.shrink_size(resize_to*2)); - if (table.num_nonempty() - num_deleted + delta >= target) { - // Good, we won't be below the shrink threshhold even if we double. - resize_to *= 2; - } - } - - sparse_hashtable tmp(MoveDontCopy, *this, resize_to); - swap(tmp); // now we are tmp - return true; - } - - // Used to actually do the rehashing when we grow/shrink a hashtable - void copy_from(const sparse_hashtable &ht, size_type min_buckets_wanted) { - clear(); // clear table, set num_deleted to 0 - - // If we need to change the size of our table, do it now - const size_type resize_to = - settings.min_buckets(ht.size(), min_buckets_wanted); - if ( resize_to > bucket_count() ) { // we don't have enough buckets - table.resize(resize_to); // sets the number of buckets - settings.reset_thresholds(bucket_count()); - } - - // We use a normal iterator to get non-deleted bcks from ht - // We could use insert() here, but since we know there are - // no duplicates and no deleted items, we can be more efficient - assert((bucket_count() & (bucket_count()-1)) == 0); // a power of two - for ( const_iterator it = ht.begin(); it != ht.end(); ++it ) { - size_type num_probes = 0; // how many times we've probed - size_type bucknum; - const size_type bucket_count_minus_one = bucket_count() - 1; - for (bucknum = hash(get_key(*it)) & bucket_count_minus_one; - table.test(bucknum); // not empty - bucknum = (bucknum + JUMP_(key, num_probes)) & bucket_count_minus_one) { - ++num_probes; - assert(num_probes < bucket_count() - && "Hashtable is full: an error in key_equal<> or hash<>"); - } - table.set(bucknum, *it); // copies the value to here - } - settings.inc_num_ht_copies(); - } - - // Implementation is like copy_from, but it destroys the table of the - // "from" guy by freeing sparsetable memory as we iterate. This is - // useful in resizing, since we're throwing away the "from" guy anyway. 
- void move_from(MoveDontCopyT mover, sparse_hashtable &ht, - size_type min_buckets_wanted) { - clear(); // clear table, set num_deleted to 0 - - // If we need to change the size of our table, do it now - size_type resize_to; - if ( mover == MoveDontGrow ) - resize_to = ht.bucket_count(); // keep same size as old ht - else // MoveDontCopy - resize_to = settings.min_buckets(ht.size(), min_buckets_wanted); - if ( resize_to > bucket_count() ) { // we don't have enough buckets - table.resize(resize_to); // sets the number of buckets - settings.reset_thresholds(bucket_count()); - } - - // We use a normal iterator to get non-deleted bcks from ht - // We could use insert() here, but since we know there are - // no duplicates and no deleted items, we can be more efficient - assert( (bucket_count() & (bucket_count()-1)) == 0); // a power of two - // THIS IS THE MAJOR LINE THAT DIFFERS FROM COPY_FROM(): - for ( destructive_iterator it = ht.destructive_begin(); - it != ht.destructive_end(); ++it ) { - size_type num_probes = 0; // how many times we've probed - size_type bucknum; - for ( bucknum = hash(get_key(*it)) & (bucket_count()-1); // h % buck_cnt - table.test(bucknum); // not empty - bucknum = (bucknum + JUMP_(key, num_probes)) & (bucket_count()-1) ) { - ++num_probes; - assert(num_probes < bucket_count() - && "Hashtable is full: an error in key_equal<> or hash<>"); - } - table.set(bucknum, *it); // copies the value to here - } - settings.inc_num_ht_copies(); - } - - - // Required by the spec for hashed associative container - public: - // Though the docs say this should be num_buckets, I think it's much - // more useful as num_elements. As a special feature, calling with - // req_elements==0 will cause us to shrink if we can, saving space. - void resize(size_type req_elements) { // resize to this or larger - if ( settings.consider_shrink() || req_elements == 0 ) - maybe_shrink(); - if ( req_elements > table.num_nonempty() ) // we only grow - resize_delta(req_elements - table.num_nonempty()); - } - - // Get and change the value of shrink_factor and enlarge_factor. The - // description at the beginning of this file explains how to choose - // the values. Setting the shrink parameter to 0.0 ensures that the - // table never shrinks. - void get_resizing_parameters(float* shrink, float* grow) const { - *shrink = settings.shrink_factor(); - *grow = settings.enlarge_factor(); - } - void set_resizing_parameters(float shrink, float grow) { - settings.set_resizing_parameters(shrink, grow); - settings.reset_thresholds(bucket_count()); - } - - // CONSTRUCTORS -- as required by the specs, we take a size, - // but also let you specify a hashfunction, key comparator, - // and key extractor. We also define a copy constructor and =. - // DESTRUCTOR -- the default is fine, surprisingly. - explicit sparse_hashtable(size_type expected_max_items_in_table = 0, - const HashFcn& hf = HashFcn(), - const EqualKey& eql = EqualKey(), - const ExtractKey& ext = ExtractKey(), - const SetKey& set = SetKey(), - const Alloc& alloc = Alloc()) - : settings(hf), - key_info(ext, set, eql), - num_deleted(0), - table((expected_max_items_in_table == 0 - ? HT_DEFAULT_STARTING_BUCKETS - : settings.min_buckets(expected_max_items_in_table, 0)), - alloc) { - settings.reset_thresholds(bucket_count()); - } - - // As a convenience for resize(), we allow an optional second argument - // which lets you make this new hashtable a different size than ht. 
- // We also provide a mechanism of saying you want to "move" the ht argument - // into us instead of copying. - sparse_hashtable(const sparse_hashtable& ht, - size_type min_buckets_wanted = HT_DEFAULT_STARTING_BUCKETS) - : settings(ht.settings), - key_info(ht.key_info), - num_deleted(0), - table(0, ht.get_allocator()) { - settings.reset_thresholds(bucket_count()); - copy_from(ht, min_buckets_wanted); // copy_from() ignores deleted entries - } - sparse_hashtable(MoveDontCopyT mover, sparse_hashtable& ht, - size_type min_buckets_wanted = HT_DEFAULT_STARTING_BUCKETS) - : settings(ht.settings), - key_info(ht.key_info), - num_deleted(0), - table(0, ht.get_allocator()) { - settings.reset_thresholds(bucket_count()); - move_from(mover, ht, min_buckets_wanted); // ignores deleted entries - } - - sparse_hashtable& operator= (const sparse_hashtable& ht) { - if (&ht == this) return *this; // don't copy onto ourselves - settings = ht.settings; - key_info = ht.key_info; - num_deleted = ht.num_deleted; - // copy_from() calls clear and sets num_deleted to 0 too - copy_from(ht, HT_MIN_BUCKETS); - // we purposefully don't copy the allocator, which may not be copyable - return *this; - } - - // Many STL algorithms use swap instead of copy constructors - void swap(sparse_hashtable& ht) { - std::swap(settings, ht.settings); - std::swap(key_info, ht.key_info); - std::swap(num_deleted, ht.num_deleted); - table.swap(ht.table); - settings.reset_thresholds(bucket_count()); // also resets consider_shrink - ht.settings.reset_thresholds(ht.bucket_count()); - // we purposefully don't swap the allocator, which may not be swap-able - } - - // It's always nice to be able to clear a table without deallocating it - void clear() { - if (!empty() || (num_deleted != 0)) { - table.clear(); - } - settings.reset_thresholds(bucket_count()); - num_deleted = 0; - } - - // LOOKUP ROUTINES - private: - // Returns a pair of positions: 1st where the object is, 2nd where - // it would go if you wanted to insert it. 1st is ILLEGAL_BUCKET - // if object is not found; 2nd is ILLEGAL_BUCKET if it is. 
- // Note: because of deletions where-to-insert is not trivial: it's the - // first deleted bucket we see, as long as we don't find the key later - std::pair find_position(const key_type &key) const { - size_type num_probes = 0; // how many times we've probed - const size_type bucket_count_minus_one = bucket_count() - 1; - size_type bucknum = hash(key) & bucket_count_minus_one; - size_type insert_pos = ILLEGAL_BUCKET; // where we would insert - SPARSEHASH_STAT_UPDATE(total_lookups += 1); - while ( 1 ) { // probe until something happens - if ( !table.test(bucknum) ) { // bucket is empty - SPARSEHASH_STAT_UPDATE(total_probes += num_probes); - if ( insert_pos == ILLEGAL_BUCKET ) // found no prior place to insert - return std::pair(ILLEGAL_BUCKET, bucknum); - else - return std::pair(ILLEGAL_BUCKET, insert_pos); - - } else if ( test_deleted(bucknum) ) {// keep searching, but mark to insert - if ( insert_pos == ILLEGAL_BUCKET ) - insert_pos = bucknum; - - } else if ( equals(key, get_key(table.unsafe_get(bucknum))) ) { - SPARSEHASH_STAT_UPDATE(total_probes += num_probes); - return std::pair(bucknum, ILLEGAL_BUCKET); - } - ++num_probes; // we're doing another probe - bucknum = (bucknum + JUMP_(key, num_probes)) & bucket_count_minus_one; - assert(num_probes < bucket_count() - && "Hashtable is full: an error in key_equal<> or hash<>"); - } - } - - public: - - iterator find(const key_type& key) { - if ( size() == 0 ) return end(); - std::pair pos = find_position(key); - if ( pos.first == ILLEGAL_BUCKET ) // alas, not there - return end(); - else - return iterator(this, table.get_iter(pos.first), table.nonempty_end()); - } - - const_iterator find(const key_type& key) const { - if ( size() == 0 ) return end(); - std::pair pos = find_position(key); - if ( pos.first == ILLEGAL_BUCKET ) // alas, not there - return end(); - else - return const_iterator(this, - table.get_iter(pos.first), table.nonempty_end()); - } - - // This is a tr1 method: the bucket a given key is in, or what bucket - // it would be put in, if it were to be inserted. Shrug. - size_type bucket(const key_type& key) const { - std::pair pos = find_position(key); - return pos.first == ILLEGAL_BUCKET ? pos.second : pos.first; - } - - // Counts how many elements have key key. For maps, it's either 0 or 1. - size_type count(const key_type &key) const { - std::pair pos = find_position(key); - return pos.first == ILLEGAL_BUCKET ? 0 : 1; - } - - // Likewise, equal_range doesn't really make sense for us. Oh well. - std::pair equal_range(const key_type& key) { - iterator pos = find(key); // either an iterator or end - if (pos == end()) { - return std::pair(pos, pos); - } else { - const iterator startpos = pos++; - return std::pair(startpos, pos); - } - } - std::pair equal_range(const key_type& key) - const { - const_iterator pos = find(key); // either an iterator or end - if (pos == end()) { - return std::pair(pos, pos); - } else { - const const_iterator startpos = pos++; - return std::pair(startpos, pos); - } - } - - - // INSERTION ROUTINES - private: - // Private method used by insert_noresize and find_or_insert. - iterator insert_at(const_reference obj, size_type pos) { - if (size() >= max_size()) { - throw std::length_error("insert overflow"); - } - if ( test_deleted(pos) ) { // just replace if it's been deleted - // The set() below will undelete this object. 
We just worry about stats - assert(num_deleted > 0); - --num_deleted; // used to be, now it isn't - } - table.set(pos, obj); - return iterator(this, table.get_iter(pos), table.nonempty_end()); - } - - // If you know *this is big enough to hold obj, use this routine - std::pair insert_noresize(const_reference obj) { - // First, double-check we're not inserting delkey - assert((!settings.use_deleted() || !equals(get_key(obj), key_info.delkey)) - && "Inserting the deleted key"); - const std::pair pos = find_position(get_key(obj)); - if ( pos.first != ILLEGAL_BUCKET) { // object was already there - return std::pair(iterator(this, table.get_iter(pos.first), - table.nonempty_end()), - false); // false: we didn't insert - } else { // pos.second says where to put it - return std::pair(insert_at(obj, pos.second), true); - } - } - - // Specializations of insert(it, it) depending on the power of the iterator: - // (1) Iterator supports operator-, resize before inserting - template - void insert(ForwardIterator f, ForwardIterator l, std::forward_iterator_tag) { - size_t dist = std::distance(f, l); - if (dist >= (std::numeric_limits::max)()) { - throw std::length_error("insert-range overflow"); - } - resize_delta(static_cast(dist)); - for ( ; dist > 0; --dist, ++f) { - insert_noresize(*f); - } - } - - // (2) Arbitrary iterator, can't tell how much to resize - template - void insert(InputIterator f, InputIterator l, std::input_iterator_tag) { - for ( ; f != l; ++f) - insert(*f); - } - - public: - // This is the normal insert routine, used by the outside world - std::pair insert(const_reference obj) { - resize_delta(1); // adding an object, grow if need be - return insert_noresize(obj); - } - - // When inserting a lot at a time, we specialize on the type of iterator - template - void insert(InputIterator f, InputIterator l) { - // specializes on iterator type - insert(f, l, - typename std::iterator_traits::iterator_category()); - } - - // DefaultValue is a functor that takes a key and returns a value_type - // representing the default value to be inserted if none is found. - template - value_type& find_or_insert(const key_type& key) { - // First, double-check we're not inserting delkey - assert((!settings.use_deleted() || !equals(key, key_info.delkey)) - && "Inserting the deleted key"); - const std::pair pos = find_position(key); - DefaultValue default_value; - if ( pos.first != ILLEGAL_BUCKET) { // object was already there - return *table.get_iter(pos.first); - } else if (resize_delta(1)) { // needed to rehash to make room - // Since we resized, we can't use pos, so recalculate where to insert. - return *insert_noresize(default_value(key)).first; - } else { // no need to rehash, insert right here - return *insert_at(default_value(key), pos.second); - } - } - - // DELETION ROUTINES - size_type erase(const key_type& key) { - // First, double-check we're not erasing delkey. - assert((!settings.use_deleted() || !equals(key, key_info.delkey)) - && "Erasing the deleted key"); - assert(!settings.use_deleted() || !equals(key, key_info.delkey)); - const_iterator pos = find(key); // shrug: shouldn't need to be const - if ( pos != end() ) { - assert(!test_deleted(pos)); // or find() shouldn't have returned it - set_deleted(pos); - ++num_deleted; - // will think about shrink after next insert - settings.set_consider_shrink(true); - return 1; // because we deleted one thing - } else { - return 0; // because we deleted nothing - } - } - - // We return the iterator past the deleted item. 
- void erase(iterator pos) { - if ( pos == end() ) return; // sanity check - if ( set_deleted(pos) ) { // true if object has been newly deleted - ++num_deleted; - // will think about shrink after next insert - settings.set_consider_shrink(true); - } - } - - void erase(iterator f, iterator l) { - for ( ; f != l; ++f) { - if ( set_deleted(f) ) // should always be true - ++num_deleted; - } - // will think about shrink after next insert - settings.set_consider_shrink(true); - } - - // We allow you to erase a const_iterator just like we allow you to - // erase an iterator. This is in parallel to 'delete': you can delete - // a const pointer just like a non-const pointer. The logic is that - // you can't use the object after it's erased anyway, so it doesn't matter - // if it's const or not. - void erase(const_iterator pos) { - if ( pos == end() ) return; // sanity check - if ( set_deleted(pos) ) { // true if object has been newly deleted - ++num_deleted; - // will think about shrink after next insert - settings.set_consider_shrink(true); - } - } - void erase(const_iterator f, const_iterator l) { - for ( ; f != l; ++f) { - if ( set_deleted(f) ) // should always be true - ++num_deleted; - } - // will think about shrink after next insert - settings.set_consider_shrink(true); - } - - - // COMPARISON - bool operator==(const sparse_hashtable& ht) const { - if (size() != ht.size()) { - return false; - } else if (this == &ht) { - return true; - } else { - // Iterate through the elements in "this" and see if the - // corresponding element is in ht - for ( const_iterator it = begin(); it != end(); ++it ) { - const_iterator it2 = ht.find(get_key(*it)); - if ((it2 == ht.end()) || (*it != *it2)) { - return false; - } - } - return true; - } - } - bool operator!=(const sparse_hashtable& ht) const { - return !(*this == ht); - } - - - // I/O - // We support reading and writing hashtables to disk. NOTE that - // this only stores the hashtable metadata, not the stuff you've - // actually put in the hashtable! Alas, since I don't know how to - // write a hasher or key_equal, you have to make sure everything - // but the table is the same. We compact before writing. - // - // The OUTPUT type needs to support a Write() operation. File and - // OutputBuffer are appropriate types to pass in. - // - // The INPUT type needs to support a Read() operation. File and - // InputBuffer are appropriate types to pass in. - template - bool write_metadata(OUTPUT *fp) { - squash_deleted(); // so we don't have to worry about delkey - return table.write_metadata(fp); - } - - template - bool read_metadata(INPUT *fp) { - num_deleted = 0; // since we got rid before writing - const bool result = table.read_metadata(fp); - settings.reset_thresholds(bucket_count()); - return result; - } - - // Only meaningful if value_type is a POD. - template - bool write_nopointer_data(OUTPUT *fp) { - return table.write_nopointer_data(fp); - } - - // Only meaningful if value_type is a POD. - template - bool read_nopointer_data(INPUT *fp) { - return table.read_nopointer_data(fp); - } - - // INPUT and OUTPUT must be either a FILE, *or* a C++ stream - // (istream, ostream, etc) *or* a class providing - // Read(void*, size_t) and Write(const void*, size_t) - // (respectively), which writes a buffer into a stream - // (which the INPUT/OUTPUT instance presumably owns). - - typedef sparsehash_internal::pod_serializer NopointerSerializer; - - // ValueSerializer: a functor. 
operator()(OUTPUT*, const value_type&) - template - bool serialize(ValueSerializer serializer, OUTPUT *fp) { - squash_deleted(); // so we don't have to worry about delkey - return table.serialize(serializer, fp); - } - - // ValueSerializer: a functor. operator()(INPUT*, value_type*) - template - bool unserialize(ValueSerializer serializer, INPUT *fp) { - num_deleted = 0; // since we got rid before writing - const bool result = table.unserialize(serializer, fp); - settings.reset_thresholds(bucket_count()); - return result; - } - - private: - // Table is the main storage class. - typedef sparsetable Table; - - // Package templated functors with the other types to eliminate memory - // needed for storing these zero-size operators. Since ExtractKey and - // hasher's operator() might have the same function signature, they - // must be packaged in different classes. - struct Settings : - sparsehash_internal::sh_hashtable_settings { - explicit Settings(const hasher& hf) - : sparsehash_internal::sh_hashtable_settings( - hf, HT_OCCUPANCY_PCT / 100.0f, HT_EMPTY_PCT / 100.0f) {} - }; - - // KeyInfo stores delete key and packages zero-size functors: - // ExtractKey and SetKey. - class KeyInfo : public ExtractKey, public SetKey, public EqualKey { - public: - KeyInfo(const ExtractKey& ek, const SetKey& sk, const EqualKey& eq) - : ExtractKey(ek), - SetKey(sk), - EqualKey(eq) { - } - // We want to return the exact same type as ExtractKey: Key or const Key& - typename ExtractKey::result_type get_key(const_reference v) const { - return ExtractKey::operator()(v); - } - void set_key(pointer v, const key_type& k) const { - SetKey::operator()(v, k); - } - bool equals(const key_type& a, const key_type& b) const { - return EqualKey::operator()(a, b); - } - - // Which key marks deleted entries. - // TODO(csilvers): make a pointer, and get rid of use_deleted (benchmark!) - typename base::remove_const::type delkey; - }; - - // Utility functions to access the templated operators - size_type hash(const key_type& v) const { - return settings.hash(v); - } - bool equals(const key_type& a, const key_type& b) const { - return key_info.equals(a, b); - } - typename ExtractKey::result_type get_key(const_reference v) const { - return key_info.get_key(v); - } - void set_key(pointer v, const key_type& k) const { - key_info.set_key(v, k); - } - - private: - // Actual data - Settings settings; - KeyInfo key_info; - size_type num_deleted; // how many occupied buckets are marked deleted - Table table; // holds num_buckets and num_elements too -}; - - -// We need a global swap as well -template -inline void swap(sparse_hashtable &x, - sparse_hashtable &y) { - x.swap(y); -} - -#undef JUMP_ - -template -const typename sparse_hashtable::size_type - sparse_hashtable::ILLEGAL_BUCKET; - -// How full we let the table get before we resize. Knuth says .8 is -// good -- higher causes us to probe too much, though saves memory -template -const int sparse_hashtable::HT_OCCUPANCY_PCT = 80; - -// How empty we let the table get before we resize lower. 
-// It should be less than OCCUPANCY_PCT / 2 or we thrash resizing -template -const int sparse_hashtable::HT_EMPTY_PCT - = static_cast(0.4 * - sparse_hashtable::HT_OCCUPANCY_PCT); - -_END_GOOGLE_NAMESPACE_ - -#endif /* _SPARSEHASHTABLE_H_ */ diff --git a/contrib/libsparsehash/sparsehash/sparse_hash_map b/contrib/libsparsehash/sparsehash/sparse_hash_map deleted file mode 100644 index 1687a8b11c6..00000000000 --- a/contrib/libsparsehash/sparsehash/sparse_hash_map +++ /dev/null @@ -1,363 +0,0 @@ -// Copyright (c) 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// --- -// -// This is just a very thin wrapper over sparsehashtable.h, just -// like sgi stl's stl_hash_map is a very thin wrapper over -// stl_hashtable. The major thing we define is operator[], because -// we have a concept of a data_type which stl_hashtable doesn't -// (it only has a key and a value). -// -// We adhere mostly to the STL semantics for hash-map. One important -// exception is that insert() may invalidate iterators entirely -- STL -// semantics are that insert() may reorder iterators, but they all -// still refer to something valid in the hashtable. Not so for us. -// Likewise, insert() may invalidate pointers into the hashtable. -// (Whether insert invalidates iterators and pointers depends on -// whether it results in a hashtable resize). On the plus side, -// delete() doesn't invalidate iterators or pointers at all, or even -// change the ordering of elements. -// -// Here are a few "power user" tips: -// -// 1) set_deleted_key(): -// Unlike STL's hash_map, if you want to use erase() you -// *must* call set_deleted_key() after construction. -// -// 2) resize(0): -// When an item is deleted, its memory isn't freed right -// away. This is what allows you to iterate over a hashtable -// and call erase() without invalidating the iterator. -// To force the memory to be freed, call resize(0). -// For tr1 compatibility, this can also be called as rehash(0). 
-// -// 3) min_load_factor(0.0) -// Setting the minimum load factor to 0.0 guarantees that -// the hash table will never shrink. -// -// Roughly speaking: -// (1) dense_hash_map: fastest, uses the most memory unless entries are small -// (2) sparse_hash_map: slowest, uses the least memory -// (3) hash_map / unordered_map (STL): in the middle -// -// Typically I use sparse_hash_map when I care about space and/or when -// I need to save the hashtable on disk. I use hash_map otherwise. I -// don't personally use dense_hash_map ever; some people use it for -// small maps with lots of lookups. -// -// - dense_hash_map has, typically, about 78% memory overhead (if your -// data takes up X bytes, the hash_map uses .78X more bytes in overhead). -// - sparse_hash_map has about 4 bits overhead per entry. -// - sparse_hash_map can be 3-7 times slower than the others for lookup and, -// especially, inserts. See time_hash_map.cc for details. -// -// See /usr/(local/)?doc/sparsehash-*/sparse_hash_map.html -// for information about how to use this class. - -#ifndef _SPARSE_HASH_MAP_H_ -#define _SPARSE_HASH_MAP_H_ - -#include -#include // needed by stl_alloc -#include // for equal_to<>, select1st<>, etc -#include // for alloc -#include // for pair<> -#include -#include // IWYU pragma: export -#include HASH_FUN_H // for hash<> -_START_GOOGLE_NAMESPACE_ - -template , // defined in sparseconfig.h - class EqualKey = std::equal_to, - class Alloc = libc_allocator_with_realloc > > -class sparse_hash_map { - private: - // Apparently select1st is not stl-standard, so we define our own - struct SelectKey { - typedef const Key& result_type; - const Key& operator()(const std::pair& p) const { - return p.first; - } - }; - struct SetKey { - void operator()(std::pair* value, const Key& new_key) const { - *const_cast(&value->first) = new_key; - // It would be nice to clear the rest of value here as well, in - // case it's taking up a lot of memory. We do this by clearing - // the value. This assumes T has a zero-arg constructor! - value->second = T(); - } - }; - // For operator[]. - struct DefaultValue { - std::pair operator()(const Key& key) { - return std::make_pair(key, T()); - } - }; - - // The actual data - typedef sparse_hashtable, Key, HashFcn, SelectKey, - SetKey, EqualKey, Alloc> ht; - ht rep; - - public: - typedef typename ht::key_type key_type; - typedef T data_type; - typedef T mapped_type; - typedef typename ht::value_type value_type; - typedef typename ht::hasher hasher; - typedef typename ht::key_equal key_equal; - typedef Alloc allocator_type; - - typedef typename ht::size_type size_type; - typedef typename ht::difference_type difference_type; - typedef typename ht::pointer pointer; - typedef typename ht::const_pointer const_pointer; - typedef typename ht::reference reference; - typedef typename ht::const_reference const_reference; - - typedef typename ht::iterator iterator; - typedef typename ht::const_iterator const_iterator; - typedef typename ht::local_iterator local_iterator; - typedef typename ht::const_local_iterator const_local_iterator; - - // Iterator functions - iterator begin() { return rep.begin(); } - iterator end() { return rep.end(); } - const_iterator begin() const { return rep.begin(); } - const_iterator end() const { return rep.end(); } - - // These come from tr1's unordered_map. For us, a bucket has 0 or 1 elements. 
- local_iterator begin(size_type i) { return rep.begin(i); } - local_iterator end(size_type i) { return rep.end(i); } - const_local_iterator begin(size_type i) const { return rep.begin(i); } - const_local_iterator end(size_type i) const { return rep.end(i); } - - // Accessor functions - allocator_type get_allocator() const { return rep.get_allocator(); } - hasher hash_funct() const { return rep.hash_funct(); } - hasher hash_function() const { return hash_funct(); } - key_equal key_eq() const { return rep.key_eq(); } - - - // Constructors - explicit sparse_hash_map(size_type expected_max_items_in_table = 0, - const hasher& hf = hasher(), - const key_equal& eql = key_equal(), - const allocator_type& alloc = allocator_type()) - : rep(expected_max_items_in_table, hf, eql, SelectKey(), SetKey(), alloc) { - } - - template - sparse_hash_map(InputIterator f, InputIterator l, - size_type expected_max_items_in_table = 0, - const hasher& hf = hasher(), - const key_equal& eql = key_equal(), - const allocator_type& alloc = allocator_type()) - : rep(expected_max_items_in_table, hf, eql, SelectKey(), SetKey(), alloc) { - rep.insert(f, l); - } - // We use the default copy constructor - // We use the default operator=() - // We use the default destructor - - void clear() { rep.clear(); } - void swap(sparse_hash_map& hs) { rep.swap(hs.rep); } - - - // Functions concerning size - size_type size() const { return rep.size(); } - size_type max_size() const { return rep.max_size(); } - bool empty() const { return rep.empty(); } - size_type bucket_count() const { return rep.bucket_count(); } - size_type max_bucket_count() const { return rep.max_bucket_count(); } - - // These are tr1 methods. bucket() is the bucket the key is or would be in. - size_type bucket_size(size_type i) const { return rep.bucket_size(i); } - size_type bucket(const key_type& key) const { return rep.bucket(key); } - float load_factor() const { - return size() * 1.0f / bucket_count(); - } - float max_load_factor() const { - float shrink, grow; - rep.get_resizing_parameters(&shrink, &grow); - return grow; - } - void max_load_factor(float new_grow) { - float shrink, grow; - rep.get_resizing_parameters(&shrink, &grow); - rep.set_resizing_parameters(shrink, new_grow); - } - // These aren't tr1 methods but perhaps ought to be. - float min_load_factor() const { - float shrink, grow; - rep.get_resizing_parameters(&shrink, &grow); - return shrink; - } - void min_load_factor(float new_shrink) { - float shrink, grow; - rep.get_resizing_parameters(&shrink, &grow); - rep.set_resizing_parameters(new_shrink, grow); - } - // Deprecated; use min_load_factor() or max_load_factor() instead. - void set_resizing_parameters(float shrink, float grow) { - rep.set_resizing_parameters(shrink, grow); - } - - void resize(size_type hint) { rep.resize(hint); } - void rehash(size_type hint) { resize(hint); } // the tr1 name - - // Lookup routines - iterator find(const key_type& key) { return rep.find(key); } - const_iterator find(const key_type& key) const { return rep.find(key); } - - data_type& operator[](const key_type& key) { // This is our value-add! - // If key is in the hashtable, returns find(key)->second, - // otherwise returns insert(value_type(key, T()).first->second. - // Note it does not create an empty T unless the find fails. 
- return rep.template find_or_insert(key).second; - } - - size_type count(const key_type& key) const { return rep.count(key); } - - std::pair equal_range(const key_type& key) { - return rep.equal_range(key); - } - std::pair equal_range(const key_type& key) - const { - return rep.equal_range(key); - } - - // Insertion routines - std::pair insert(const value_type& obj) { - return rep.insert(obj); - } - template void insert(InputIterator f, InputIterator l) { - rep.insert(f, l); - } - void insert(const_iterator f, const_iterator l) { - rep.insert(f, l); - } - // Required for std::insert_iterator; the passed-in iterator is ignored. - iterator insert(iterator, const value_type& obj) { - return insert(obj).first; - } - - // Deletion routines - // THESE ARE NON-STANDARD! I make you specify an "impossible" key - // value to identify deleted buckets. You can change the key as - // time goes on, or get rid of it entirely to be insert-only. - void set_deleted_key(const key_type& key) { - rep.set_deleted_key(key); - } - void clear_deleted_key() { rep.clear_deleted_key(); } - key_type deleted_key() const { return rep.deleted_key(); } - - // These are standard - size_type erase(const key_type& key) { return rep.erase(key); } - void erase(iterator it) { rep.erase(it); } - void erase(iterator f, iterator l) { rep.erase(f, l); } - - - // Comparison - bool operator==(const sparse_hash_map& hs) const { return rep == hs.rep; } - bool operator!=(const sparse_hash_map& hs) const { return rep != hs.rep; } - - - // I/O -- this is an add-on for writing metainformation to disk - // - // For maximum flexibility, this does not assume a particular - // file type (though it will probably be a FILE *). We just pass - // the fp through to rep. - - // If your keys and values are simple enough, you can pass this - // serializer to serialize()/unserialize(). "Simple enough" means - // value_type is a POD type that contains no pointers. Note, - // however, we don't try to normalize endianness. - typedef typename ht::NopointerSerializer NopointerSerializer; - - // serializer: a class providing operator()(OUTPUT*, const value_type&) - // (writing value_type to OUTPUT). You can specify a - // NopointerSerializer object if appropriate (see above). - // fp: either a FILE*, OR an ostream*/subclass_of_ostream*, OR a - // pointer to a class providing size_t Write(const void*, size_t), - // which writes a buffer into a stream (which fp presumably - // owns) and returns the number of bytes successfully written. - // Note basic_ostream is not currently supported. - template - bool serialize(ValueSerializer serializer, OUTPUT* fp) { - return rep.serialize(serializer, fp); - } - - // serializer: a functor providing operator()(INPUT*, value_type*) - // (reading from INPUT and into value_type). You can specify a - // NopointerSerializer object if appropriate (see above). - // fp: either a FILE*, OR an istream*/subclass_of_istream*, OR a - // pointer to a class providing size_t Read(void*, size_t), - // which reads into a buffer from a stream (which fp presumably - // owns) and returns the number of bytes successfully read. - // Note basic_istream is not currently supported. - // NOTE: Since value_type is std::pair, ValueSerializer - // may need to do a const cast in order to fill in the key. - // NOTE: if Key or T are not POD types, the serializer MUST use - // placement-new to initialize their values, rather than a normal - // equals-assignment or similar. (The value_type* passed into the - // serializer points to garbage memory.) 
- template - bool unserialize(ValueSerializer serializer, INPUT* fp) { - return rep.unserialize(serializer, fp); - } - - // The four methods below are DEPRECATED. - // Use serialize() and unserialize() for new code. - template - bool write_metadata(OUTPUT *fp) { return rep.write_metadata(fp); } - - template - bool read_metadata(INPUT *fp) { return rep.read_metadata(fp); } - - template - bool write_nopointer_data(OUTPUT *fp) { return rep.write_nopointer_data(fp); } - - template - bool read_nopointer_data(INPUT *fp) { return rep.read_nopointer_data(fp); } -}; - -// We need a global swap as well -template -inline void swap(sparse_hash_map& hm1, - sparse_hash_map& hm2) { - hm1.swap(hm2); -} - -_END_GOOGLE_NAMESPACE_ - -#endif /* _SPARSE_HASH_MAP_H_ */ diff --git a/contrib/libsparsehash/sparsehash/sparse_hash_set b/contrib/libsparsehash/sparsehash/sparse_hash_set deleted file mode 100644 index ae4a97a62ca..00000000000 --- a/contrib/libsparsehash/sparsehash/sparse_hash_set +++ /dev/null @@ -1,338 +0,0 @@ -// Copyright (c) 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// --- -// -// This is just a very thin wrapper over sparsehashtable.h, just -// like sgi stl's stl_hash_set is a very thin wrapper over -// stl_hashtable. The major thing we define is operator[], because -// we have a concept of a data_type which stl_hashtable doesn't -// (it only has a key and a value). -// -// This is more different from sparse_hash_map than you might think, -// because all iterators for sets are const (you obviously can't -// change the key, and for sets there is no value). -// -// We adhere mostly to the STL semantics for hash-map. One important -// exception is that insert() may invalidate iterators entirely -- STL -// semantics are that insert() may reorder iterators, but they all -// still refer to something valid in the hashtable. Not so for us. -// Likewise, insert() may invalidate pointers into the hashtable. -// (Whether insert invalidates iterators and pointers depends on -// whether it results in a hashtable resize). 
On the plus side, -// delete() doesn't invalidate iterators or pointers at all, or even -// change the ordering of elements. -// -// Here are a few "power user" tips: -// -// 1) set_deleted_key(): -// Unlike STL's hash_map, if you want to use erase() you -// *must* call set_deleted_key() after construction. -// -// 2) resize(0): -// When an item is deleted, its memory isn't freed right -// away. This allows you to iterate over a hashtable, -// and call erase(), without invalidating the iterator. -// To force the memory to be freed, call resize(0). -// For tr1 compatibility, this can also be called as rehash(0). -// -// 3) min_load_factor(0.0) -// Setting the minimum load factor to 0.0 guarantees that -// the hash table will never shrink. -// -// Roughly speaking: -// (1) dense_hash_set: fastest, uses the most memory unless entries are small -// (2) sparse_hash_set: slowest, uses the least memory -// (3) hash_set / unordered_set (STL): in the middle -// -// Typically I use sparse_hash_set when I care about space and/or when -// I need to save the hashtable on disk. I use hash_set otherwise. I -// don't personally use dense_hash_set ever; some people use it for -// small sets with lots of lookups. -// -// - dense_hash_set has, typically, about 78% memory overhead (if your -// data takes up X bytes, the hash_set uses .78X more bytes in overhead). -// - sparse_hash_set has about 4 bits overhead per entry. -// - sparse_hash_set can be 3-7 times slower than the others for lookup and, -// especially, inserts. See time_hash_map.cc for details. -// -// See /usr/(local/)?doc/sparsehash-*/sparse_hash_set.html -// for information about how to use this class. - -#ifndef _SPARSE_HASH_SET_H_ -#define _SPARSE_HASH_SET_H_ - -#include -#include // needed by stl_alloc -#include // for equal_to<> -#include // for alloc (which we don't use) -#include // for pair<> -#include -#include // IWYU pragma: export -#include HASH_FUN_H // for hash<> - -_START_GOOGLE_NAMESPACE_ - -template , // defined in sparseconfig.h - class EqualKey = std::equal_to, - class Alloc = libc_allocator_with_realloc > -class sparse_hash_set { - private: - // Apparently identity is not stl-standard, so we define our own - struct Identity { - typedef const Value& result_type; - const Value& operator()(const Value& v) const { return v; } - }; - struct SetKey { - void operator()(Value* value, const Value& new_key) const { - *value = new_key; - } - }; - - typedef sparse_hashtable ht; - ht rep; - - public: - typedef typename ht::key_type key_type; - typedef typename ht::value_type value_type; - typedef typename ht::hasher hasher; - typedef typename ht::key_equal key_equal; - typedef Alloc allocator_type; - - typedef typename ht::size_type size_type; - typedef typename ht::difference_type difference_type; - typedef typename ht::const_pointer pointer; - typedef typename ht::const_pointer const_pointer; - typedef typename ht::const_reference reference; - typedef typename ht::const_reference const_reference; - - typedef typename ht::const_iterator iterator; - typedef typename ht::const_iterator const_iterator; - typedef typename ht::const_local_iterator local_iterator; - typedef typename ht::const_local_iterator const_local_iterator; - - - // Iterator functions -- recall all iterators are const - iterator begin() const { return rep.begin(); } - iterator end() const { return rep.end(); } - - // These come from tr1's unordered_set. For us, a bucket has 0 or 1 elements. 
- local_iterator begin(size_type i) const { return rep.begin(i); } - local_iterator end(size_type i) const { return rep.end(i); } - - - // Accessor functions - allocator_type get_allocator() const { return rep.get_allocator(); } - hasher hash_funct() const { return rep.hash_funct(); } - hasher hash_function() const { return hash_funct(); } // tr1 name - key_equal key_eq() const { return rep.key_eq(); } - - - // Constructors - explicit sparse_hash_set(size_type expected_max_items_in_table = 0, - const hasher& hf = hasher(), - const key_equal& eql = key_equal(), - const allocator_type& alloc = allocator_type()) - : rep(expected_max_items_in_table, hf, eql, Identity(), SetKey(), alloc) { - } - - template - sparse_hash_set(InputIterator f, InputIterator l, - size_type expected_max_items_in_table = 0, - const hasher& hf = hasher(), - const key_equal& eql = key_equal(), - const allocator_type& alloc = allocator_type()) - : rep(expected_max_items_in_table, hf, eql, Identity(), SetKey(), alloc) { - rep.insert(f, l); - } - // We use the default copy constructor - // We use the default operator=() - // We use the default destructor - - void clear() { rep.clear(); } - void swap(sparse_hash_set& hs) { rep.swap(hs.rep); } - - - // Functions concerning size - size_type size() const { return rep.size(); } - size_type max_size() const { return rep.max_size(); } - bool empty() const { return rep.empty(); } - size_type bucket_count() const { return rep.bucket_count(); } - size_type max_bucket_count() const { return rep.max_bucket_count(); } - - // These are tr1 methods. bucket() is the bucket the key is or would be in. - size_type bucket_size(size_type i) const { return rep.bucket_size(i); } - size_type bucket(const key_type& key) const { return rep.bucket(key); } - float load_factor() const { - return size() * 1.0f / bucket_count(); - } - float max_load_factor() const { - float shrink, grow; - rep.get_resizing_parameters(&shrink, &grow); - return grow; - } - void max_load_factor(float new_grow) { - float shrink, grow; - rep.get_resizing_parameters(&shrink, &grow); - rep.set_resizing_parameters(shrink, new_grow); - } - // These aren't tr1 methods but perhaps ought to be. - float min_load_factor() const { - float shrink, grow; - rep.get_resizing_parameters(&shrink, &grow); - return shrink; - } - void min_load_factor(float new_shrink) { - float shrink, grow; - rep.get_resizing_parameters(&shrink, &grow); - rep.set_resizing_parameters(new_shrink, grow); - } - // Deprecated; use min_load_factor() or max_load_factor() instead. - void set_resizing_parameters(float shrink, float grow) { - rep.set_resizing_parameters(shrink, grow); - } - - void resize(size_type hint) { rep.resize(hint); } - void rehash(size_type hint) { resize(hint); } // the tr1 name - - // Lookup routines - iterator find(const key_type& key) const { return rep.find(key); } - - size_type count(const key_type& key) const { return rep.count(key); } - - std::pair equal_range(const key_type& key) const { - return rep.equal_range(key); - } - - - // Insertion routines - std::pair insert(const value_type& obj) { - std::pair p = rep.insert(obj); - return std::pair(p.first, p.second); // const to non-const - } - template void insert(InputIterator f, InputIterator l) { - rep.insert(f, l); - } - void insert(const_iterator f, const_iterator l) { - rep.insert(f, l); - } - // Required for std::insert_iterator; the passed-in iterator is ignored. 
- iterator insert(iterator, const value_type& obj) { - return insert(obj).first; - } - - // Deletion routines - // THESE ARE NON-STANDARD! I make you specify an "impossible" key - // value to identify deleted buckets. You can change the key as - // time goes on, or get rid of it entirely to be insert-only. - void set_deleted_key(const key_type& key) { rep.set_deleted_key(key); } - void clear_deleted_key() { rep.clear_deleted_key(); } - key_type deleted_key() const { return rep.deleted_key(); } - - // These are standard - size_type erase(const key_type& key) { return rep.erase(key); } - void erase(iterator it) { rep.erase(it); } - void erase(iterator f, iterator l) { rep.erase(f, l); } - - - // Comparison - bool operator==(const sparse_hash_set& hs) const { return rep == hs.rep; } - bool operator!=(const sparse_hash_set& hs) const { return rep != hs.rep; } - - - // I/O -- this is an add-on for writing metainformation to disk - // - // For maximum flexibility, this does not assume a particular - // file type (though it will probably be a FILE *). We just pass - // the fp through to rep. - - // If your keys and values are simple enough, you can pass this - // serializer to serialize()/unserialize(). "Simple enough" means - // value_type is a POD type that contains no pointers. Note, - // however, we don't try to normalize endianness. - typedef typename ht::NopointerSerializer NopointerSerializer; - - // serializer: a class providing operator()(OUTPUT*, const value_type&) - // (writing value_type to OUTPUT). You can specify a - // NopointerSerializer object if appropriate (see above). - // fp: either a FILE*, OR an ostream*/subclass_of_ostream*, OR a - // pointer to a class providing size_t Write(const void*, size_t), - // which writes a buffer into a stream (which fp presumably - // owns) and returns the number of bytes successfully written. - // Note basic_ostream is not currently supported. - template - bool serialize(ValueSerializer serializer, OUTPUT* fp) { - return rep.serialize(serializer, fp); - } - - // serializer: a functor providing operator()(INPUT*, value_type*) - // (reading from INPUT and into value_type). You can specify a - // NopointerSerializer object if appropriate (see above). - // fp: either a FILE*, OR an istream*/subclass_of_istream*, OR a - // pointer to a class providing size_t Read(void*, size_t), - // which reads into a buffer from a stream (which fp presumably - // owns) and returns the number of bytes successfully read. - // Note basic_istream is not currently supported. - // NOTE: Since value_type is const Key, ValueSerializer - // may need to do a const cast in order to fill in the key. - // NOTE: if Key is not a POD type, the serializer MUST use - // placement-new to initialize its value, rather than a normal - // equals-assignment or similar. (The value_type* passed into - // the serializer points to garbage memory.) - template - bool unserialize(ValueSerializer serializer, INPUT* fp) { - return rep.unserialize(serializer, fp); - } - - // The four methods below are DEPRECATED. - // Use serialize() and unserialize() for new code. 
- template - bool write_metadata(OUTPUT *fp) { return rep.write_metadata(fp); } - - template - bool read_metadata(INPUT *fp) { return rep.read_metadata(fp); } - - template - bool write_nopointer_data(OUTPUT *fp) { return rep.write_nopointer_data(fp); } - - template - bool read_nopointer_data(INPUT *fp) { return rep.read_nopointer_data(fp); } -}; - -template -inline void swap(sparse_hash_set& hs1, - sparse_hash_set& hs2) { - hs1.swap(hs2); -} - -_END_GOOGLE_NAMESPACE_ - -#endif /* _SPARSE_HASH_SET_H_ */ diff --git a/contrib/libsparsehash/sparsehash/sparsetable b/contrib/libsparsehash/sparsehash/sparsetable deleted file mode 100644 index d162623a5f5..00000000000 --- a/contrib/libsparsehash/sparsehash/sparsetable +++ /dev/null @@ -1,1820 +0,0 @@ -// Copyright (c) 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// --- -// -// -// A sparsetable is a random container that implements a sparse array, -// that is, an array that uses very little memory to store unassigned -// indices (in this case, between 1-2 bits per unassigned index). For -// instance, if you allocate an array of size 5 and assign a[2] = , then a[2] will take up a lot of memory but a[0], a[1], -// a[3], and a[4] will not. Array elements that have a value are -// called "assigned". Array elements that have no value yet, or have -// had their value cleared using erase() or clear(), are called -// "unassigned". -// -// Unassigned values seem to have the default value of T (see below). -// Nevertheless, there is a difference between an unassigned index and -// one explicitly assigned the value of T(). The latter is considered -// assigned. -// -// Access to an array element is constant time, as is insertion and -// deletion. Insertion and deletion may be fairly slow, however: -// because of this container's memory economy, each insert and delete -// causes a memory reallocation. -// -// NOTE: You should not test(), get(), or set() any index that is -// greater than sparsetable.size(). If you need to do that, call -// resize() first. 
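The sparsetable semantics spelled out above (assigned vs. unassigned indices, default values for unassigned slots, and the need to call resize() before touching indices past size()) boil down to a very small interface. A minimal usage sketch, assuming the bundled header is included as <sparsehash/sparsetable> and that GOOGLE_NAMESPACE expands to the usual `google` namespace; the key and value choices are purely illustrative:

    #include <sparsehash/sparsetable>  // assumed install path of the bundled header
    #include <cassert>
    #include <string>

    int main() {
        google::sparsetable<std::string> t;  // size 0: must resize() before use
        t.resize(5);                         // five buckets, all unassigned

        t.set(2, "hello");                   // operator[] is read-only; assign via set()
        assert(t.test(2));                   // index 2 is now "assigned"
        assert(!t.test(0));                  // unassigned slots read back as a default T()
        assert(t.get(0) == std::string());

        assert(t.num_nonempty() == 1);       // only assigned buckets take real storage
        t.erase(2);                          // unassign again (runs the destructor)
        assert(t.num_nonempty() == 0);
        return 0;
    }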
-// -// --- Template parameters -// PARAMETER DESCRIPTION DEFAULT -// T The value of the array: the type of -- -// object that is stored in the array. -// -// GROUP_SIZE How large each "group" in the table 48 -// is (see below). Larger values use -// a little less memory but cause most -// operations to be a little slower -// -// Alloc: Allocator to use to allocate memory. libc_allocator_with_realloc -// -// --- Model of -// Random Access Container -// -// --- Type requirements -// T must be Copy Constructible. It need not be Assignable. -// -// --- Public base classes -// None. -// -// --- Members -// Type members -// -// MEMBER WHERE DEFINED DESCRIPTION -// value_type container The type of object, T, stored in the array -// allocator_type container Allocator to use -// pointer container Pointer to p -// const_pointer container Const pointer to p -// reference container Reference to t -// const_reference container Const reference to t -// size_type container An unsigned integral type -// difference_type container A signed integral type -// iterator [*] container Iterator used to iterate over a sparsetable -// const_iterator container Const iterator used to iterate over a table -// reverse_iterator reversible Iterator used to iterate backwards over -// container a sparsetable -// const_reverse_iterator reversible container Guess -// nonempty_iterator [+] sparsetable Iterates over assigned -// array elements only -// const_nonempty_iterator sparsetable Iterates over assigned -// array elements only -// reverse_nonempty_iterator sparsetable Iterates backwards over -// assigned array elements only -// const_reverse_nonempty_iterator sparsetable Iterates backwards over -// assigned array elements only -// -// [*] All iterators are const in a sparsetable (though nonempty_iterators -// may not be). Use get() and set() to assign values, not iterators. -// -// [+] iterators are random-access iterators. nonempty_iterators are -// bidirectional iterators. 
- -// Iterator members -// MEMBER WHERE DEFINED DESCRIPTION -// -// iterator begin() container An iterator to the beginning of the table -// iterator end() container An iterator to the end of the table -// const_iterator container A const_iterator pointing to the -// begin() const beginning of a sparsetable -// const_iterator container A const_iterator pointing to the -// end() const end of a sparsetable -// -// reverse_iterator reversable Points to beginning of a reversed -// rbegin() container sparsetable -// reverse_iterator reversable Points to end of a reversed table -// rend() container -// const_reverse_iterator reversable Points to beginning of a -// rbegin() const container reversed sparsetable -// const_reverse_iterator reversable Points to end of a reversed table -// rend() const container -// -// nonempty_iterator sparsetable Points to first assigned element -// begin() of a sparsetable -// nonempty_iterator sparsetable Points past last assigned element -// end() of a sparsetable -// const_nonempty_iterator sparsetable Points to first assigned element -// begin() const of a sparsetable -// const_nonempty_iterator sparsetable Points past last assigned element -// end() const of a sparsetable -// -// reverse_nonempty_iterator sparsetable Points to first assigned element -// begin() of a reversed sparsetable -// reverse_nonempty_iterator sparsetable Points past last assigned element -// end() of a reversed sparsetable -// const_reverse_nonempty_iterator sparsetable Points to first assigned -// begin() const elt of a reversed sparsetable -// const_reverse_nonempty_iterator sparsetable Points past last assigned -// end() const elt of a reversed sparsetable -// -// -// Other members -// MEMBER WHERE DEFINED DESCRIPTION -// sparsetable() sparsetable A table of size 0; must resize() -// before using. -// sparsetable(size_type size) sparsetable A table of size size. All -// indices are unassigned. -// sparsetable( -// const sparsetable &tbl) sparsetable Copy constructor -// ~sparsetable() sparsetable The destructor -// sparsetable &operator=( sparsetable The assignment operator -// const sparsetable &tbl) -// -// void resize(size_type size) sparsetable Grow or shrink a table to -// have size indices [*] -// -// void swap(sparsetable &x) sparsetable Swap two sparsetables -// void swap(sparsetable &x, sparsetable Swap two sparsetables -// sparsetable &y) (global, not member, function) -// -// size_type size() const sparsetable Number of "buckets" in the table -// size_type max_size() const sparsetable Max allowed size of a sparsetable -// bool empty() const sparsetable true if size() == 0 -// size_type num_nonempty() const sparsetable Number of assigned "buckets" -// -// const_reference get( sparsetable Value at index i, or default -// size_type i) const value if i is unassigned -// const_reference operator[]( sparsetable Identical to get(i) [+] -// difference_type i) const -// reference set(size_type i, sparsetable Set element at index i to -// const_reference val) be a copy of val -// bool test(size_type i) sparsetable True if element at index i -// const has been assigned to -// bool test(iterator pos) sparsetable True if element pointed to -// const by pos has been assigned to -// void erase(iterator pos) sparsetable Set element pointed to by -// pos to be unassigned [!] 
-// void erase(size_type i) sparsetable Set element i to be unassigned -// void erase(iterator start, sparsetable Erases all elements between -// iterator end) start and end -// void clear() sparsetable Erases all elements in the table -// -// I/O versions exist for both FILE* and for File* (Google2-style files): -// bool write_metadata(FILE *fp) sparsetable Writes a sparsetable to the -// bool write_metadata(File *fp) given file. true if write -// completes successfully -// bool read_metadata(FILE *fp) sparsetable Replaces sparsetable with -// bool read_metadata(File *fp) version read from fp. true -// if read completes sucessfully -// bool write_nopointer_data(FILE *fp) Read/write the data stored in -// bool read_nopointer_data(FILE*fp) the table, if it's simple -// -// bool operator==( forward Tests two tables for equality. -// const sparsetable &t1, container This is a global function, -// const sparsetable &t2) not a member function. -// bool operator<( forward Lexicographical comparison. -// const sparsetable &t1, container This is a global function, -// const sparsetable &t2) not a member function. -// -// [*] If you shrink a sparsetable using resize(), assigned elements -// past the end of the table are removed using erase(). If you grow -// a sparsetable, new unassigned indices are created. -// -// [+] Note that operator[] returns a const reference. You must use -// set() to change the value of a table element. -// -// [!] Unassignment also calls the destructor. -// -// Iterators are invalidated whenever an item is inserted or -// deleted (ie set() or erase() is used) or when the size of -// the table changes (ie resize() or clear() is used). -// -// See doc/sparsetable.html for more information about how to use this class. - -// Note: this uses STL style for naming, rather than Google naming. -// That's because this is an STL-y container - -#ifndef UTIL_GTL_SPARSETABLE_H_ -#define UTIL_GTL_SPARSETABLE_H_ - -#include -#include // for malloc/free -#include // to read/write tables -#include // for memcpy -#ifdef HAVE_STDINT_H -#include // the normal place uint16_t is defined -#endif -#ifdef HAVE_SYS_TYPES_H -#include // the normal place u_int16_t is defined -#endif -#ifdef HAVE_INTTYPES_H -#include // a third place for uint16_t or u_int16_t -#endif -#include // for bounds checking -#include // to define reverse_iterator for me -#include // equal, lexicographical_compare, swap,... -#include // uninitialized_copy, uninitialized_fill -#include // a sparsetable is a vector of groups -#include -#include -#include - -// A lot of work to get a type that's guaranteed to be 16 bits... -#ifndef HAVE_U_INT16_T -# if defined HAVE_UINT16_T - typedef uint16_t u_int16_t; // true on solaris, possibly other C99 libc's -# elif defined HAVE___UINT16 - typedef __int16 int16_t; // true on vc++7 - typedef unsigned __int16 u_int16_t; -# else - // Cannot find a 16-bit integer type. Hoping for the best with "short"... - typedef short int int16_t; - typedef unsigned short int u_int16_t; -# endif -#endif - -_START_GOOGLE_NAMESPACE_ - -namespace base { // just to make google->opensource transition easier -using GOOGLE_NAMESPACE::true_type; -using GOOGLE_NAMESPACE::false_type; -using GOOGLE_NAMESPACE::integral_constant; -using GOOGLE_NAMESPACE::has_trivial_copy; -using GOOGLE_NAMESPACE::has_trivial_destructor; -using GOOGLE_NAMESPACE::is_same; -} - - -// The smaller this is, the faster lookup is (because the group bitmap is -// smaller) and the faster insert is, because there's less to move. 
-// On the other hand, there are more groups. Since group::size_type is -// a short, this number should be of the form 32*x + 16 to avoid waste. -static const u_int16_t DEFAULT_SPARSEGROUP_SIZE = 48; // fits in 1.5 words - - -// Our iterator as simple as iterators can be: basically it's just -// the index into our table. Dereference, the only complicated -// thing, we punt to the table class. This just goes to show how -// much machinery STL requires to do even the most trivial tasks. -// -// A NOTE ON ASSIGNING: -// A sparse table does not actually allocate memory for entries -// that are not filled. Because of this, it becomes complicated -// to have a non-const iterator: we don't know, if the iterator points -// to a not-filled bucket, whether you plan to fill it with something -// or whether you plan to read its value (in which case you'll get -// the default bucket value). Therefore, while we can define const -// operations in a pretty 'normal' way, for non-const operations, we -// define something that returns a helper object with operator= and -// operator& that allocate a bucket lazily. We use this for table[] -// and also for regular table iterators. - -template -class table_element_adaptor { - public: - typedef typename tabletype::value_type value_type; - typedef typename tabletype::size_type size_type; - typedef typename tabletype::reference reference; - typedef typename tabletype::pointer pointer; - - table_element_adaptor(tabletype *tbl, size_type p) - : table(tbl), pos(p) { } - table_element_adaptor& operator= (const value_type &val) { - table->set(pos, val); - return *this; - } - operator value_type() { return table->get(pos); } // we look like a value - pointer operator& () { return &table->mutating_get(pos); } - - private: - tabletype* table; - size_type pos; -}; - -// Our iterator as simple as iterators can be: basically it's just -// the index into our table. Dereference, the only complicated -// thing, we punt to the table class. This just goes to show how -// much machinery STL requires to do even the most trivial tasks. -// -// By templatizing over tabletype, we have one iterator type which -// we can use for both sparsetables and sparsebins. In fact it -// works on any class that allows size() and operator[] (eg vector), -// as long as it does the standard STL typedefs too (eg value_type). - -template -class table_iterator { - public: - typedef table_iterator iterator; - - typedef std::random_access_iterator_tag iterator_category; - typedef typename tabletype::value_type value_type; - typedef typename tabletype::difference_type difference_type; - typedef typename tabletype::size_type size_type; - typedef table_element_adaptor reference; - typedef table_element_adaptor* pointer; - - // The "real" constructor - table_iterator(tabletype *tbl, size_type p) - : table(tbl), pos(p) { } - // The default constructor, used when I define vars of type table::iterator - table_iterator() : table(NULL), pos(0) { } - // The copy constructor, for when I say table::iterator foo = tbl.begin() - // The default destructor is fine; we don't define one - // The default operator= is fine; we don't define one - - // The main thing our iterator does is dereference. If the table entry - // we point to is empty, we return the default value type. - // This is the big different function from the const iterator. 
- reference operator*() { - return table_element_adaptor(table, pos); - } - pointer operator->() { return &(operator*()); } - - // Helper function to assert things are ok; eg pos is still in range - void check() const { - assert(table); - assert(pos <= table->size()); - } - - // Arithmetic: we just do arithmetic on pos. We don't even need to - // do bounds checking, since STL doesn't consider that its job. :-) - iterator& operator+=(size_type t) { pos += t; check(); return *this; } - iterator& operator-=(size_type t) { pos -= t; check(); return *this; } - iterator& operator++() { ++pos; check(); return *this; } - iterator& operator--() { --pos; check(); return *this; } - iterator operator++(int) { iterator tmp(*this); // for x++ - ++pos; check(); return tmp; } - iterator operator--(int) { iterator tmp(*this); // for x-- - --pos; check(); return tmp; } - iterator operator+(difference_type i) const { iterator tmp(*this); - tmp += i; return tmp; } - iterator operator-(difference_type i) const { iterator tmp(*this); - tmp -= i; return tmp; } - difference_type operator-(iterator it) const { // for "x = it2 - it" - assert(table == it.table); - return pos - it.pos; - } - reference operator[](difference_type n) const { - return *(*this + n); // simple though not totally efficient - } - - // Comparisons. - bool operator==(const iterator& it) const { - return table == it.table && pos == it.pos; - } - bool operator<(const iterator& it) const { - assert(table == it.table); // life is bad bad bad otherwise - return pos < it.pos; - } - bool operator!=(const iterator& it) const { return !(*this == it); } - bool operator<=(const iterator& it) const { return !(it < *this); } - bool operator>(const iterator& it) const { return it < *this; } - bool operator>=(const iterator& it) const { return !(*this < it); } - - // Here's the info we actually need to be an iterator - tabletype *table; // so we can dereference and bounds-check - size_type pos; // index into the table -}; - -// support for "3 + iterator" has to be defined outside the class, alas -template -table_iterator operator+(typename table_iterator::difference_type i, - table_iterator it) { - return it + i; // so people can say it2 = 3 + it -} - -template -class const_table_iterator { - public: - typedef table_iterator iterator; - typedef const_table_iterator const_iterator; - - typedef std::random_access_iterator_tag iterator_category; - typedef typename tabletype::value_type value_type; - typedef typename tabletype::difference_type difference_type; - typedef typename tabletype::size_type size_type; - typedef typename tabletype::const_reference reference; // we're const-only - typedef typename tabletype::const_pointer pointer; - - // The "real" constructor - const_table_iterator(const tabletype *tbl, size_type p) - : table(tbl), pos(p) { } - // The default constructor, used when I define vars of type table::iterator - const_table_iterator() : table(NULL), pos(0) { } - // The copy constructor, for when I say table::iterator foo = tbl.begin() - // Also converts normal iterators to const iterators - const_table_iterator(const iterator &from) - : table(from.table), pos(from.pos) { } - // The default destructor is fine; we don't define one - // The default operator= is fine; we don't define one - - // The main thing our iterator does is dereference. If the table entry - // we point to is empty, we return the default value type. 
- reference operator*() const { return (*table)[pos]; } - pointer operator->() const { return &(operator*()); } - - // Helper function to assert things are ok; eg pos is still in range - void check() const { - assert(table); - assert(pos <= table->size()); - } - - // Arithmetic: we just do arithmetic on pos. We don't even need to - // do bounds checking, since STL doesn't consider that its job. :-) - const_iterator& operator+=(size_type t) { pos += t; check(); return *this; } - const_iterator& operator-=(size_type t) { pos -= t; check(); return *this; } - const_iterator& operator++() { ++pos; check(); return *this; } - const_iterator& operator--() { --pos; check(); return *this; } - const_iterator operator++(int) { const_iterator tmp(*this); // for x++ - ++pos; check(); return tmp; } - const_iterator operator--(int) { const_iterator tmp(*this); // for x-- - --pos; check(); return tmp; } - const_iterator operator+(difference_type i) const { const_iterator tmp(*this); - tmp += i; return tmp; } - const_iterator operator-(difference_type i) const { const_iterator tmp(*this); - tmp -= i; return tmp; } - difference_type operator-(const_iterator it) const { // for "x = it2 - it" - assert(table == it.table); - return pos - it.pos; - } - reference operator[](difference_type n) const { - return *(*this + n); // simple though not totally efficient - } - - // Comparisons. - bool operator==(const const_iterator& it) const { - return table == it.table && pos == it.pos; - } - bool operator<(const const_iterator& it) const { - assert(table == it.table); // life is bad bad bad otherwise - return pos < it.pos; - } - bool operator!=(const const_iterator& it) const { return !(*this == it); } - bool operator<=(const const_iterator& it) const { return !(it < *this); } - bool operator>(const const_iterator& it) const { return it < *this; } - bool operator>=(const const_iterator& it) const { return !(*this < it); } - - // Here's the info we actually need to be an iterator - const tabletype *table; // so we can dereference and bounds-check - size_type pos; // index into the table -}; - -// support for "3 + iterator" has to be defined outside the class, alas -template -const_table_iterator operator+(typename - const_table_iterator::difference_type i, - const_table_iterator it) { - return it + i; // so people can say it2 = 3 + it -} - - -// --------------------------------------------------------------------------- - - -/* -// This is a 2-D iterator. You specify a begin and end over a list -// of *containers*. We iterate over each container by iterating over -// it. It's actually simple: -// VECTOR.begin() VECTOR[0].begin() --------> VECTOR[0].end() ---, -// | ________________________________________________/ -// | \_> VECTOR[1].begin() --------> VECTOR[1].end() -, -// | ___________________________________________________/ -// v \_> ...... -// VECTOR.end() -// -// It's impossible to do random access on one of these things in constant -// time, so it's just a bidirectional iterator. -// -// Unfortunately, because we need to use this for a non-empty iterator, -// we use nonempty_begin() and nonempty_end() instead of begin() and end() -// (though only going across, not down). 
-*/ - -#define TWOD_BEGIN_ nonempty_begin -#define TWOD_END_ nonempty_end -#define TWOD_ITER_ nonempty_iterator -#define TWOD_CONST_ITER_ const_nonempty_iterator - -template -class two_d_iterator { - public: - typedef two_d_iterator iterator; - - typedef std::bidirectional_iterator_tag iterator_category; - // apparently some versions of VC++ have trouble with two ::'s in a typename - typedef typename containertype::value_type _tmp_vt; - typedef typename _tmp_vt::value_type value_type; - typedef typename _tmp_vt::difference_type difference_type; - typedef typename _tmp_vt::reference reference; - typedef typename _tmp_vt::pointer pointer; - - // The "real" constructor. begin and end specify how many rows we have - // (in the diagram above); we always iterate over each row completely. - two_d_iterator(typename containertype::iterator begin, - typename containertype::iterator end, - typename containertype::iterator curr) - : row_begin(begin), row_end(end), row_current(curr), col_current() { - if ( row_current != row_end ) { - col_current = row_current->TWOD_BEGIN_(); - advance_past_end(); // in case cur->begin() == cur->end() - } - } - // If you want to start at an arbitrary place, you can, I guess - two_d_iterator(typename containertype::iterator begin, - typename containertype::iterator end, - typename containertype::iterator curr, - typename containertype::value_type::TWOD_ITER_ col) - : row_begin(begin), row_end(end), row_current(curr), col_current(col) { - advance_past_end(); // in case cur->begin() == cur->end() - } - // The default constructor, used when I define vars of type table::iterator - two_d_iterator() : row_begin(), row_end(), row_current(), col_current() { } - // The default destructor is fine; we don't define one - // The default operator= is fine; we don't define one - - // Happy dereferencer - reference operator*() const { return *col_current; } - pointer operator->() const { return &(operator*()); } - - // Arithmetic: we just do arithmetic on pos. We don't even need to - // do bounds checking, since STL doesn't consider that its job. :-) - // NOTE: this is not amortized constant time! What do we do about it? - void advance_past_end() { // used when col_current points to end() - while ( col_current == row_current->TWOD_END_() ) { // end of current row - ++row_current; // go to beginning of next - if ( row_current != row_end ) // col is irrelevant at end - col_current = row_current->TWOD_BEGIN_(); - else - break; // don't go past row_end - } - } - - iterator& operator++() { - assert(row_current != row_end); // how to ++ from there? - ++col_current; - advance_past_end(); // in case col_current is at end() - return *this; - } - iterator& operator--() { - while ( row_current == row_end || - col_current == row_current->TWOD_BEGIN_() ) { - assert(row_current != row_begin); - --row_current; - col_current = row_current->TWOD_END_(); // this is 1 too far - } - --col_current; - return *this; - } - iterator operator++(int) { iterator tmp(*this); ++*this; return tmp; } - iterator operator--(int) { iterator tmp(*this); --*this; return tmp; } - - - // Comparisons. 
- bool operator==(const iterator& it) const { - return ( row_begin == it.row_begin && - row_end == it.row_end && - row_current == it.row_current && - (row_current == row_end || col_current == it.col_current) ); - } - bool operator!=(const iterator& it) const { return !(*this == it); } - - - // Here's the info we actually need to be an iterator - // These need to be public so we convert from iterator to const_iterator - typename containertype::iterator row_begin, row_end, row_current; - typename containertype::value_type::TWOD_ITER_ col_current; -}; - -// The same thing again, but this time const. :-( -template -class const_two_d_iterator { - public: - typedef const_two_d_iterator iterator; - - typedef std::bidirectional_iterator_tag iterator_category; - // apparently some versions of VC++ have trouble with two ::'s in a typename - typedef typename containertype::value_type _tmp_vt; - typedef typename _tmp_vt::value_type value_type; - typedef typename _tmp_vt::difference_type difference_type; - typedef typename _tmp_vt::const_reference reference; - typedef typename _tmp_vt::const_pointer pointer; - - const_two_d_iterator(typename containertype::const_iterator begin, - typename containertype::const_iterator end, - typename containertype::const_iterator curr) - : row_begin(begin), row_end(end), row_current(curr), col_current() { - if ( curr != end ) { - col_current = curr->TWOD_BEGIN_(); - advance_past_end(); // in case cur->begin() == cur->end() - } - } - const_two_d_iterator(typename containertype::const_iterator begin, - typename containertype::const_iterator end, - typename containertype::const_iterator curr, - typename containertype::value_type::TWOD_CONST_ITER_ col) - : row_begin(begin), row_end(end), row_current(curr), col_current(col) { - advance_past_end(); // in case cur->begin() == cur->end() - } - const_two_d_iterator() - : row_begin(), row_end(), row_current(), col_current() { - } - // Need this explicitly so we can convert normal iterators to const iterators - const_two_d_iterator(const two_d_iterator& it) : - row_begin(it.row_begin), row_end(it.row_end), row_current(it.row_current), - col_current(it.col_current) { } - - typename containertype::const_iterator row_begin, row_end, row_current; - typename containertype::value_type::TWOD_CONST_ITER_ col_current; - - - // EVERYTHING FROM HERE DOWN IS THE SAME AS THE NON-CONST ITERATOR - reference operator*() const { return *col_current; } - pointer operator->() const { return &(operator*()); } - - void advance_past_end() { // used when col_current points to end() - while ( col_current == row_current->TWOD_END_() ) { // end of current row - ++row_current; // go to beginning of next - if ( row_current != row_end ) // col is irrelevant at end - col_current = row_current->TWOD_BEGIN_(); - else - break; // don't go past row_end - } - } - iterator& operator++() { - assert(row_current != row_end); // how to ++ from there? 
- ++col_current; - advance_past_end(); // in case col_current is at end() - return *this; - } - iterator& operator--() { - while ( row_current == row_end || - col_current == row_current->TWOD_BEGIN_() ) { - assert(row_current != row_begin); - --row_current; - col_current = row_current->TWOD_END_(); // this is 1 too far - } - --col_current; - return *this; - } - iterator operator++(int) { iterator tmp(*this); ++*this; return tmp; } - iterator operator--(int) { iterator tmp(*this); --*this; return tmp; } - - bool operator==(const iterator& it) const { - return ( row_begin == it.row_begin && - row_end == it.row_end && - row_current == it.row_current && - (row_current == row_end || col_current == it.col_current) ); - } - bool operator!=(const iterator& it) const { return !(*this == it); } -}; - -// We provide yet another version, to be as frugal with memory as -// possible. This one frees each block of memory as it finishes -// iterating over it. By the end, the entire table is freed. -// For understandable reasons, you can only iterate over it once, -// which is why it's an input iterator -template -class destructive_two_d_iterator { - public: - typedef destructive_two_d_iterator iterator; - - typedef std::input_iterator_tag iterator_category; - // apparently some versions of VC++ have trouble with two ::'s in a typename - typedef typename containertype::value_type _tmp_vt; - typedef typename _tmp_vt::value_type value_type; - typedef typename _tmp_vt::difference_type difference_type; - typedef typename _tmp_vt::reference reference; - typedef typename _tmp_vt::pointer pointer; - - destructive_two_d_iterator(typename containertype::iterator begin, - typename containertype::iterator end, - typename containertype::iterator curr) - : row_begin(begin), row_end(end), row_current(curr), col_current() { - if ( curr != end ) { - col_current = curr->TWOD_BEGIN_(); - advance_past_end(); // in case cur->begin() == cur->end() - } - } - destructive_two_d_iterator(typename containertype::iterator begin, - typename containertype::iterator end, - typename containertype::iterator curr, - typename containertype::value_type::TWOD_ITER_ col) - : row_begin(begin), row_end(end), row_current(curr), col_current(col) { - advance_past_end(); // in case cur->begin() == cur->end() - } - destructive_two_d_iterator() - : row_begin(), row_end(), row_current(), col_current() { - } - - typename containertype::iterator row_begin, row_end, row_current; - typename containertype::value_type::TWOD_ITER_ col_current; - - // This is the part that destroys - void advance_past_end() { // used when col_current points to end() - while ( col_current == row_current->TWOD_END_() ) { // end of current row - row_current->clear(); // the destructive part - // It would be nice if we could decrement sparsetable->num_buckets here - ++row_current; // go to beginning of next - if ( row_current != row_end ) // col is irrelevant at end - col_current = row_current->TWOD_BEGIN_(); - else - break; // don't go past row_end - } - } - - // EVERYTHING FROM HERE DOWN IS THE SAME AS THE REGULAR ITERATOR - reference operator*() const { return *col_current; } - pointer operator->() const { return &(operator*()); } - - iterator& operator++() { - assert(row_current != row_end); // how to ++ from there? 
- ++col_current; - advance_past_end(); // in case col_current is at end() - return *this; - } - iterator operator++(int) { iterator tmp(*this); ++*this; return tmp; } - - bool operator==(const iterator& it) const { - return ( row_begin == it.row_begin && - row_end == it.row_end && - row_current == it.row_current && - (row_current == row_end || col_current == it.col_current) ); - } - bool operator!=(const iterator& it) const { return !(*this == it); } -}; - -#undef TWOD_BEGIN_ -#undef TWOD_END_ -#undef TWOD_ITER_ -#undef TWOD_CONST_ITER_ - - - - -// SPARSE-TABLE -// ------------ -// The idea is that a table with (logically) t buckets is divided -// into t/M *groups* of M buckets each. (M is a constant set in -// GROUP_SIZE for efficiency.) Each group is stored sparsely. -// Thus, inserting into the table causes some array to grow, which is -// slow but still constant time. Lookup involves doing a -// logical-position-to-sparse-position lookup, which is also slow but -// constant time. The larger M is, the slower these operations are -// but the less overhead (slightly). -// -// To store the sparse array, we store a bitmap B, where B[i] = 1 iff -// bucket i is non-empty. Then to look up bucket i we really look up -// array[# of 1s before i in B]. This is constant time for fixed M. -// -// Terminology: the position of an item in the overall table (from -// 1 .. t) is called its "location." The logical position in a group -// (from 1 .. M ) is called its "position." The actual location in -// the array (from 1 .. # of non-empty buckets in the group) is -// called its "offset." - -template -class sparsegroup { - private: - typedef typename Alloc::template rebind::other value_alloc_type; - - public: - // Basic types - typedef T value_type; - typedef Alloc allocator_type; - typedef typename value_alloc_type::reference reference; - typedef typename value_alloc_type::const_reference const_reference; - typedef typename value_alloc_type::pointer pointer; - typedef typename value_alloc_type::const_pointer const_pointer; - - typedef table_iterator > iterator; - typedef const_table_iterator > - const_iterator; - typedef table_element_adaptor > - element_adaptor; - typedef u_int16_t size_type; // max # of buckets - typedef int16_t difference_type; - typedef std::reverse_iterator const_reverse_iterator; - typedef std::reverse_iterator reverse_iterator; // from iterator.h - - // These are our special iterators, that go over non-empty buckets in a - // group. These aren't const-only because you can change non-empty bcks. 
- typedef pointer nonempty_iterator; - typedef const_pointer const_nonempty_iterator; - typedef std::reverse_iterator reverse_nonempty_iterator; - typedef std::reverse_iterator const_reverse_nonempty_iterator; - - // Iterator functions - iterator begin() { return iterator(this, 0); } - const_iterator begin() const { return const_iterator(this, 0); } - iterator end() { return iterator(this, size()); } - const_iterator end() const { return const_iterator(this, size()); } - reverse_iterator rbegin() { return reverse_iterator(end()); } - const_reverse_iterator rbegin() const { return const_reverse_iterator(end()); } - reverse_iterator rend() { return reverse_iterator(begin()); } - const_reverse_iterator rend() const { return const_reverse_iterator(begin()); } - - // We'll have versions for our special non-empty iterator too - nonempty_iterator nonempty_begin() { return group; } - const_nonempty_iterator nonempty_begin() const { return group; } - nonempty_iterator nonempty_end() { - return group + settings.num_buckets; - } - const_nonempty_iterator nonempty_end() const { - return group + settings.num_buckets; - } - reverse_nonempty_iterator nonempty_rbegin() { - return reverse_nonempty_iterator(nonempty_end()); - } - const_reverse_nonempty_iterator nonempty_rbegin() const { - return const_reverse_nonempty_iterator(nonempty_end()); - } - reverse_nonempty_iterator nonempty_rend() { - return reverse_nonempty_iterator(nonempty_begin()); - } - const_reverse_nonempty_iterator nonempty_rend() const { - return const_reverse_nonempty_iterator(nonempty_begin()); - } - - - // This gives us the "default" value to return for an empty bucket. - // We just use the default constructor on T, the template type - const_reference default_value() const { - static value_type defaultval = value_type(); - return defaultval; - } - - - private: - // We need to do all this bit manipulation, of course. ick - static size_type charbit(size_type i) { return i >> 3; } - static size_type modbit(size_type i) { return 1 << (i&7); } - int bmtest(size_type i) const { return bitmap[charbit(i)] & modbit(i); } - void bmset(size_type i) { bitmap[charbit(i)] |= modbit(i); } - void bmclear(size_type i) { bitmap[charbit(i)] &= ~modbit(i); } - - pointer allocate_group(size_type n) { - pointer retval = settings.allocate(n); - if (retval == NULL) { - // We really should use PRIuS here, but I don't want to have to add - // a whole new configure option, with concomitant macro namespace - // pollution, just to print this (unlikely) error message. So I cast. - fprintf(stderr, "sparsehash FATAL ERROR: failed to allocate %lu groups\n", - static_cast(n)); - exit(1); - } - return retval; - } - - void free_group() { - if (!group) return; - pointer end_it = group + settings.num_buckets; - for (pointer p = group; p != end_it; ++p) - p->~value_type(); - settings.deallocate(group, settings.num_buckets); - group = NULL; - } - - static size_type bits_in_char(unsigned char c) { - // We could make these ints. The tradeoff is size (eg does it overwhelm - // the cache?) vs efficiency in referencing sub-word-sized array elements. 
- static const char bits_in[256] = { - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, - }; - return bits_in[c]; - } - - public: // get_iter() in sparsetable needs it - // We need a small function that tells us how many set bits there are - // in positions 0..i-1 of the bitmap. It uses a big table. - // We make it static so templates don't allocate lots of these tables. - // There are lots of ways to do this calculation (called 'popcount'). - // The 8-bit table lookup is one of the fastest, though this - // implementation suffers from not doing any loop unrolling. See, eg, - // http://www.dalkescientific.com/writings/diary/archive/2008/07/03/hakmem_and_other_popcounts.html - // http://gurmeetsingh.wordpress.com/2008/08/05/fast-bit-counting-routines/ - static size_type pos_to_offset(const unsigned char *bm, size_type pos) { - size_type retval = 0; - - // [Note: condition pos > 8 is an optimization; convince yourself we - // give exactly the same result as if we had pos >= 8 here instead.] - for ( ; pos > 8; pos -= 8 ) // bm[0..pos/8-1] - retval += bits_in_char(*bm++); // chars we want *all* bits in - return retval + bits_in_char(*bm & ((1 << pos)-1)); // char including pos - } - - size_type pos_to_offset(size_type pos) const { // not static but still const - return pos_to_offset(bitmap, pos); - } - - // Returns the (logical) position in the bm[] array, i, such that - // bm[i] is the offset-th set bit in the array. It is the inverse - // of pos_to_offset. get_pos() uses this function to find the index - // of an nonempty_iterator in the table. Bit-twiddling from - // http://hackersdelight.org/basics.pdf - static size_type offset_to_pos(const unsigned char *bm, size_type offset) { - size_type retval = 0; - // This is sizeof(this->bitmap). - const size_type group_size = (GROUP_SIZE-1) / 8 + 1; - for (size_type i = 0; i < group_size; i++) { // forward scan - const size_type pop_count = bits_in_char(*bm); - if (pop_count > offset) { - unsigned char last_bm = *bm; - for (; offset > 0; offset--) { - last_bm &= (last_bm-1); // remove right-most set bit - } - // Clear all bits to the left of the rightmost bit (the &), - // and then clear the rightmost bit but set all bits to the - // right of it (the -1). 
- last_bm = (last_bm & -last_bm) - 1; - retval += bits_in_char(last_bm); - return retval; - } - offset -= pop_count; - retval += 8; - bm++; - } - return retval; - } - - size_type offset_to_pos(size_type offset) const { - return offset_to_pos(bitmap, offset); - } - - - public: - // Constructors -- default and copy -- and destructor - explicit sparsegroup(allocator_type& a) : - group(0), settings(alloc_impl(a)) { - memset(bitmap, 0, sizeof(bitmap)); - } - sparsegroup(const sparsegroup& x) : group(0), settings(x.settings) { - if ( settings.num_buckets ) { - group = allocate_group(x.settings.num_buckets); - std::uninitialized_copy(x.group, x.group + x.settings.num_buckets, group); - } - memcpy(bitmap, x.bitmap, sizeof(bitmap)); - } - ~sparsegroup() { free_group(); } - - // Operator= is just like the copy constructor, I guess - // TODO(austern): Make this exception safe. Handle exceptions in value_type's - // copy constructor. - sparsegroup &operator=(const sparsegroup& x) { - if ( &x == this ) return *this; // x = x - if ( x.settings.num_buckets == 0 ) { - free_group(); - } else { - pointer p = allocate_group(x.settings.num_buckets); - std::uninitialized_copy(x.group, x.group + x.settings.num_buckets, p); - free_group(); - group = p; - } - memcpy(bitmap, x.bitmap, sizeof(bitmap)); - settings.num_buckets = x.settings.num_buckets; - return *this; - } - - // Many STL algorithms use swap instead of copy constructors - void swap(sparsegroup& x) { - std::swap(group, x.group); // defined in - for ( int i = 0; i < sizeof(bitmap) / sizeof(*bitmap); ++i ) - std::swap(bitmap[i], x.bitmap[i]); // swap not defined on arrays - std::swap(settings.num_buckets, x.settings.num_buckets); - // we purposefully don't swap the allocator, which may not be swap-able - } - - // It's always nice to be able to clear a table without deallocating it - void clear() { - free_group(); - memset(bitmap, 0, sizeof(bitmap)); - settings.num_buckets = 0; - } - - // Functions that tell you about size. Alas, these aren't so useful - // because our table is always fixed size. - size_type size() const { return GROUP_SIZE; } - size_type max_size() const { return GROUP_SIZE; } - bool empty() const { return false; } - // We also may want to know how many *used* buckets there are - size_type num_nonempty() const { return settings.num_buckets; } - - - // get()/set() are explicitly const/non-const. You can use [] if - // you want something that can be either (potentially more expensive). - const_reference get(size_type i) const { - if ( bmtest(i) ) // bucket i is occupied - return group[pos_to_offset(bitmap, i)]; - else - return default_value(); // return the default reference - } - - // TODO(csilvers): make protected + friend - // This is used by sparse_hashtable to get an element from the table - // when we know it exists. - const_reference unsafe_get(size_type i) const { - assert(bmtest(i)); - return group[pos_to_offset(bitmap, i)]; - } - - // TODO(csilvers): make protected + friend - reference mutating_get(size_type i) { // fills bucket i before getting - if ( !bmtest(i) ) - set(i, default_value()); - return group[pos_to_offset(bitmap, i)]; - } - - // Syntactic sugar. It's easy to return a const reference. To - // return a non-const reference, we need to use the assigner adaptor. 
- const_reference operator[](size_type i) const { - return get(i); - } - - element_adaptor operator[](size_type i) { - return element_adaptor(this, i); - } - - private: - // Create space at group[offset], assuming value_type has trivial - // copy constructor and destructor, and the allocator_type is - // the default libc_allocator_with_alloc. (Really, we want it to have - // "trivial move", because that's what realloc and memmove both do. - // But there's no way to capture that using type_traits, so we - // pretend that move(x, y) is equivalent to "x.~T(); new(x) T(y);" - // which is pretty much correct, if a bit conservative.) - void set_aux(size_type offset, base::true_type) { - group = settings.realloc_or_die(group, settings.num_buckets+1); - // This is equivalent to memmove(), but faster on my Intel P4, - // at least with gcc4.1 -O2 / glibc 2.3.6. - for (size_type i = settings.num_buckets; i > offset; --i) - memcpy(static_cast(group + i), group + i-1, sizeof(*group)); - } - - // Create space at group[offset], without special assumptions about value_type - // and allocator_type. - void set_aux(size_type offset, base::false_type) { - // This is valid because 0 <= offset <= num_buckets - pointer p = allocate_group(settings.num_buckets + 1); - std::uninitialized_copy(group, group + offset, p); - std::uninitialized_copy(group + offset, group + settings.num_buckets, - p + offset + 1); - free_group(); - group = p; - } - - public: - // This returns a reference to the inserted item (which is a copy of val). - // TODO(austern): Make this exception safe: handle exceptions from - // value_type's copy constructor. - reference set(size_type i, const_reference val) { - size_type offset = pos_to_offset(bitmap, i); // where we'll find (or insert) - if ( bmtest(i) ) { - // Delete the old value, which we're replacing with the new one - group[offset].~value_type(); - } else { - typedef base::integral_constant::value && - base::has_trivial_destructor::value && - base::is_same< - allocator_type, - libc_allocator_with_realloc >::value)> - realloc_and_memmove_ok; // we pretend mv(x,y) == "x.~T(); new(x) T(y)" - set_aux(offset, realloc_and_memmove_ok()); - ++settings.num_buckets; - bmset(i); - } - // This does the actual inserting. Since we made the array using - // malloc, we use "placement new" to just call the constructor. - new(&group[offset]) value_type(val); - return group[offset]; - } - - // We let you see if a bucket is non-empty without retrieving it - bool test(size_type i) const { - return bmtest(i) != 0; - } - bool test(iterator pos) const { - return bmtest(pos.pos) != 0; - } - - private: - // Shrink the array, assuming value_type has trivial copy - // constructor and destructor, and the allocator_type is the default - // libc_allocator_with_alloc. (Really, we want it to have "trivial - // move", because that's what realloc and memmove both do. But - // there's no way to capture that using type_traits, so we pretend - // that move(x, y) is equivalent to ""x.~T(); new(x) T(y);" - // which is pretty much correct, if a bit conservative.) - void erase_aux(size_type offset, base::true_type) { - // This isn't technically necessary, since we know we have a - // trivial destructor, but is a cheap way to get a bit more safety. - group[offset].~value_type(); - // This is equivalent to memmove(), but faster on my Intel P4, - // at lesat with gcc4.1 -O2 / glibc 2.3.6. 
- assert(settings.num_buckets > 0); - for (size_type i = offset; i < settings.num_buckets-1; ++i) - memcpy(static_cast(group + i), group + i+1, sizeof(*group)); // hopefully inlined! - group = settings.realloc_or_die(group, settings.num_buckets-1); - } - - // Shrink the array, without any special assumptions about value_type and - // allocator_type. - void erase_aux(size_type offset, base::false_type) { - // This is valid because 0 <= offset < num_buckets. Note the inequality. - pointer p = allocate_group(settings.num_buckets - 1); - std::uninitialized_copy(group, group + offset, p); - std::uninitialized_copy(group + offset + 1, group + settings.num_buckets, - p + offset); - free_group(); - group = p; - } - - public: - // This takes the specified elements out of the group. This is - // "undefining", rather than "clearing". - // TODO(austern): Make this exception safe: handle exceptions from - // value_type's copy constructor. - void erase(size_type i) { - if ( bmtest(i) ) { // trivial to erase empty bucket - size_type offset = pos_to_offset(bitmap,i); // where we'll find (or insert) - if ( settings.num_buckets == 1 ) { - free_group(); - group = NULL; - } else { - typedef base::integral_constant::value && - base::has_trivial_destructor::value && - base::is_same< - allocator_type, - libc_allocator_with_realloc >::value)> - realloc_and_memmove_ok; // pretend mv(x,y) == "x.~T(); new(x) T(y)" - erase_aux(offset, realloc_and_memmove_ok()); - } - --settings.num_buckets; - bmclear(i); - } - } - - void erase(iterator pos) { - erase(pos.pos); - } - - void erase(iterator start_it, iterator end_it) { - // This could be more efficient, but to do so we'd need to make - // bmclear() clear a range of indices. Doesn't seem worth it. - for ( ; start_it != end_it; ++start_it ) - erase(start_it); - } - - - // I/O - // We support reading and writing groups to disk. We don't store - // the actual array contents (which we don't know how to store), - // just the bitmap and size. Meant to be used with table I/O. - - template bool write_metadata(OUTPUT *fp) const { - // we explicitly set to u_int16_t - assert(sizeof(settings.num_buckets) == 2); - if ( !sparsehash_internal::write_bigendian_number(fp, settings.num_buckets, - 2) ) - return false; - if ( !sparsehash_internal::write_data(fp, bitmap, sizeof(bitmap)) ) - return false; - return true; - } - - // Reading destroys the old group contents! Returns true if all was ok. - template bool read_metadata(INPUT *fp) { - clear(); - if ( !sparsehash_internal::read_bigendian_number(fp, &settings.num_buckets, - 2) ) - return false; - if ( !sparsehash_internal::read_data(fp, bitmap, sizeof(bitmap)) ) - return false; - // We'll allocate the space, but we won't fill it: it will be - // left as uninitialized raw memory. - group = allocate_group(settings.num_buckets); - return true; - } - - // Again, only meaningful if value_type is a POD. - template bool read_nopointer_data(INPUT *fp) { - for ( nonempty_iterator it = nonempty_begin(); - it != nonempty_end(); ++it ) { - if ( !sparsehash_internal::read_data(fp, &(*it), sizeof(*it)) ) - return false; - } - return true; - } - - // If your keys and values are simple enough, we can write them - // to disk for you. "simple enough" means POD and no pointers. - // However, we don't try to normalize endianness. 
- template bool write_nopointer_data(OUTPUT *fp) const { - for ( const_nonempty_iterator it = nonempty_begin(); - it != nonempty_end(); ++it ) { - if ( !sparsehash_internal::write_data(fp, &(*it), sizeof(*it)) ) - return false; - } - return true; - } - - - // Comparisons. We only need to define == and < -- we get - // != > <= >= via relops.h (which we happily included above). - // Note the comparisons are pretty arbitrary: we compare - // values of the first index that isn't equal (using default - // value for empty buckets). - bool operator==(const sparsegroup& x) const { - return ( settings.num_buckets == x.settings.num_buckets && - memcmp(bitmap, x.bitmap, sizeof(bitmap)) == 0 && - std::equal(begin(), end(), x.begin()) ); // from - } - - bool operator<(const sparsegroup& x) const { // also from - return std::lexicographical_compare(begin(), end(), x.begin(), x.end()); - } - bool operator!=(const sparsegroup& x) const { return !(*this == x); } - bool operator<=(const sparsegroup& x) const { return !(x < *this); } - bool operator>(const sparsegroup& x) const { return x < *this; } - bool operator>=(const sparsegroup& x) const { return !(*this < x); } - - private: - template - class alloc_impl : public A { - public: - typedef typename A::pointer pointer; - typedef typename A::size_type size_type; - - // Convert a normal allocator to one that has realloc_or_die() - alloc_impl(const A& a) : A(a) { } - - // realloc_or_die should only be used when using the default - // allocator (libc_allocator_with_realloc). - pointer realloc_or_die(pointer /*ptr*/, size_type /*n*/) { - fprintf(stderr, "realloc_or_die is only supported for " - "libc_allocator_with_realloc\n"); - exit(1); - return NULL; - } - }; - - // A template specialization of alloc_impl for - // libc_allocator_with_realloc that can handle realloc_or_die. - template - class alloc_impl > - : public libc_allocator_with_realloc { - public: - typedef typename libc_allocator_with_realloc::pointer pointer; - typedef typename libc_allocator_with_realloc::size_type size_type; - - alloc_impl(const libc_allocator_with_realloc& a) - : libc_allocator_with_realloc(a) { } - - pointer realloc_or_die(pointer ptr, size_type n) { - pointer retval = this->reallocate(ptr, n); - if (retval == NULL) { - fprintf(stderr, "sparsehash: FATAL ERROR: failed to reallocate " - "%lu elements for ptr %p", static_cast(n), ptr); - exit(1); - } - return retval; - } - }; - - // Package allocator with num_buckets to eliminate memory needed for the - // zero-size allocator. - // If new fields are added to this class, we should add them to - // operator= and swap. 
- class Settings : public alloc_impl { - public: - Settings(const alloc_impl& a, u_int16_t n = 0) - : alloc_impl(a), num_buckets(n) { } - Settings(const Settings& s) - : alloc_impl(s), num_buckets(s.num_buckets) { } - - u_int16_t num_buckets; // limits GROUP_SIZE to 64K - }; - - // The actual data - pointer group; // (small) array of T's - Settings settings; // allocator and num_buckets - unsigned char bitmap[(GROUP_SIZE-1)/8 + 1]; // fancy math is so we round up -}; - -// We need a global swap as well -template -inline void swap(sparsegroup &x, - sparsegroup &y) { - x.swap(y); -} - -// --------------------------------------------------------------------------- - - -template > -class sparsetable { - private: - typedef typename Alloc::template rebind::other value_alloc_type; - typedef typename Alloc::template rebind< - sparsegroup >::other vector_alloc; - - public: - // Basic types - typedef T value_type; // stolen from stl_vector.h - typedef Alloc allocator_type; - typedef typename value_alloc_type::size_type size_type; - typedef typename value_alloc_type::difference_type difference_type; - typedef typename value_alloc_type::reference reference; - typedef typename value_alloc_type::const_reference const_reference; - typedef typename value_alloc_type::pointer pointer; - typedef typename value_alloc_type::const_pointer const_pointer; - typedef table_iterator > iterator; - typedef const_table_iterator > - const_iterator; - typedef table_element_adaptor > - element_adaptor; - typedef std::reverse_iterator const_reverse_iterator; - typedef std::reverse_iterator reverse_iterator; // from iterator.h - - // These are our special iterators, that go over non-empty buckets in a - // table. These aren't const only because you can change non-empty bcks. - typedef two_d_iterator< std::vector< sparsegroup, - vector_alloc> > - nonempty_iterator; - typedef const_two_d_iterator< std::vector< sparsegroup, - vector_alloc> > - const_nonempty_iterator; - typedef std::reverse_iterator reverse_nonempty_iterator; - typedef std::reverse_iterator const_reverse_nonempty_iterator; - // Another special iterator: it frees memory as it iterates (used to resize) - typedef destructive_two_d_iterator< std::vector< sparsegroup, - vector_alloc> > - destructive_iterator; - - // Iterator functions - iterator begin() { return iterator(this, 0); } - const_iterator begin() const { return const_iterator(this, 0); } - iterator end() { return iterator(this, size()); } - const_iterator end() const { return const_iterator(this, size()); } - reverse_iterator rbegin() { return reverse_iterator(end()); } - const_reverse_iterator rbegin() const { return const_reverse_iterator(end()); } - reverse_iterator rend() { return reverse_iterator(begin()); } - const_reverse_iterator rend() const { return const_reverse_iterator(begin()); } - - // Versions for our special non-empty iterator - nonempty_iterator nonempty_begin() { - return nonempty_iterator(groups.begin(), groups.end(), groups.begin()); - } - const_nonempty_iterator nonempty_begin() const { - return const_nonempty_iterator(groups.begin(),groups.end(), groups.begin()); - } - nonempty_iterator nonempty_end() { - return nonempty_iterator(groups.begin(), groups.end(), groups.end()); - } - const_nonempty_iterator nonempty_end() const { - return const_nonempty_iterator(groups.begin(), groups.end(), groups.end()); - } - reverse_nonempty_iterator nonempty_rbegin() { - return reverse_nonempty_iterator(nonempty_end()); - } - const_reverse_nonempty_iterator nonempty_rbegin() const { - return 
const_reverse_nonempty_iterator(nonempty_end()); - } - reverse_nonempty_iterator nonempty_rend() { - return reverse_nonempty_iterator(nonempty_begin()); - } - const_reverse_nonempty_iterator nonempty_rend() const { - return const_reverse_nonempty_iterator(nonempty_begin()); - } - destructive_iterator destructive_begin() { - return destructive_iterator(groups.begin(), groups.end(), groups.begin()); - } - destructive_iterator destructive_end() { - return destructive_iterator(groups.begin(), groups.end(), groups.end()); - } - - typedef sparsegroup group_type; - typedef std::vector group_vector_type; - - typedef typename group_vector_type::reference GroupsReference; - typedef typename group_vector_type::const_reference GroupsConstReference; - typedef typename group_vector_type::iterator GroupsIterator; - typedef typename group_vector_type::const_iterator GroupsConstIterator; - - // How to deal with the proper group - static size_type num_groups(size_type num) { // how many to hold num buckets - return num == 0 ? 0 : ((num-1) / GROUP_SIZE) + 1; - } - - u_int16_t pos_in_group(size_type i) const { - return static_cast(i % GROUP_SIZE); - } - size_type group_num(size_type i) const { - return i / GROUP_SIZE; - } - GroupsReference which_group(size_type i) { - return groups[group_num(i)]; - } - GroupsConstReference which_group(size_type i) const { - return groups[group_num(i)]; - } - - public: - // Constructors -- default, normal (when you specify size), and copy - explicit sparsetable(size_type sz = 0, Alloc alloc = Alloc()) - : groups(vector_alloc(alloc)), settings(alloc, sz) { - groups.resize(num_groups(sz), group_type(settings)); - } - // We can get away with using the default copy constructor, - // and default destructor, and hence the default operator=. Huzzah! - - // Many STL algorithms use swap instead of copy constructors - void swap(sparsetable& x) { - std::swap(groups, x.groups); // defined in stl_algobase.h - std::swap(settings.table_size, x.settings.table_size); - std::swap(settings.num_buckets, x.settings.num_buckets); - } - - // It's always nice to be able to clear a table without deallocating it - void clear() { - GroupsIterator group; - for ( group = groups.begin(); group != groups.end(); ++group ) { - group->clear(); - } - settings.num_buckets = 0; - } - - // ACCESSOR FUNCTIONS for the things we templatize on, basically - allocator_type get_allocator() const { - return allocator_type(settings); - } - - - // Functions that tell you about size. - // NOTE: empty() is non-intuitive! It does not tell you the number - // of not-empty buckets (use num_nonempty() for that). Instead - // it says whether you've allocated any buckets or not. 
- size_type size() const { return settings.table_size; } - size_type max_size() const { return settings.max_size(); } - bool empty() const { return settings.table_size == 0; } - // We also may want to know how many *used* buckets there are - size_type num_nonempty() const { return settings.num_buckets; } - - // OK, we'll let you resize one of these puppies - void resize(size_type new_size) { - groups.resize(num_groups(new_size), group_type(settings)); - if ( new_size < settings.table_size) { - // lower num_buckets, clear last group - if ( pos_in_group(new_size) > 0 ) // need to clear inside last group - groups.back().erase(groups.back().begin() + pos_in_group(new_size), - groups.back().end()); - settings.num_buckets = 0; // refigure # of used buckets - GroupsConstIterator group; - for ( group = groups.begin(); group != groups.end(); ++group ) - settings.num_buckets += group->num_nonempty(); - } - settings.table_size = new_size; - } - - - // We let you see if a bucket is non-empty without retrieving it - bool test(size_type i) const { - assert(i < settings.table_size); - return which_group(i).test(pos_in_group(i)); - } - bool test(iterator pos) const { - return which_group(pos.pos).test(pos_in_group(pos.pos)); - } - bool test(const_iterator pos) const { - return which_group(pos.pos).test(pos_in_group(pos.pos)); - } - - // We only return const_references because it's really hard to - // return something settable for empty buckets. Use set() instead. - const_reference get(size_type i) const { - assert(i < settings.table_size); - return which_group(i).get(pos_in_group(i)); - } - - // TODO(csilvers): make protected + friend - // This is used by sparse_hashtable to get an element from the table - // when we know it exists (because the caller has called test(i)). - const_reference unsafe_get(size_type i) const { - assert(i < settings.table_size); - assert(test(i)); - return which_group(i).unsafe_get(pos_in_group(i)); - } - - // TODO(csilvers): make protected + friend element_adaptor - reference mutating_get(size_type i) { // fills bucket i before getting - assert(i < settings.table_size); - typename group_type::size_type old_numbuckets = which_group(i).num_nonempty(); - reference retval = which_group(i).mutating_get(pos_in_group(i)); - settings.num_buckets += which_group(i).num_nonempty() - old_numbuckets; - return retval; - } - - // Syntactic sugar. As in sparsegroup, the non-const version is harder - const_reference operator[](size_type i) const { - return get(i); - } - - element_adaptor operator[](size_type i) { - return element_adaptor(this, i); - } - - // Needed for hashtables, gets as a nonempty_iterator. Crashes for empty bcks - const_nonempty_iterator get_iter(size_type i) const { - assert(test(i)); // how can a nonempty_iterator point to an empty bucket? - return const_nonempty_iterator( - groups.begin(), groups.end(), - groups.begin() + group_num(i), - (groups[group_num(i)].nonempty_begin() + - groups[group_num(i)].pos_to_offset(pos_in_group(i)))); - } - // For nonempty we can return a non-const version - nonempty_iterator get_iter(size_type i) { - assert(test(i)); // how can a nonempty_iterator point to an empty bucket? - return nonempty_iterator( - groups.begin(), groups.end(), - groups.begin() + group_num(i), - (groups[group_num(i)].nonempty_begin() + - groups[group_num(i)].pos_to_offset(pos_in_group(i)))); - } - - // And the reverse transformation. 
- size_type get_pos(const const_nonempty_iterator it) const { - difference_type current_row = it.row_current - it.row_begin; - difference_type current_col = (it.col_current - - groups[current_row].nonempty_begin()); - return ((current_row * GROUP_SIZE) + - groups[current_row].offset_to_pos(current_col)); - } - - - // This returns a reference to the inserted item (which is a copy of val) - // The trick is to figure out whether we're replacing or inserting anew - reference set(size_type i, const_reference val) { - assert(i < settings.table_size); - typename group_type::size_type old_numbuckets = which_group(i).num_nonempty(); - reference retval = which_group(i).set(pos_in_group(i), val); - settings.num_buckets += which_group(i).num_nonempty() - old_numbuckets; - return retval; - } - - // This takes the specified elements out of the table. This is - // "undefining", rather than "clearing". - void erase(size_type i) { - assert(i < settings.table_size); - typename group_type::size_type old_numbuckets = which_group(i).num_nonempty(); - which_group(i).erase(pos_in_group(i)); - settings.num_buckets += which_group(i).num_nonempty() - old_numbuckets; - } - - void erase(iterator pos) { - erase(pos.pos); - } - - void erase(iterator start_it, iterator end_it) { - // This could be more efficient, but then we'd need to figure - // out if we spanned groups or not. Doesn't seem worth it. - for ( ; start_it != end_it; ++start_it ) - erase(start_it); - } - - - // We support reading and writing tables to disk. We don't store - // the actual array contents (which we don't know how to store), - // just the groups and sizes. Returns true if all went ok. - - private: - // Every time the disk format changes, this should probably change too - typedef unsigned long MagicNumberType; - static const MagicNumberType MAGIC_NUMBER = 0x24687531; - - // Old versions of this code write all data in 32 bits. We need to - // support these files as well as having support for 64-bit systems. - // So we use the following encoding scheme: for values < 2^32-1, we - // store in 4 bytes in big-endian order. For values > 2^32, we - // store 0xFFFFFFF followed by 8 bytes in big-endian order. This - // causes us to mis-read old-version code that stores exactly - // 0xFFFFFFF, but I don't think that is likely to have happened for - // these particular values. - template - static bool write_32_or_64(OUTPUT* fp, IntType value) { - if ( value < 0xFFFFFFFFULL ) { // fits in 4 bytes - if ( !sparsehash_internal::write_bigendian_number(fp, value, 4) ) - return false; - } else { - if ( !sparsehash_internal::write_bigendian_number(fp, 0xFFFFFFFFUL, 4) ) - return false; - if ( !sparsehash_internal::write_bigendian_number(fp, value, 8) ) - return false; - } - return true; - } - - template - static bool read_32_or_64(INPUT* fp, IntType *value) { // reads into value - MagicNumberType first4 = 0; // a convenient 32-bit unsigned type - if ( !sparsehash_internal::read_bigendian_number(fp, &first4, 4) ) - return false; - if ( first4 < 0xFFFFFFFFULL ) { - *value = first4; - } else { - if ( !sparsehash_internal::read_bigendian_number(fp, value, 8) ) - return false; - } - return true; - } - - public: - // read/write_metadata() and read_write/nopointer_data() are DEPRECATED. - // Use serialize() and unserialize(), below, for new code. 
- - template bool write_metadata(OUTPUT *fp) const { - if ( !write_32_or_64(fp, MAGIC_NUMBER) ) return false; - if ( !write_32_or_64(fp, settings.table_size) ) return false; - if ( !write_32_or_64(fp, settings.num_buckets) ) return false; - - GroupsConstIterator group; - for ( group = groups.begin(); group != groups.end(); ++group ) - if ( group->write_metadata(fp) == false ) return false; - return true; - } - - // Reading destroys the old table contents! Returns true if read ok. - template bool read_metadata(INPUT *fp) { - size_type magic_read = 0; - if ( !read_32_or_64(fp, &magic_read) ) return false; - if ( magic_read != MAGIC_NUMBER ) { - clear(); // just to be consistent - return false; - } - - if ( !read_32_or_64(fp, &settings.table_size) ) return false; - if ( !read_32_or_64(fp, &settings.num_buckets) ) return false; - - resize(settings.table_size); // so the vector's sized ok - GroupsIterator group; - for ( group = groups.begin(); group != groups.end(); ++group ) - if ( group->read_metadata(fp) == false ) return false; - return true; - } - - // This code is identical to that for SparseGroup - // If your keys and values are simple enough, we can write them - // to disk for you. "simple enough" means no pointers. - // However, we don't try to normalize endianness - bool write_nopointer_data(FILE *fp) const { - for ( const_nonempty_iterator it = nonempty_begin(); - it != nonempty_end(); ++it ) { - if ( !fwrite(&*it, sizeof(*it), 1, fp) ) return false; - } - return true; - } - - // When reading, we have to override the potential const-ness of *it - bool read_nopointer_data(FILE *fp) { - for ( nonempty_iterator it = nonempty_begin(); - it != nonempty_end(); ++it ) { - if ( !fread(reinterpret_cast(&(*it)), sizeof(*it), 1, fp) ) - return false; - } - return true; - } - - // INPUT and OUTPUT must be either a FILE, *or* a C++ stream - // (istream, ostream, etc) *or* a class providing - // Read(void*, size_t) and Write(const void*, size_t) - // (respectively), which writes a buffer into a stream - // (which the INPUT/OUTPUT instance presumably owns). - - typedef sparsehash_internal::pod_serializer NopointerSerializer; - - // ValueSerializer: a functor. operator()(OUTPUT*, const value_type&) - template - bool serialize(ValueSerializer serializer, OUTPUT *fp) { - if ( !write_metadata(fp) ) - return false; - for ( const_nonempty_iterator it = nonempty_begin(); - it != nonempty_end(); ++it ) { - if ( !serializer(fp, *it) ) return false; - } - return true; - } - - // ValueSerializer: a functor. operator()(INPUT*, value_type*) - template - bool unserialize(ValueSerializer serializer, INPUT *fp) { - clear(); - if ( !read_metadata(fp) ) - return false; - for ( nonempty_iterator it = nonempty_begin(); - it != nonempty_end(); ++it ) { - if ( !serializer(fp, &*it) ) return false; - } - return true; - } - - // Comparisons. Note the comparisons are pretty arbitrary: we - // compare values of the first index that isn't equal (using default - // value for empty buckets). 
- bool operator==(const sparsetable& x) const { - return ( settings.table_size == x.settings.table_size && - settings.num_buckets == x.settings.num_buckets && - groups == x.groups ); - } - - bool operator<(const sparsetable& x) const { - return std::lexicographical_compare(begin(), end(), x.begin(), x.end()); - } - bool operator!=(const sparsetable& x) const { return !(*this == x); } - bool operator<=(const sparsetable& x) const { return !(x < *this); } - bool operator>(const sparsetable& x) const { return x < *this; } - bool operator>=(const sparsetable& x) const { return !(*this < x); } - - - private: - // Package allocator with table_size and num_buckets to eliminate memory - // needed for the zero-size allocator. - // If new fields are added to this class, we should add them to - // operator= and swap. - class Settings : public allocator_type { - public: - typedef typename allocator_type::size_type size_type; - - Settings(const allocator_type& a, size_type sz = 0, size_type n = 0) - : allocator_type(a), table_size(sz), num_buckets(n) { } - - Settings(const Settings& s) - : allocator_type(s), - table_size(s.table_size), num_buckets(s.num_buckets) { } - - size_type table_size; // how many buckets they want - size_type num_buckets; // number of non-empty buckets - }; - - // The actual data - group_vector_type groups; // our list of groups - Settings settings; // allocator, table size, buckets -}; - -// We need a global swap as well -template -inline void swap(sparsetable &x, - sparsetable &y) { - x.swap(y); -} - -_END_GOOGLE_NAMESPACE_ - -#endif // UTIL_GTL_SPARSETABLE_H_ diff --git a/contrib/libsparsehash/sparsehash/template_util.h b/contrib/libsparsehash/sparsehash/template_util.h deleted file mode 100644 index 6fec3d09243..00000000000 --- a/contrib/libsparsehash/sparsehash/template_util.h +++ /dev/null @@ -1,134 +0,0 @@ -// Copyright 2005 Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// ---- -// -// Template metaprogramming utility functions. 
-// -// This code is compiled directly on many platforms, including client -// platforms like Windows, Mac, and embedded systems. Before making -// any changes here, make sure that you're not breaking any platforms. -// -// -// The names choosen here reflect those used in tr1 and the boost::mpl -// library, there are similar operations used in the Loki library as -// well. I prefer the boost names for 2 reasons: -// 1. I think that portions of the Boost libraries are more likely to -// be included in the c++ standard. -// 2. It is not impossible that some of the boost libraries will be -// included in our own build in the future. -// Both of these outcomes means that we may be able to directly replace -// some of these with boost equivalents. -// -#ifndef BASE_TEMPLATE_UTIL_H_ -#define BASE_TEMPLATE_UTIL_H_ - -#include -_START_GOOGLE_NAMESPACE_ - -// Types small_ and big_ are guaranteed such that sizeof(small_) < -// sizeof(big_) -typedef char small_; - -struct big_ { - char dummy[2]; -}; - -// Identity metafunction. -template -struct identity_ { - typedef T type; -}; - -// integral_constant, defined in tr1, is a wrapper for an integer -// value. We don't really need this generality; we could get away -// with hardcoding the integer type to bool. We use the fully -// general integer_constant for compatibility with tr1. - -template -struct integral_constant { - static const T value = v; - typedef T value_type; - typedef integral_constant type; -}; - -template const T integral_constant::value; - - -// Abbreviations: true_type and false_type are structs that represent boolean -// true and false values. Also define the boost::mpl versions of those names, -// true_ and false_. -typedef integral_constant true_type; -typedef integral_constant false_type; -typedef true_type true_; -typedef false_type false_; - -// if_ is a templatized conditional statement. -// if_ is a compile time evaluation of cond. -// if_<>::type contains A if cond is true, B otherwise. -template -struct if_{ - typedef A type; -}; - -template -struct if_ { - typedef B type; -}; - - -// type_equals_ is a template type comparator, similar to Loki IsSameType. -// type_equals_::value is true iff "A" is the same type as "B". -// -// New code should prefer base::is_same, defined in base/type_traits.h. -// It is functionally identical, but is_same is the standard spelling. -template -struct type_equals_ : public false_ { -}; - -template -struct type_equals_ : public true_ { -}; - -// and_ is a template && operator. -// and_::value evaluates "A::value && B::value". -template -struct and_ : public integral_constant { -}; - -// or_ is a template || operator. -// or_::value evaluates "A::value || B::value". -template -struct or_ : public integral_constant { -}; - - -_END_GOOGLE_NAMESPACE_ - -#endif // BASE_TEMPLATE_UTIL_H_ diff --git a/contrib/libsparsehash/sparsehash/type_traits.h b/contrib/libsparsehash/sparsehash/type_traits.h deleted file mode 100644 index f909cf9a374..00000000000 --- a/contrib/libsparsehash/sparsehash/type_traits.h +++ /dev/null @@ -1,342 +0,0 @@ -// Copyright (c) 2006, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. 
-// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// ---- -// -// This code is compiled directly on many platforms, including client -// platforms like Windows, Mac, and embedded systems. Before making -// any changes here, make sure that you're not breaking any platforms. -// -// Define a small subset of tr1 type traits. The traits we define are: -// is_integral -// is_floating_point -// is_pointer -// is_enum -// is_reference -// is_pod -// has_trivial_constructor -// has_trivial_copy -// has_trivial_assign -// has_trivial_destructor -// remove_const -// remove_volatile -// remove_cv -// remove_reference -// add_reference -// remove_pointer -// is_same -// is_convertible -// We can add more type traits as required. - -#ifndef BASE_TYPE_TRAITS_H_ -#define BASE_TYPE_TRAITS_H_ - -#include -#include // For pair - -#include // For true_type and false_type - -_START_GOOGLE_NAMESPACE_ - -template struct is_integral; -template struct is_floating_point; -template struct is_pointer; -// MSVC can't compile this correctly, and neither can gcc 3.3.5 (at least) -#if !defined(_MSC_VER) && !(defined(__GNUC__) && __GNUC__ <= 3) -// is_enum uses is_convertible, which is not available on MSVC. -template struct is_enum; -#endif -template struct is_reference; -template struct is_pod; -template struct has_trivial_constructor; -template struct has_trivial_copy; -template struct has_trivial_assign; -template struct has_trivial_destructor; -template struct remove_const; -template struct remove_volatile; -template struct remove_cv; -template struct remove_reference; -template struct add_reference; -template struct remove_pointer; -template struct is_same; -#if !defined(_MSC_VER) && !(defined(__GNUC__) && __GNUC__ <= 3) -template struct is_convertible; -#endif - -// is_integral is false except for the built-in integer types. A -// cv-qualified type is integral if and only if the underlying type is. -template struct is_integral : false_type { }; -template<> struct is_integral : true_type { }; -template<> struct is_integral : true_type { }; -template<> struct is_integral : true_type { }; -template<> struct is_integral : true_type { }; -#if defined(_MSC_VER) -// wchar_t is not by default a distinct type from unsigned short in -// Microsoft C. 
-// See http://msdn2.microsoft.com/en-us/library/dh8che7s(VS.80).aspx -template<> struct is_integral<__wchar_t> : true_type { }; -#else -template<> struct is_integral : true_type { }; -#endif -template<> struct is_integral : true_type { }; -template<> struct is_integral : true_type { }; -template<> struct is_integral : true_type { }; -template<> struct is_integral : true_type { }; -template<> struct is_integral : true_type { }; -template<> struct is_integral : true_type { }; -#ifdef HAVE_LONG_LONG -template<> struct is_integral : true_type { }; -template<> struct is_integral : true_type { }; -#endif -template struct is_integral : is_integral { }; -template struct is_integral : is_integral { }; -template struct is_integral : is_integral { }; - -// is_floating_point is false except for the built-in floating-point types. -// A cv-qualified type is integral if and only if the underlying type is. -template struct is_floating_point : false_type { }; -template<> struct is_floating_point : true_type { }; -template<> struct is_floating_point : true_type { }; -template<> struct is_floating_point : true_type { }; -template struct is_floating_point - : is_floating_point { }; -template struct is_floating_point - : is_floating_point { }; -template struct is_floating_point - : is_floating_point { }; - -// is_pointer is false except for pointer types. A cv-qualified type (e.g. -// "int* const", as opposed to "int const*") is cv-qualified if and only if -// the underlying type is. -template struct is_pointer : false_type { }; -template struct is_pointer : true_type { }; -template struct is_pointer : is_pointer { }; -template struct is_pointer : is_pointer { }; -template struct is_pointer : is_pointer { }; - -#if !defined(_MSC_VER) && !(defined(__GNUC__) && __GNUC__ <= 3) - -namespace internal { - -template struct is_class_or_union { - template static small_ tester(void (U::*)()); - template static big_ tester(...); - static const bool value = sizeof(tester(0)) == sizeof(small_); -}; - -// is_convertible chokes if the first argument is an array. That's why -// we use add_reference here. -template struct is_enum_impl - : is_convertible::type, int> { }; - -template struct is_enum_impl : false_type { }; - -} // namespace internal - -// Specified by TR1 [4.5.1] primary type categories. - -// Implementation note: -// -// Each type is either void, integral, floating point, array, pointer, -// reference, member object pointer, member function pointer, enum, -// union or class. Out of these, only integral, floating point, reference, -// class and enum types are potentially convertible to int. Therefore, -// if a type is not a reference, integral, floating point or class and -// is convertible to int, it's a enum. Adding cv-qualification to a type -// does not change whether it's an enum. -// -// Is-convertible-to-int check is done only if all other checks pass, -// because it can't be used with some types (e.g. void or classes with -// inaccessible conversion operators). -template struct is_enum - : internal::is_enum_impl< - is_same::value || - is_integral::value || - is_floating_point::value || - is_reference::value || - internal::is_class_or_union::value, - T> { }; - -template struct is_enum : is_enum { }; -template struct is_enum : is_enum { }; -template struct is_enum : is_enum { }; - -#endif - -// is_reference is false except for reference types. 
-template struct is_reference : false_type {}; -template struct is_reference : true_type {}; - - -// We can't get is_pod right without compiler help, so fail conservatively. -// We will assume it's false except for arithmetic types, enumerations, -// pointers and cv-qualified versions thereof. Note that std::pair -// is not a POD even if T and U are PODs. -template struct is_pod - : integral_constant::value || - is_floating_point::value || -#if !defined(_MSC_VER) && !(defined(__GNUC__) && __GNUC__ <= 3) - // is_enum is not available on MSVC. - is_enum::value || -#endif - is_pointer::value)> { }; -template struct is_pod : is_pod { }; -template struct is_pod : is_pod { }; -template struct is_pod : is_pod { }; - - -// We can't get has_trivial_constructor right without compiler help, so -// fail conservatively. We will assume it's false except for: (1) types -// for which is_pod is true. (2) std::pair of types with trivial -// constructors. (3) array of a type with a trivial constructor. -// (4) const versions thereof. -template struct has_trivial_constructor : is_pod { }; -template struct has_trivial_constructor > - : integral_constant::value && - has_trivial_constructor::value)> { }; -template struct has_trivial_constructor - : has_trivial_constructor { }; -template struct has_trivial_constructor - : has_trivial_constructor { }; - -// We can't get has_trivial_copy right without compiler help, so fail -// conservatively. We will assume it's false except for: (1) types -// for which is_pod is true. (2) std::pair of types with trivial copy -// constructors. (3) array of a type with a trivial copy constructor. -// (4) const versions thereof. -template struct has_trivial_copy : is_pod { }; -template struct has_trivial_copy > - : integral_constant::value && - has_trivial_copy::value)> { }; -template struct has_trivial_copy - : has_trivial_copy { }; -template struct has_trivial_copy : has_trivial_copy { }; - -// We can't get has_trivial_assign right without compiler help, so fail -// conservatively. We will assume it's false except for: (1) types -// for which is_pod is true. (2) std::pair of types with trivial copy -// constructors. (3) array of a type with a trivial assign constructor. -template struct has_trivial_assign : is_pod { }; -template struct has_trivial_assign > - : integral_constant::value && - has_trivial_assign::value)> { }; -template struct has_trivial_assign - : has_trivial_assign { }; - -// We can't get has_trivial_destructor right without compiler help, so -// fail conservatively. We will assume it's false except for: (1) types -// for which is_pod is true. (2) std::pair of types with trivial -// destructors. (3) array of a type with a trivial destructor. -// (4) const versions thereof. -template struct has_trivial_destructor : is_pod { }; -template struct has_trivial_destructor > - : integral_constant::value && - has_trivial_destructor::value)> { }; -template struct has_trivial_destructor - : has_trivial_destructor { }; -template struct has_trivial_destructor - : has_trivial_destructor { }; - -// Specified by TR1 [4.7.1] -template struct remove_const { typedef T type; }; -template struct remove_const { typedef T type; }; -template struct remove_volatile { typedef T type; }; -template struct remove_volatile { typedef T type; }; -template struct remove_cv { - typedef typename remove_const::type>::type type; -}; - - -// Specified by TR1 [4.7.2] Reference modifications. 
-template struct remove_reference { typedef T type; }; -template struct remove_reference { typedef T type; }; - -template struct add_reference { typedef T& type; }; -template struct add_reference { typedef T& type; }; - -// Specified by TR1 [4.7.4] Pointer modifications. -template struct remove_pointer { typedef T type; }; -template struct remove_pointer { typedef T type; }; -template struct remove_pointer { typedef T type; }; -template struct remove_pointer { typedef T type; }; -template struct remove_pointer { - typedef T type; }; - -// Specified by TR1 [4.6] Relationships between types -template struct is_same : public false_type { }; -template struct is_same : public true_type { }; - -// Specified by TR1 [4.6] Relationships between types -#if !defined(_MSC_VER) && !(defined(__GNUC__) && __GNUC__ <= 3) -namespace internal { - -// This class is an implementation detail for is_convertible, and you -// don't need to know how it works to use is_convertible. For those -// who care: we declare two different functions, one whose argument is -// of type To and one with a variadic argument list. We give them -// return types of different size, so we can use sizeof to trick the -// compiler into telling us which function it would have chosen if we -// had called it with an argument of type From. See Alexandrescu's -// _Modern C++ Design_ for more details on this sort of trick. - -template -struct ConvertHelper { - static small_ Test(To); - static big_ Test(...); - static From Create(); -}; -} // namespace internal - -// Inherits from true_type if From is convertible to To, false_type otherwise. -template -struct is_convertible - : integral_constant::Test( - internal::ConvertHelper::Create())) - == sizeof(small_)> { -}; -#endif - -_END_GOOGLE_NAMESPACE_ - -// Right now these macros are no-ops, and mostly just document the fact -// these types are PODs, for human use. They may be made more contentful -// later. The typedef is just to make it legal to put a semicolon after -// these macros. 
-#define DECLARE_POD(TypeName) typedef int Dummy_Type_For_DECLARE_POD -#define DECLARE_NESTED_POD(TypeName) DECLARE_POD(TypeName) -#define PROPAGATE_POD_FROM_TEMPLATE_ARGUMENT(TemplateName) \ - typedef int Dummy_Type_For_PROPAGATE_POD_FROM_TEMPLATE_ARGUMENT -#define ENFORCE_POD(TypeName) typedef int Dummy_Type_For_ENFORCE_POD - -#endif // BASE_TYPE_TRAITS_H_ diff --git a/contrib/sparsehash-c11 b/contrib/sparsehash-c11 new file mode 160000 index 00000000000..cf0bffaa456 --- /dev/null +++ b/contrib/sparsehash-c11 @@ -0,0 +1 @@ +Subproject commit cf0bffaa456f23bc4174462a789b90f8b6f5f42f diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt index 2c8f99b9738..b2cbef3b983 100644 --- a/dbms/CMakeLists.txt +++ b/dbms/CMakeLists.txt @@ -176,6 +176,7 @@ add_object_library(clickhouse_processors_formats_impl src/Processors/Formats/Imp add_object_library(clickhouse_processors_transforms src/Processors/Transforms) add_object_library(clickhouse_processors_sources src/Processors/Sources) + if (MAKE_STATIC_LIBRARIES OR NOT SPLIT_SHARED_LIBRARIES) add_library (dbms STATIC ${dbms_headers} ${dbms_sources}) set (all_modules dbms) @@ -394,7 +395,7 @@ if (OPENSSL_CRYPTO_LIBRARY) endif () dbms_target_include_directories (SYSTEM BEFORE PRIVATE ${DIVIDE_INCLUDE_DIR}) -dbms_target_include_directories (SYSTEM BEFORE PRIVATE ${SPARCEHASH_INCLUDE_DIR}) +dbms_target_include_directories (SYSTEM BEFORE PRIVATE ${SPARSEHASH_INCLUDE_DIR}) if (USE_PROTOBUF) dbms_target_link_libraries (PRIVATE ${Protobuf_LIBRARY}) diff --git a/dbms/src/Common/tests/CMakeLists.txt b/dbms/src/Common/tests/CMakeLists.txt index 67c0e376f74..28fe4d49667 100644 --- a/dbms/src/Common/tests/CMakeLists.txt +++ b/dbms/src/Common/tests/CMakeLists.txt @@ -61,7 +61,7 @@ add_executable (space_saving space_saving.cpp) target_link_libraries (space_saving PRIVATE clickhouse_common_io) add_executable (integer_hash_tables_and_hashes integer_hash_tables_and_hashes.cpp) -target_include_directories (integer_hash_tables_and_hashes SYSTEM BEFORE PRIVATE ${SPARCEHASH_INCLUDE_DIR}) +target_include_directories (integer_hash_tables_and_hashes SYSTEM BEFORE PRIVATE ${SPARSEHASH_INCLUDE_DIR}) target_link_libraries (integer_hash_tables_and_hashes PRIVATE dbms) add_executable (allocator allocator.cpp) diff --git a/dbms/src/Common/tests/integer_hash_tables_and_hashes.cpp b/dbms/src/Common/tests/integer_hash_tables_and_hashes.cpp index d6ae055cc5e..091eb4acb93 100644 --- a/dbms/src/Common/tests/integer_hash_tables_and_hashes.cpp +++ b/dbms/src/Common/tests/integer_hash_tables_and_hashes.cpp @@ -331,8 +331,8 @@ void NO_INLINE testForEachMapAndHash(const Key * data, size_t size) testForEachHash(data, size, nothing); testForEachHash(data, size, nothing); - testForEachHash(data, size, [](auto & map){ map.set_empty_key(-1); }); - testForEachHash(data, size, nothing); + testForEachHash<::google::dense_hash_map>(data, size, [](auto & map){ map.set_empty_key(-1); }); + testForEachHash<::google::sparse_hash_map>(data, size, nothing); } diff --git a/dbms/src/Core/NamesAndTypes.cpp b/dbms/src/Core/NamesAndTypes.cpp index 15c2be689bf..d76f457fd1a 100644 --- a/dbms/src/Core/NamesAndTypes.cpp +++ b/dbms/src/Core/NamesAndTypes.cpp @@ -138,7 +138,7 @@ NamesAndTypesList NamesAndTypesList::filter(const Names & names) const NamesAndTypesList NamesAndTypesList::addTypes(const Names & names) const { /// NOTE It's better to make a map in `IStorage` than to create it here every time again. 
- GOOGLE_NAMESPACE::dense_hash_map types; + ::google::dense_hash_map types; types.set_empty_key(StringRef()); for (const NameAndTypePair & column : *this) diff --git a/dbms/src/Core/tests/CMakeLists.txt b/dbms/src/Core/tests/CMakeLists.txt index 7f177978d26..04b8723e91b 100644 --- a/dbms/src/Core/tests/CMakeLists.txt +++ b/dbms/src/Core/tests/CMakeLists.txt @@ -1,6 +1,6 @@ add_executable (string_pool string_pool.cpp) target_link_libraries (string_pool PRIVATE clickhouse_common_io) -target_include_directories (string_pool SYSTEM BEFORE PRIVATE ${SPARCEHASH_INCLUDE_DIR}) +target_include_directories (string_pool SYSTEM BEFORE PRIVATE ${SPARSEHASH_INCLUDE_DIR}) add_executable (field field.cpp) target_link_libraries (field PRIVATE dbms) diff --git a/dbms/src/Core/tests/string_pool.cpp b/dbms/src/Core/tests/string_pool.cpp index 04fa33f3abb..4f792860029 100644 --- a/dbms/src/Core/tests/string_pool.cpp +++ b/dbms/src/Core/tests/string_pool.cpp @@ -33,8 +33,8 @@ int main(int argc, char ** argv) using Vec = std::vector; using Set = std::unordered_map; using RefsSet = std::unordered_map; - using DenseSet = GOOGLE_NAMESPACE::dense_hash_map; - using RefsDenseSet = GOOGLE_NAMESPACE::dense_hash_map; + using DenseSet = ::google::dense_hash_map; + using RefsDenseSet = ::google::dense_hash_map; using RefsHashMap = HashMap; Vec vec; diff --git a/dbms/src/Interpreters/tests/CMakeLists.txt b/dbms/src/Interpreters/tests/CMakeLists.txt index 03c06eb7257..8267b544fd6 100644 --- a/dbms/src/Interpreters/tests/CMakeLists.txt +++ b/dbms/src/Interpreters/tests/CMakeLists.txt @@ -11,11 +11,11 @@ add_executable (aggregate aggregate.cpp) target_link_libraries (aggregate PRIVATE dbms) add_executable (hash_map hash_map.cpp) -target_include_directories (hash_map SYSTEM BEFORE PRIVATE ${SPARCEHASH_INCLUDE_DIR}) +target_include_directories (hash_map SYSTEM BEFORE PRIVATE ${SPARSEHASH_INCLUDE_DIR}) target_link_libraries (hash_map PRIVATE dbms) add_executable (hash_map_lookup hash_map_lookup.cpp) -target_include_directories (hash_map_lookup SYSTEM BEFORE PRIVATE ${SPARCEHASH_INCLUDE_DIR}) +target_include_directories (hash_map_lookup SYSTEM BEFORE PRIVATE ${SPARSEHASH_INCLUDE_DIR}) target_link_libraries (hash_map_lookup PRIVATE dbms) add_executable (hash_map3 hash_map3.cpp) @@ -23,7 +23,7 @@ target_include_directories(hash_map3 SYSTEM BEFORE PRIVATE ${METROHASH_INCLUDE_D target_link_libraries (hash_map3 PRIVATE dbms ${FARMHASH_LIBRARIES} ${METROHASH_LIBRARIES}) add_executable (hash_map_string hash_map_string.cpp) -target_include_directories (hash_map_string SYSTEM BEFORE PRIVATE ${SPARCEHASH_INCLUDE_DIR}) +target_include_directories (hash_map_string SYSTEM BEFORE PRIVATE ${SPARSEHASH_INCLUDE_DIR}) target_link_libraries (hash_map_string PRIVATE dbms) add_executable (hash_map_string_2 hash_map_string_2.cpp) @@ -34,11 +34,11 @@ target_include_directories(hash_map_string_3 SYSTEM BEFORE PRIVATE ${METROHASH_I target_link_libraries (hash_map_string_3 PRIVATE dbms ${FARMHASH_LIBRARIES} ${METROHASH_LIBRARIES}) add_executable (hash_map_string_small hash_map_string_small.cpp) -target_include_directories (hash_map_string_small SYSTEM BEFORE PRIVATE ${SPARCEHASH_INCLUDE_DIR}) +target_include_directories (hash_map_string_small SYSTEM BEFORE PRIVATE ${SPARSEHASH_INCLUDE_DIR}) target_link_libraries (hash_map_string_small PRIVATE dbms) add_executable (two_level_hash_map two_level_hash_map.cpp) -target_include_directories (two_level_hash_map SYSTEM BEFORE PRIVATE ${SPARCEHASH_INCLUDE_DIR}) +target_include_directories (two_level_hash_map SYSTEM 
BEFORE PRIVATE ${SPARSEHASH_INCLUDE_DIR}) target_link_libraries (two_level_hash_map PRIVATE dbms) add_executable (logical_expressions_optimizer logical_expressions_optimizer.cpp) diff --git a/dbms/src/Interpreters/tests/hash_map.cpp b/dbms/src/Interpreters/tests/hash_map.cpp index 275cd367179..0bbabab8632 100644 --- a/dbms/src/Interpreters/tests/hash_map.cpp +++ b/dbms/src/Interpreters/tests/hash_map.cpp @@ -267,8 +267,8 @@ int main(int argc, char ** argv) { Stopwatch watch; - GOOGLE_NAMESPACE::dense_hash_map> map; - GOOGLE_NAMESPACE::dense_hash_map>::iterator it; + ::google::dense_hash_map> map; + ::google::dense_hash_map>::iterator it; map.set_empty_key(-1ULL); for (size_t i = 0; i < n; ++i) { @@ -288,8 +288,8 @@ int main(int argc, char ** argv) { Stopwatch watch; - GOOGLE_NAMESPACE::sparse_hash_map> map; - GOOGLE_NAMESPACE::sparse_hash_map>::iterator it; + ::google::sparse_hash_map> map; + ::google::sparse_hash_map>::iterator it; for (size_t i = 0; i < n; ++i) { map.insert(std::make_pair(data[i], value)); diff --git a/dbms/src/Storages/IStorage.cpp b/dbms/src/Storages/IStorage.cpp index cbd14666006..f614ff8dc50 100644 --- a/dbms/src/Storages/IStorage.cpp +++ b/dbms/src/Storages/IStorage.cpp @@ -120,8 +120,8 @@ Block IStorage::getSampleBlockForColumns(const Names & column_names) const namespace { - using NamesAndTypesMap = GOOGLE_NAMESPACE::dense_hash_map; - using UniqueStrings = GOOGLE_NAMESPACE::dense_hash_set; + using NamesAndTypesMap = ::google::dense_hash_map; + using UniqueStrings = ::google::dense_hash_set; String listOfColumns(const NamesAndTypesList & available_columns) { diff --git a/docker/packager/deb/Dockerfile b/docker/packager/deb/Dockerfile index a48fdbd8895..381ed3c9a7c 100644 --- a/docker/packager/deb/Dockerfile +++ b/docker/packager/deb/Dockerfile @@ -62,11 +62,9 @@ RUN apt-get --allow-unauthenticated update -y \ zlib1g-dev \ liblz4-dev \ libdouble-conversion-dev \ - libsparsehash-dev \ librdkafka-dev \ libpoconetssl50 \ libpoco-dev \ - libsparsehash-dev \ libgoogle-perftools-dev \ libzstd-dev \ libltdl-dev \ diff --git a/utils/build/build_debian.sh b/utils/build/build_debian.sh index 56dd3f2234d..0e69cd0347e 100755 --- a/utils/build/build_debian.sh +++ b/utils/build/build_debian.sh @@ -8,7 +8,7 @@ # install compiler and libs sudo apt install -y git bash cmake ninja-build gcc-7 g++-7 libicu-dev libreadline-dev gperf # for -DUNBUNDLED=1 mode: -#sudo apt install -y libboost-program-options-dev libboost-system-dev libboost-filesystem-dev libboost-thread-dev zlib1g-dev liblz4-dev libdouble-conversion-dev libzstd-dev libre2-dev libsparsehash-dev librdkafka-dev libcapnp-dev libpoco-dev libsparsehash-dev libgoogle-perftools-dev libunwind-dev googletest libcctz-dev +#sudo apt install -y libboost-program-options-dev libboost-system-dev libboost-filesystem-dev libboost-thread-dev zlib1g-dev liblz4-dev libdouble-conversion-dev libzstd-dev libre2-dev librdkafka-dev libcapnp-dev libpoco-dev libgoogle-perftools-dev libunwind-dev googletest libcctz-dev # install testing only stuff if you want: sudo apt install -y expect python python-lxml python-termcolor python-requests curl perl sudo openssl netcat-openbsd telnet From 460ccb04a29863ca2df258e40f473a35715ebd53 Mon Sep 17 00:00:00 2001 From: Zhichang Yu Date: Wed, 18 Sep 2019 16:30:18 +0800 Subject: [PATCH 069/102] added bitmapMin and bitmapMax --- .../AggregateFunctionGroupBitmapData.h | 42 +++++++++++++++ dbms/src/Functions/FunctionsBitmap.cpp | 2 + dbms/src/Functions/FunctionsBitmap.h | 53 ++++++++++++++++--- 
 .../00829_bitmap_function.reference           | 10 ++++
 .../0_stateless/00829_bitmap_function.sql     | 24 +++++++++
 .../functions/bitmap_functions.md             | 50 +++++++++++++++++
 .../functions/bitmap_functions.md             | 48 +++++++++++++++++
 7 files changed, 223 insertions(+), 6 deletions(-)

diff --git a/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h b/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h
index c85a74af1ce..6243e60c9c5 100644
--- a/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h
+++ b/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h
@@ -495,6 +495,48 @@ public:
         return count;
     }

+    UInt64 rb_min() const
+    {
+        UInt64 min_val = UINT32_MAX;
+        if (isSmall())
+        {
+            for (const auto & x : small)
+            {
+                T val = x.getValue();
+                if ((UInt64)val < min_val)
+                {
+                    min_val = (UInt64)val;
+                }
+            }
+        }
+        else
+        {
+            min_val = (UInt64)roaring_bitmap_minimum(rb);
+        }
+        return min_val;
+    }
+
+    UInt64 rb_max() const
+    {
+        UInt64 max_val = 0;
+        if (isSmall())
+        {
+            for (const auto & x : small)
+            {
+                T val = x.getValue();
+                if ((UInt64)val > max_val)
+                {
+                    max_val = (UInt64)val;
+                }
+            }
+        }
+        else
+        {
+            max_val = (UInt64)roaring_bitmap_maximum(rb);
+        }
+        return max_val;
+    }
+
 private:
     /// To read and write the DB Buffer directly, migrate code from CRoaring
     void db_roaring_bitmap_add_many(DB::ReadBuffer & dbBuf, roaring_bitmap_t * r, size_t n_args)
diff --git a/dbms/src/Functions/FunctionsBitmap.cpp b/dbms/src/Functions/FunctionsBitmap.cpp
index b2bb1e4f25a..b24e9cdbd9f 100644
--- a/dbms/src/Functions/FunctionsBitmap.cpp
+++ b/dbms/src/Functions/FunctionsBitmap.cpp
@@ -12,6 +12,8 @@ void registerFunctionsBitmap(FunctionFactory & factory)
     factory.registerFunction();
     factory.registerFunction();
+    factory.registerFunction<FunctionBitmapMin>();
+    factory.registerFunction<FunctionBitmapMax>();
     factory.registerFunction();
     factory.registerFunction();
     factory.registerFunction();
diff --git a/dbms/src/Functions/FunctionsBitmap.h b/dbms/src/Functions/FunctionsBitmap.h
index 86aa022cdb5..ad4f16b16ef 100644
--- a/dbms/src/Functions/FunctionsBitmap.h
+++ b/dbms/src/Functions/FunctionsBitmap.h
@@ -49,6 +49,12 @@ namespace ErrorCodes
  * Return bitmap cardinality:
  * bitmapCardinality: bitmap -> integer
  *
+ * Return the smallest value in the set:
+ * bitmapMin: bitmap -> integer
+ *
+ * Return the greatest value in the set:
+ * bitmapMax: bitmap -> integer
+ *
  * Two bitmap and calculation, return cardinality:
  * bitmapAndCardinality: bitmap,bitmap -> integer
  *
@@ -357,13 +363,13 @@ private:
     }
 };

-template <typename Name>
+template <typename Impl>
 class FunctionBitmapSelfCardinalityImpl : public IFunction
 {
 public:
-    static constexpr auto name = Name::name;
+    static constexpr auto name = Impl::name;

-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionBitmapSelfCardinalityImpl>(); }
+    static FunctionPtr create(const Context &) { return std::make_shared<FunctionBitmapSelfCardinalityImpl<Impl>>(); }

     String getName() const override { return name; }

@@ -417,13 +423,46 @@ private:
             = typeid_cast<const ColumnAggregateFunction *>(block.getByPosition(arguments[0]).column.get());
         for (size_t i = 0; i < input_rows_count; ++i)
         {
-            const AggregateFunctionGroupBitmapData<T> & bd1
-                = *reinterpret_cast<const AggregateFunctionGroupBitmapData<T> *>(column->getData()[i]);
-            vec_to[i] = bd1.rbs.size();
+            const AggregateFunctionGroupBitmapData<T> & bd
+                = *reinterpret_cast<const AggregateFunctionGroupBitmapData<T> *>(column->getData()[i]);
+            vec_to[i] = Impl::apply(bd);
         }
     }
 };

+struct BitmapCardinalityImpl
+{
+public:
+    static constexpr auto name = "bitmapCardinality";
+    template <typename T>
+    static UInt64 apply(const AggregateFunctionGroupBitmapData<T> & bd)
+    {
+        return bd.rbs.size();
+    }
+};
+
+struct BitmapMinImpl
+{
+public:
+    static constexpr auto name = "bitmapMin";
+    template <typename T>
+    static UInt64 apply(const AggregateFunctionGroupBitmapData<T> & bd)
+    {
+        return bd.rbs.rb_min();
+    }
+};
+
+struct BitmapMaxImpl
+{
+public:
+    static constexpr auto name = "bitmapMax";
+    template <typename T>
+    static UInt64 apply(const AggregateFunctionGroupBitmapData<T> & bd)
+    {
+        return bd.rbs.rb_max();
+    }
+};
+
 template <typename T>
 struct BitmapAndCardinalityImpl
 {
@@ -840,7 +879,9 @@ struct NameBitmapHasAny
     static constexpr auto name = "bitmapHasAny";
 };

-using FunctionBitmapSelfCardinality = FunctionBitmapSelfCardinalityImpl<NameBitmapCardinality>;
+using FunctionBitmapSelfCardinality = FunctionBitmapSelfCardinalityImpl<BitmapCardinalityImpl>;
+using FunctionBitmapMin = FunctionBitmapSelfCardinalityImpl<BitmapMinImpl>;
+using FunctionBitmapMax = FunctionBitmapSelfCardinalityImpl<BitmapMaxImpl>;
 using FunctionBitmapAndCardinality = FunctionBitmapCardinality;
 using FunctionBitmapOrCardinality = FunctionBitmapCardinality;
 using FunctionBitmapXorCardinality = FunctionBitmapCardinality;
diff --git a/dbms/tests/queries/0_stateless/00829_bitmap_function.reference b/dbms/tests/queries/0_stateless/00829_bitmap_function.reference
index da1206bab12..3edcd0e1214 100644
--- a/dbms/tests/queries/0_stateless/00829_bitmap_function.reference
+++ b/dbms/tests/queries/0_stateless/00829_bitmap_function.reference
@@ -67,3 +67,13 @@
 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33]
 [30,31,32,33,100]
 [100]
+4294967295
+4294967295
+4294967295
+1
+0
+0
+0
+0
+9
+500
diff --git a/dbms/tests/queries/0_stateless/00829_bitmap_function.sql b/dbms/tests/queries/0_stateless/00829_bitmap_function.sql
index bbe3e8f80fb..82e1030c036 100644
--- a/dbms/tests/queries/0_stateless/00829_bitmap_function.sql
+++ b/dbms/tests/queries/0_stateless/00829_bitmap_function.sql
@@ -211,3 +211,27 @@ select bitmapToArray(bitmapSubsetInRange(bitmapBuild([
 select bitmapToArray(bitmapSubsetInRange(bitmapBuild([
     0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,
     100,200,500]), toUInt32(100), toUInt32(200)));
+
+-- bitmapMin:
+---- Empty
+SELECT bitmapMin(bitmapBuild(emptyArrayUInt8()));
+SELECT bitmapMin(bitmapBuild(emptyArrayUInt16()));
+SELECT bitmapMin(bitmapBuild(emptyArrayUInt32()));
+---- Small
+select bitmapMin(bitmapBuild([1,5,7,9]));
+---- Large
+select bitmapMin(bitmapBuild([
+    0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,
+    100,200,500]));
+
+-- bitmapMax:
+---- Empty
+SELECT bitmapMax(bitmapBuild(emptyArrayUInt8()));
+SELECT bitmapMax(bitmapBuild(emptyArrayUInt16()));
+SELECT bitmapMax(bitmapBuild(emptyArrayUInt32()));
+---- Small
+select bitmapMax(bitmapBuild([1,5,7,9]));
+---- Large
+select bitmapMax(bitmapBuild([
+    0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,
+    100,200,500]));
diff --git a/docs/en/query_language/functions/bitmap_functions.md b/docs/en/query_language/functions/bitmap_functions.md
index b0d21500035..fdc2e8a7a0d 100644
--- a/docs/en/query_language/functions/bitmap_functions.md
+++ b/docs/en/query_language/functions/bitmap_functions.md
@@ -292,6 +292,56 @@ SELECT bitmapCardinality(bitmapBuild([1, 2, 3, 4, 5])) AS res
 └─────┘
 ```

+## bitmapMin
+
+Return the smallest value of type UInt64 in the set, or UINT32_MAX if the set is empty.
+
+
+```
+bitmapMin(bitmap)
+```
+
+**Parameters**
+
+- `bitmap` – bitmap object.
+
+**Example**
+
+``` sql
+SELECT bitmapMin(bitmapBuild([1, 2, 3, 4, 5])) AS res
+```
+
+```
+┌─res─┐
+│ 1 │
+└─────┘
+```
+
+## bitmapMax
+
+Return the greatest value of type UInt64 in the set, or 0 if the set is empty.
+ + +``` +bitmapMax(bitmap) +``` + +**Parameters** + +- `bitmap` – bitmap object. + +**Example** + +``` sql +SELECT bitmapMax(bitmapBuild([1, 2, 3, 4, 5])) AS res +``` + +``` +┌─res─┐ +│ 5 │ +└─────┘ +``` + ## bitmapAndCardinality Two bitmap and calculation, return cardinality of type UInt64. diff --git a/docs/zh/query_language/functions/bitmap_functions.md b/docs/zh/query_language/functions/bitmap_functions.md index 97be4f38853..b727a4aba16 100644 --- a/docs/zh/query_language/functions/bitmap_functions.md +++ b/docs/zh/query_language/functions/bitmap_functions.md @@ -276,6 +276,54 @@ SELECT bitmapCardinality(bitmapBuild([1, 2, 3, 4, 5])) AS res └─────┘ ``` +## bitmapMin + +返回一个UInt64类型的数值,表示位图中的最小值。如果位图为空则返回UINT32_MAX。 + +``` +bitmapMin(bitmap) +``` + +**Parameters** + +- `bitmap` – 位图对象。 + +**示例** + +``` sql +SELECT bitmapMin(bitmapBuild([1, 2, 3, 4, 5])) AS res +``` + +``` +┌─res─┐ +│ 1 │ +└─────┘ +``` + +## bitmapMax + +返回一个UInt64类型的数值,表示位图中的最大值。如果位图为空则返回0。 + +``` +bitmapMax(bitmap) +``` + +**Parameters** + +- `bitmap` – 位图对象。 + +**示例** + +``` sql +SELECT bitmapMax(bitmapBuild([1, 2, 3, 4, 5])) AS res +``` + +``` +┌─res─┐ +│ 5 │ +└─────┘ +``` + ## bitmapAndCardinality 为两个位图对象进行与操作,返回结果位图的基数。 From d98fe1d81e2ffc05b08bfce24c9902dd9ee45596 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 18 Sep 2019 13:55:53 +0300 Subject: [PATCH 070/102] Update CHANGELOG.md --- CHANGELOG.md | 287 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 287 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a07fa7507b1..a229b3c9639 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,290 @@ +## ClickHouse release 19.14.3.3, 2019-09-10 + +### New Feature +* Support for wildcards in paths of table functions `file` and `hdfs`. If the path contains wildcards, the table will be readonly. Example of usage: `select * from hdfs('hdfs://hdfs1:9000/some_dir/another_dir/*/file{0..9}{0..9}')` and `select * from file('some_dir/{some_file,another_file,yet_another}.tsv', 'TSV', 'value UInt32')`. [#6092](https://github.com/yandex/ClickHouse/pull/6092) ([Olga Khvostikova](https://github.com/stavrolia)) +* WITH TIES modifier for LIMIT and WITH FILL modifier for ORDER BY. (continuation of [#5069](https://github.com/yandex/ClickHouse/issues/5069)) [#6610](https://github.com/yandex/ClickHouse/pull/6610) ([Anton Popov](https://github.com/CurtizJ)) +* Implement support for INSERT-query with Kafka tables. [#6012](https://github.com/yandex/ClickHouse/pull/6012) ([Ivan](https://github.com/abyss7)) +* Allow to write ClickHouse text logs to `system.text_log` table. [#6037](https://github.com/yandex/ClickHouse/issues/6037) [#6103](https://github.com/yandex/ClickHouse/pull/6103) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) [#6164](https://github.com/yandex/ClickHouse/pull/6164) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Ability to read from VALUES list proposed in [#5984](https://github.com/yandex/ClickHouse/issues/5984). Example: `SELECT * FROM VALUES ('a UInt64, s String', (1, 'one'), (2, 'two'), (3, 'three'))`. Fixed error while parsing of columns list from string if type contained a comma [#6217](https://github.com/yandex/ClickHouse/issues/6217). [#6209](https://github.com/yandex/ClickHouse/pull/6209) ([dimarub2000](https://github.com/dimarub2000)) +* Added an ability to alter storage settings. Syntax: `ALTER TABLE MODIFY SETTING = `. 
[#6366](https://github.com/yandex/ClickHouse/pull/6366) [#6669](https://github.com/yandex/ClickHouse/pull/6669) [#6685](https://github.com/yandex/ClickHouse/pull/6685) ([alesapin](https://github.com/alesapin)) +* Support for removing of detached parts. Syntax: `ALTER TABLE DROP DETACHED PART ''`. [#6158](https://github.com/yandex/ClickHouse/pull/6158) ([tavplubix](https://github.com/tavplubix)) +* Table constraints. Allows to add constraint to table definition which will be checked at insert. [#5273](https://github.com/yandex/ClickHouse/pull/5273) ([Gleb Novikov](https://github.com/NanoBjorn)) [#6652](https://github.com/yandex/ClickHouse/pull/6652) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Allow to insert into recursive materialized view. [#6324](https://github.com/yandex/ClickHouse/pull/6324) ([Amos Bird](https://github.com/amosbird)) +* New `system.metric_log` table which stores values of ProfileEvents and CurrentMetrics within specified interval. [#6363](https://github.com/yandex/ClickHouse/issues/6363) [#6467](https://github.com/yandex/ClickHouse/pull/6467) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) +* Turn on query profiler by default to sample every query execution thread once a second. [#6283](https://github.com/yandex/ClickHouse/pull/6283) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Input format ORC. [#6454](https://github.com/yandex/ClickHouse/pull/6454) ([akonyaev90](https://github.com/akonyaev90)) +* Using `FastOps` library for functions `exp`, `log`, `sigmoid`, `tanh`. Added two new functions: `sigmoid` and `tanh`. FastOps is a fast vector math library from Michael Parakhin (Yandex CTO). Improved performance of `exp` and `log` functions more than 6 times. The functions `exp` and `log` from `Float32` argument will return `Float32` (in previous versions they always return `Float64`). Now `exp(nan)` may return `inf`. The result of `exp` and `log` functions may be not the nearest machine representable number to the true answer. [#6254](https://github.com/yandex/ClickHouse/pull/6254) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Function `hasToken(haystack, token)`, `hasTokenCaseInsensitive(haystack, token)` to check if given token is in haystack. Token is a maximal length substring between two non alphanumeric ASCII characters (or boundaries of haystack). Token must be a constant string. Supported by tokenbf_v1 index specialization. [#6596](https://github.com/yandex/ClickHouse/pull/6596), [#6662](https://github.com/yandex/ClickHouse/pull/6662) ([Vasily Nemkov](https://github.com/Enmk)) +* New function `neighbor(value, offset[, default_value])`. Allows to reach prev/next value within column. [#5925](https://github.com/yandex/ClickHouse/pull/5925) ([Alex Krash](https://github.com/alex-krash)) [6685365ab8c5b74f9650492c88a012596eb1b0c6](https://github.com/yandex/ClickHouse/commit/6685365ab8c5b74f9650492c88a012596eb1b0c6) [341e2e4587a18065c2da1ca888c73389f48ce36c](https://github.com/yandex/ClickHouse/commit/341e2e4587a18065c2da1ca888c73389f48ce36c) [Alexey Milovidov](https://github.com/alexey-milovidov) +* Created a function `currentUser()`, returning login of authorized user. Added alias `user()` for compatibility with MySQL. [#6470](https://github.com/yandex/ClickHouse/pull/6470) ([Alex Krash](https://github.com/alex-krash)) +* New aggregate functions `quantilesExactInclusive` and `quantilesExactExclusive` which were proposed in [#5885](https://github.com/yandex/ClickHouse/issues/5885). 
[#6477](https://github.com/yandex/ClickHouse/pull/6477) ([dimarub2000](https://github.com/dimarub2000)) +* Function `bitmapRange(bitmap, range_begin, range_end)` which returns new set with specified range (not include the `range_end`). [#6314](https://github.com/yandex/ClickHouse/pull/6314) ([Zhichang Yu](https://github.com/yuzhichang)) +* Function `geohashesInBox(longitude_min, latitude_min, longitude_max, latitude_max, precision)` which creates array of precision-long strings of geohash-boxes covering provided area. [#6127](https://github.com/yandex/ClickHouse/pull/6127) ([Vasily Nemkov](https://github.com/Enmk)) +* Added table function `numbers_mt`, which is multithreaded version of `numbers`. Updated performance tests with hash functions. [#6554](https://github.com/yandex/ClickHouse/pull/6554) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Comparison mode in `clickhouse-benchmark` [#6220](https://github.com/yandex/ClickHouse/issues/6220) [#6343](https://github.com/yandex/ClickHouse/pull/6343) ([dimarub2000](https://github.com/dimarub2000)) +* Possibility to change the location of ClickHouse history file for client using `CLICKHOUSE_HISTORY_FILE` env. [#6840](https://github.com/yandex/ClickHouse/pull/6840) ([filimonov](https://github.com/filimonov)) +* Added support for `_partition` and `_timestamp` virtual columns to Kafka engine. [#6400](https://github.com/yandex/ClickHouse/pull/6400) ([Ivan](https://github.com/abyss7)) +* Possibility to remove sensitive data from `query_log`, server logs, process list with regexp-based rules. [#5710](https://github.com/yandex/ClickHouse/pull/5710) ([filimonov](https://github.com/filimonov)) +* Throw an exception if `config.d` file doesn't have the corresponding root element as the config file. [#6123](https://github.com/yandex/ClickHouse/pull/6123) ([dimarub2000](https://github.com/dimarub2000)) +* Print extra info in exception message for `no space left on device`. [#6182](https://github.com/yandex/ClickHouse/issues/6182), [#6252](https://github.com/yandex/ClickHouse/issues/6252) [#6352](https://github.com/yandex/ClickHouse/pull/6352) ([tavplubix](https://github.com/tavplubix)) + +### Experimental features +* Input and output data format `Template`. Template format allows specify custom format string for input and output. [#4354](https://github.com/yandex/ClickHouse/issues/4354) [#6727](https://github.com/yandex/ClickHouse/pull/6727) ([tavplubix](https://github.com/tavplubix)) +* Implementation of LIVE VIEW tables that were originally proposed in [#3925](https://github.com/yandex/ClickHouse/issues/3925), and then updated in [#5541](https://github.com/yandex/ClickHouse/issues/5541). Note that currently only LIVE VIEW tables are supported. See [#5541](https://github.com/yandex/ClickHouse/issues/5541) for detailed description. [#5541](https://github.com/yandex/ClickHouse/issues/5541) ([vzakaznikov](https://github.com/vzakaznikov)) [#6425](https://github.com/yandex/ClickHouse/pull/6425) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) + +### Bug Fix +* Fix segmentation fault when the table has skip indices and vertical merge happens. [#6723](https://github.com/yandex/ClickHouse/pull/6723) ([alesapin](https://github.com/alesapin)) +* Fix `Key expression contains comparison between inconvertible types` exception in `bitmapContains` function. 
[#6136](https://github.com/yandex/ClickHouse/issues/6136) [#6146](https://github.com/yandex/ClickHouse/issues/6146) [#6156](https://github.com/yandex/ClickHouse/pull/6156) ([dimarub2000](https://github.com/dimarub2000)) +* Fix column TTL with user defaults. Previously in case of force TTL merge with `OPTIMIZE ... FINAL` query, expired values was replaced by type defaults instead of user defaults. [#6796](https://github.com/yandex/ClickHouse/pull/6796) ([Anton Popov](https://github.com/CurtizJ)) +* Fix Kafka messages duplication problem on normal server restart. [#6597](https://github.com/yandex/ClickHouse/pull/6597) ([Ivan](https://github.com/abyss7)) +* Fix segfault with enabled `optimize_skip_unused_shards` and missing sharding key. [#6384](https://github.com/yandex/ClickHouse/pull/6384) ([Anton Popov](https://github.com/CurtizJ)) +* Fix bug introduced in query profiler which leads to endless recv from socket. [#6386](https://github.com/yandex/ClickHouse/pull/6386) ([alesapin](https://github.com/alesapin)) +* Removed extra verbose logging from MySQL handler [#6389](https://github.com/yandex/ClickHouse/pull/6389) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Return ability to parse boolean settings from 'true' and 'false'. [#6278](https://github.com/yandex/ClickHouse/pull/6278) ([alesapin](https://github.com/alesapin)) +* Fix crash in `median` function over `Nullable(Decimal128)`. [#6378](https://github.com/yandex/ClickHouse/pull/6378) ([Artem Zuikov](https://github.com/4ertus2)) +* Fixed possible incomplete result returned by `SELECT` query with `WHERE` condition on primary key contained conversion to float type. It was caused by incorrect checking of monotonicity in `toFloat` function. [#6248](https://github.com/yandex/ClickHouse/issues/6248) [#6374](https://github.com/yandex/ClickHouse/pull/6374) ([dimarub2000](https://github.com/dimarub2000)) +* Fixed the possibility of a fabricated query to cause server crash due to stack overflow in SQL parser. Fixed the possibility of stack overflow in Merge and Distributed tables, materialized views and conditions for row-level security that involve subqueries. [#6433](https://github.com/yandex/ClickHouse/pull/6433) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Check `max_expanded_ast_elements` setting for mutations. Clear mutations after `TRUNCATE TABLE`. [#6205](https://github.com/yandex/ClickHouse/pull/6205) ([Winter Zhang](https://github.com/zhang2014)) +* Fix excessive CPU usage while executing `JSONExtractRaw` function over a boolean value. [#6208](https://github.com/yandex/ClickHouse/pull/6208) ([Vitaly Baranov](https://github.com/vitlibar)) +* Fix kafka tests. [#6805](https://github.com/yandex/ClickHouse/pull/6805) ([Ivan](https://github.com/abyss7)) +* Fixed an issue when long `ALTER UPDATE` or `ALTER DELETE` may prevent regular merges to run. Prevent mutations from executing if there is no enough free threads available. [#6502](https://github.com/yandex/ClickHouse/issues/6502) [#6617](https://github.com/yandex/ClickHouse/pull/6617) ([tavplubix](https://github.com/tavplubix)) +* Fix JOIN results for key columns when used with `join_use_nulls`. Attach Nulls instead of columns defaults. [#6249](https://github.com/yandex/ClickHouse/pull/6249) ([Artem Zuikov](https://github.com/4ertus2)) +* Fix `JSONExtract` function while extracting a `Tuple` from JSON. 
[#6718](https://github.com/yandex/ClickHouse/pull/6718) ([Vitaly Baranov](https://github.com/vitlibar)) +* Fix for data race in StorageMerge [#6717](https://github.com/yandex/ClickHouse/pull/6717) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix for skip indices with vertical merge and alter. Fix for `Bad size of marks file` exception. [#6594](https://github.com/yandex/ClickHouse/issues/6594) [#6713](https://github.com/yandex/ClickHouse/pull/6713) ([alesapin](https://github.com/alesapin)) +* Fix rare crash in `ALTER MODIFY COLUMN` and vertical merge when one of merged/altered parts is empty (0 rows) [#6746](https://github.com/yandex/ClickHouse/issues/6746) [#6780](https://github.com/yandex/ClickHouse/pull/6780) ([alesapin](https://github.com/alesapin)) +* Fixed wrong behaviour of `nullIf` function for constant arguments. [#6518](https://github.com/yandex/ClickHouse/pull/6518) ([Guillaume Tassery](https://github.com/YiuRULE)) [#6580](https://github.com/yandex/ClickHouse/pull/6580) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed bug in conversion of `LowCardinality` types in `AggregateFunctionFactory`. This fixes [#6257](https://github.com/yandex/ClickHouse/issues/6257). [#6281](https://github.com/yandex/ClickHouse/pull/6281) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Fixed possible data loss after `ALTER DELETE` query on table with skipping index. [#6224](https://github.com/yandex/ClickHouse/issues/6224) [#6282](https://github.com/yandex/ClickHouse/pull/6282) ([Nikita Vasilev](https://github.com/nikvas0)) +* Do not expose virtual columns in `system.columns` table. This is required for backward compatibility. [#6406](https://github.com/yandex/ClickHouse/pull/6406) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix wrong behavior and possible segfaults in `topK` and `topKWeighted` aggregated functions. [#6404](https://github.com/yandex/ClickHouse/pull/6404) ([Anton Popov](https://github.com/CurtizJ)) +* Fixed unsafe code around `getIdentifier` function. [#6401](https://github.com/yandex/ClickHouse/issues/6401) [#6409](https://github.com/yandex/ClickHouse/pull/6409) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed bug in MySQL wire protocol (is used while connecting to ClickHouse form MySQL client). Caused by heap buffer overflow in `PacketPayloadWriteBuffer`. [#6212](https://github.com/yandex/ClickHouse/pull/6212) ([Yuriy Baranov](https://github.com/yurriy)) +* Fixed memory leak in `bitmapSubsetInRange` function. [#6819](https://github.com/yandex/ClickHouse/pull/6819) ([Zhichang Yu](https://github.com/yuzhichang)) +* Fix rare bug when mutation executed after granularity change. [#6816](https://github.com/yandex/ClickHouse/pull/6816) ([alesapin](https://github.com/alesapin)) +* Allow protobuf message with all fields by default. [#6132](https://github.com/yandex/ClickHouse/pull/6132) ([Vitaly Baranov](https://github.com/vitlibar)) +* Resolve a bug with `nullIf` function when we send a `NULL` argument on the second argument. [#6446](https://github.com/yandex/ClickHouse/pull/6446) ([Guillaume Tassery](https://github.com/YiuRULE)) +* Fix rare bug with wrong memory allocation/deallocation in complex key cache dictionaries with string fields which leads to infinite memory consumption (looks like memory leak). Bug reproduces when string size was a power of two starting from eight (8, 16, 32, etc). 
[#6447](https://github.com/yandex/ClickHouse/pull/6447) ([alesapin](https://github.com/alesapin)) +* Fixed Gorilla encoding on small sequences which caused exception `Cannot write after end of buffer`. [#6398](https://github.com/yandex/ClickHouse/issues/6398) [#6444](https://github.com/yandex/ClickHouse/pull/6444) ([Vasily Nemkov](https://github.com/Enmk)) +* Fixed error with processing "timezone" in server configuration file. [#6709](https://github.com/yandex/ClickHouse/pull/6709) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Allow using non-nullable types in JOINs with `join_use_nulls` enabled. [#6705](https://github.com/yandex/ClickHouse/pull/6705) ([Artem Zuikov](https://github.com/4ertus2)) +* Disable `Poco::AbstractConfiguration` substitutions in query in `clickhouse-client`. [#6706](https://github.com/yandex/ClickHouse/pull/6706) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed mismatched header in streams that happened when reading from an empty distributed table with sample and prewhere. [#6167](https://github.com/yandex/ClickHouse/issues/6167) ([Lixiang Qian](https://github.com/fancyqlx)) [#6823](https://github.com/yandex/ClickHouse/pull/6823) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Avoid deadlock in `REPLACE PARTITION`. [#6677](https://github.com/yandex/ClickHouse/pull/6677) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Query transformation for `MySQL`, `ODBC`, `JDBC` table functions now works properly for `SELECT WHERE` queries with multiple `AND` subqueries. [#6381](https://github.com/yandex/ClickHouse/issues/6381) [#6676](https://github.com/yandex/ClickHouse/pull/6676) ([dimarub2000](https://github.com/dimarub2000)) +* Fixed bug in function `arrayEnumerateUniqRanked`. [#6779](https://github.com/yandex/ClickHouse/pull/6779) ([proller](https://github.com/proller)) +* Fixed deserialization in `DataTypeAggregateFunction`. [#6575](https://github.com/yandex/ClickHouse/issues/6575) [#6773](https://github.com/yandex/ClickHouse/pull/6773) ([Zhichang Yu](https://github.com/yuzhichang)) +* Using `arrayReduce` for constant arguments may lead to segfault. [#6242](https://github.com/yandex/ClickHouse/issues/6242) [#6326](https://github.com/yandex/ClickHouse/pull/6326) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix inconsistent parts which can appear if a replica was restored after `DROP PARTITION`. [#6522](https://github.com/yandex/ClickHouse/issues/6522) [#6523](https://github.com/yandex/ClickHouse/pull/6523) ([tavplubix](https://github.com/tavplubix)) +* Fixed infinite loop when reading Kafka messages. Do not pause/resume consumer on subscription at all - otherwise it may get paused indefinitely in some scenarios. [#6354](https://github.com/yandex/ClickHouse/pull/6354) ([Ivan](https://github.com/abyss7)) +* Fix crash when casting types to Decimal that do not support it. Throw an exception instead. [#6297](https://github.com/yandex/ClickHouse/pull/6297) ([Artem Zuikov](https://github.com/4ertus2)) +* Security issue. If the attacker has write access to ZooKeeper and is able to run a custom server available from the network where ClickHouse runs, it can create a custom-built malicious server that will act as a ClickHouse replica and register it in ZooKeeper. When another replica fetches a data part from the malicious replica, it can force clickhouse-server to write to an arbitrary path on the filesystem. Found by Eldar Zaitov, information security team at Yandex.
[#6247](https://github.com/yandex/ClickHouse/pull/6247) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed hang in `JSONExtractRaw` function. [#6195](https://github.com/yandex/ClickHouse/issues/6195) [#6198](https://github.com/yandex/ClickHouse/pull/6198) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed crash when using `IN` clause with a subquery with a tuple. [#6125](https://github.com/yandex/ClickHouse/issues/6125) [#6550](https://github.com/yandex/ClickHouse/pull/6550) ([tavplubix](https://github.com/tavplubix)) +* Fixed the regression while pushing to materialized view. [#6415](https://github.com/yandex/ClickHouse/pull/6415) ([Ivan](https://github.com/abyss7)) +* Fix `CSV` parser. [#6426](https://github.com/yandex/ClickHouse/issues/6426) [#6559](https://github.com/yandex/ClickHouse/pull/6559) ([tavplubix](https://github.com/tavplubix)) +* Fixed possible inconsistent state of a table while executing a `DROP` query for a replicated table while ZooKeeper is not accessible. [#6045](https://github.com/yandex/ClickHouse/issues/6045) [#6413](https://github.com/yandex/ClickHouse/pull/6413) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) +* Fix bug with incorrect skip indices serialization and aggregation with adaptive granularity. [#6594](https://github.com/yandex/ClickHouse/issues/6594). [#6748](https://github.com/yandex/ClickHouse/pull/6748) ([alesapin](https://github.com/alesapin)) +* Fix `WITH ROLLUP` and `WITH CUBE` modifiers of `GROUP BY` with two-level aggregation. [#6225](https://github.com/yandex/ClickHouse/pull/6225) ([Anton Popov](https://github.com/CurtizJ)) +* Improve error handling in cache dictionaries. [#6737](https://github.com/yandex/ClickHouse/pull/6737) ([Vitaly Baranov](https://github.com/vitlibar)) +* Parquet: Fix reading boolean columns. [#6579](https://github.com/yandex/ClickHouse/pull/6579) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix bug with writing secondary indices marks with adaptive granularity. [#6126](https://github.com/yandex/ClickHouse/pull/6126) ([alesapin](https://github.com/alesapin)) +* Fix initialization order during server startup. Since `StorageMergeTree::background_task_handle` is initialized in `startup()`, `MergeTreeBlockOutputStream::write()` may try to use it before initialization. Just check if it is initialized. [#6080](https://github.com/yandex/ClickHouse/pull/6080) ([Ivan](https://github.com/abyss7)) +* Fixed crash in `extractAll()` function. [#6644](https://github.com/yandex/ClickHouse/pull/6644) ([Artem Zuikov](https://github.com/4ertus2)) +* Fixed wrong behaviour of the `trim` family of functions. [#6647](https://github.com/yandex/ClickHouse/pull/6647) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Clearing the data buffer from the previous read operation that was completed with an error. [#6026](https://github.com/yandex/ClickHouse/pull/6026) ([Nikolay](https://github.com/bopohaa)) +* Fix bug with enabling adaptive granularity when creating a new replica of a Replicated*MergeTree table. [#6394](https://github.com/yandex/ClickHouse/issues/6394) [#6452](https://github.com/yandex/ClickHouse/pull/6452) ([alesapin](https://github.com/alesapin)) +* Fixed possible crash during server startup in case an exception happened in `libunwind` during access to an uninitialised `ThreadStatus` structure. [#6456](https://github.com/yandex/ClickHouse/pull/6456) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) +* Fixed data race in `system.parts` table and `ALTER` query.
[#6245](https://github.com/yandex/ClickHouse/issues/6245). [#6513](https://github.com/yandex/ClickHouse/pull/6513) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix crash in `yandexConsistentHash` function. Found by fuzz test. [#6304](https://github.com/yandex/ClickHouse/issues/6304) [#6305](https://github.com/yandex/ClickHouse/pull/6305) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed the possibility of hanging queries when the server is overloaded and the global thread pool becomes nearly full. This has a higher chance of happening on clusters with a large number of shards (hundreds), because distributed queries allocate a thread per connection to each shard. For example, this issue may reproduce if a cluster of 330 shards is processing 30 concurrent distributed queries. This issue affects all versions starting from 19.2. [#6301](https://github.com/yandex/ClickHouse/pull/6301) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed logic of `arrayEnumerateUniqRanked` function. [#6423](https://github.com/yandex/ClickHouse/pull/6423) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix segfault when decoding symbol table. [#6603](https://github.com/yandex/ClickHouse/pull/6603) ([Amos Bird](https://github.com/amosbird)) +* Fixed mismatched header in streams that happened when reading from an empty distributed table with sample and prewhere. [#6167](https://github.com/yandex/ClickHouse/pull/6167) ([Lixiang Qian](https://github.com/fancyqlx)) +* Fixed irrelevant exception in cast of `LowCardinality(Nullable)` to a not-Nullable column in case it doesn't contain Nulls (e.g. in a query like `SELECT CAST(CAST('Hello' AS LowCardinality(Nullable(String))) AS String)`). [#6094](https://github.com/yandex/ClickHouse/issues/6094) [#6119](https://github.com/yandex/ClickHouse/pull/6119) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Removed extra quoting of description in `system.settings` table. [#6696](https://github.com/yandex/ClickHouse/issues/6696) [#6699](https://github.com/yandex/ClickHouse/pull/6699) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Avoid possible deadlock in `TRUNCATE` of Replicated table. [#6695](https://github.com/yandex/ClickHouse/pull/6695) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix case with same column names in `GLOBAL JOIN ON` section. [#6181](https://github.com/yandex/ClickHouse/pull/6181) ([Artem Zuikov](https://github.com/4ertus2)) +* Fix reading in order of sorting key. [#6189](https://github.com/yandex/ClickHouse/pull/6189) ([Anton Popov](https://github.com/CurtizJ)) +* Fix `ALTER TABLE ... UPDATE` query for tables with `enable_mixed_granularity_parts=1`. [#6543](https://github.com/yandex/ClickHouse/pull/6543) ([alesapin](https://github.com/alesapin)) +* Fixed the case when the server may close listening sockets but not shut down and continue serving remaining queries. You may end up with two running clickhouse-server processes. Sometimes, the server may return an error `bad_function_call` for remaining queries. [#6231](https://github.com/yandex/ClickHouse/pull/6231) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Table function `url` had a vulnerability that allowed the attacker to inject arbitrary HTTP headers in the request. This issue was found by [Nikita Tikhomirov](https://github.com/NSTikhomirov).
[#6466](https://github.com/yandex/ClickHouse/pull/6466) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix bug opened by [#4405](https://github.com/yandex/ClickHouse/pull/4405) (since 19.4.0). Reproduces in queries to Distributed tables over MergeTree tables when we don't query any columns (`SELECT 1`). [#6236](https://github.com/yandex/ClickHouse/pull/6236) ([alesapin](https://github.com/alesapin)) +* Fixed overflow in integer division of a signed type by an unsigned type. The behaviour was exactly as in the C or C++ language (integer promotion rules), which may be surprising. Please note that the overflow is still possible when dividing a large signed number by a large unsigned number or vice versa (but that case is less usual). The issue existed in all server versions. [#6214](https://github.com/yandex/ClickHouse/issues/6214) [#6233](https://github.com/yandex/ClickHouse/pull/6233) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Limit maximum sleep time for throttling when `max_execution_speed` or `max_execution_speed_bytes` is set. Fixed false errors like `Estimated query execution time (inf seconds) is too long`. [#5547](https://github.com/yandex/ClickHouse/issues/5547) [#6232](https://github.com/yandex/ClickHouse/pull/6232) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix TSan crash in the `LiveView` no-users thread. [#6656](https://github.com/yandex/ClickHouse/pull/6656) ([vzakaznikov](https://github.com/vzakaznikov)) +* Fix useless `AST` check in Set index. [#6510](https://github.com/yandex/ClickHouse/issues/6510) [#6651](https://github.com/yandex/ClickHouse/pull/6651) ([Nikita Vasilev](https://github.com/nikvas0)) +* Fixed issues with using `MATERIALIZED` columns and aliases in `MaterializedView`. [#448](https://github.com/yandex/ClickHouse/issues/448) [#3484](https://github.com/yandex/ClickHouse/issues/3484) [#3450](https://github.com/yandex/ClickHouse/issues/3450) [#2878](https://github.com/yandex/ClickHouse/issues/2878) [#2285](https://github.com/yandex/ClickHouse/issues/2285) [#3796](https://github.com/yandex/ClickHouse/pull/3796) ([Amos Bird](https://github.com/amosbird)) [#6316](https://github.com/yandex/ClickHouse/pull/6316) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix `FormatFactory` behaviour for input streams which are not implemented as a processor. [#6495](https://github.com/yandex/ClickHouse/pull/6495) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Fixed typo. [#6631](https://github.com/yandex/ClickHouse/pull/6631) ([Alex Ryndin](https://github.com/alexryndin)) +* Fixed a typo in the error message (is -> are). [#6839](https://github.com/yandex/ClickHouse/pull/6839) ([Denis Zhuravlev](https://github.com/den-crane)) + +### Security Fix +* Fix two vulnerabilities in Codecs in decompression phase. [#6670](https://github.com/yandex/ClickHouse/pull/6670) ([Artem Zuikov](https://github.com/4ertus2)) + +### Improvement +* Now values and rows with expired TTL will be removed after an `OPTIMIZE ... FINAL` query from old parts without TTL infos or with outdated TTL infos, e.g. after an `ALTER ... MODIFY TTL` query. Added queries `SYSTEM STOP/START TTL MERGES` to disallow/allow assigning merges with TTL and filter expired values in all merges (see the example below). [#6274](https://github.com/yandex/ClickHouse/pull/6274) ([Anton Popov](https://github.com/CurtizJ)) +* Remove `dry_run` flag from `InterpreterSelectQuery`. ... [#6375](https://github.com/yandex/ClickHouse/pull/6375) ([Nikolai Kochetov](https://github.com/KochetovNicolai))
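As a hedged illustration of the TTL merges entry above: a minimal sketch with hypothetical table and column names; only `SYSTEM STOP/START TTL MERGES` and `OPTIMIZE ... FINAL` are taken from the entry itself.

```sql
-- Hypothetical table with a column TTL (names are illustrative only).
CREATE TABLE ttl_example (d DateTime, v UInt32 TTL d + INTERVAL 1 MONTH)
ENGINE = MergeTree ORDER BY d;

-- Temporarily disallow assigning merges that filter expired values, then allow them again.
SYSTEM STOP TTL MERGES;
SYSTEM START TTL MERGES;

-- Force a merge that removes expired values, e.g. after ALTER ... MODIFY TTL.
OPTIMIZE TABLE ttl_example FINAL;
```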
+* Support `ASOF JOIN` with `ON` section. [#6211](https://github.com/yandex/ClickHouse/pull/6211) ([Artem Zuikov](https://github.com/4ertus2)) +* Better support of skip indexes for mutations and replication. Support for `MATERIALIZE/CLEAR INDEX ... IN PARTITION` query. `UPDATE x = x` recalculates all indices that use column `x`. [#5053](https://github.com/yandex/ClickHouse/pull/5053) ([Nikita Vasilev](https://github.com/nikvas0)) +* Allow to `ATTACH` live views (for example, at the server startup) regardless of the `allow_experimental_live_view` setting. [#6754](https://github.com/yandex/ClickHouse/pull/6754) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* For stack traces gathered by the query profiler, do not include stack frames generated by the query profiler itself. [#6250](https://github.com/yandex/ClickHouse/pull/6250) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Now table functions `values`, `file`, `url`, `hdfs` have support for ALIAS columns. [#6255](https://github.com/yandex/ClickHouse/pull/6255) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Consider unquoted `NULL` literal as `\N` (if setting `format_csv_unquoted_null_literal_as_null=1`). Initialize null fields with default values if the data type of this field is not nullable (if setting `input_format_null_as_default=1`). [#5990](https://github.com/yandex/ClickHouse/issues/5990) [#6055](https://github.com/yandex/ClickHouse/pull/6055) ([tavplubix](https://github.com/tavplubix)) +* When determining shards of a `Distributed` table to be covered by a read query (for `optimize_skip_unused_shards` = 1) ClickHouse now checks conditions from both `prewhere` and `where` clauses of the select statement. [#6521](https://github.com/yandex/ClickHouse/pull/6521) ([Alexander Kazakov](https://github.com/Akazz)) +* Enabled `SIMDJSON` for machines without AVX2 but with SSE 4.2 and PCLMUL instruction set. [#6285](https://github.com/yandex/ClickHouse/issues/6285) [#6320](https://github.com/yandex/ClickHouse/pull/6320) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* ClickHouse can work on filesystems without `O_DIRECT` support (such as ZFS and BtrFS) without additional tuning. [#4449](https://github.com/yandex/ClickHouse/issues/4449) [#6730](https://github.com/yandex/ClickHouse/pull/6730) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Support push-down predicate for the final subquery. [#6120](https://github.com/yandex/ClickHouse/pull/6120) ([TCeason](https://github.com/TCeason)) [#6162](https://github.com/yandex/ClickHouse/pull/6162) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Better `JOIN ON` keys extraction. [#6131](https://github.com/yandex/ClickHouse/pull/6131) ([Artem Zuikov](https://github.com/4ertus2)) +* Updated `SIMDJSON`. [#6285](https://github.com/yandex/ClickHouse/issues/6285). [#6306](https://github.com/yandex/ClickHouse/pull/6306) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Optimize selecting the smallest column for a `SELECT count()` query. [#6344](https://github.com/yandex/ClickHouse/pull/6344) ([Amos Bird](https://github.com/amosbird)) +* Added `strict` parameter to `windowFunnel()`. When `strict` is set, `windowFunnel()` applies conditions only to unique values (see the example below). [#6548](https://github.com/yandex/ClickHouse/pull/6548) ([achimbab](https://github.com/achimbab)) +* Safer interface of `mysqlxx::Pool`. [#6150](https://github.com/yandex/ClickHouse/pull/6150) ([avasiliev](https://github.com/avasiliev))
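A hedged sketch of the `strict` mode of `windowFunnel()` from the entry above. The events table, columns and data are hypothetical, and the mode is assumed to be passed as the string `'strict'` in the parameter list:

```sql
-- Count how many funnel steps each user completed within a 3600-second window.
-- With 'strict', repeated values for the same condition are not counted twice.
SELECT user_id,
       windowFunnel(3600, 'strict')(event_time, event = 'view', event = 'cart', event = 'purchase') AS level
FROM events
GROUP BY user_id;
```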
+* Options line size when executing with the `--help` option now corresponds to the terminal size. [#6590](https://github.com/yandex/ClickHouse/pull/6590) ([dimarub2000](https://github.com/dimarub2000)) +* Disable "read in order" optimization for aggregation without keys. [#6599](https://github.com/yandex/ClickHouse/pull/6599) ([Anton Popov](https://github.com/CurtizJ)) +* HTTP status code for `INCORRECT_DATA` and `TYPE_MISMATCH` error codes was changed from default `500 Internal Server Error` to `400 Bad Request`. [#6271](https://github.com/yandex/ClickHouse/pull/6271) ([Alexander Rodin](https://github.com/a-rodin)) +* Move Join object from `ExpressionAction` into `AnalyzedJoin`. `ExpressionAnalyzer` and `ExpressionAction` do not know about the `Join` class anymore. Its logic is hidden by the `AnalyzedJoin` interface. [#6801](https://github.com/yandex/ClickHouse/pull/6801) ([Artem Zuikov](https://github.com/4ertus2)) +* Fixed possible deadlock of distributed queries when one of the shards is localhost but the query is sent via a network connection. [#6759](https://github.com/yandex/ClickHouse/pull/6759) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Changed semantics of multiple tables `RENAME` to avoid possible deadlocks. [#6757](https://github.com/yandex/ClickHouse/issues/6757). [#6756](https://github.com/yandex/ClickHouse/pull/6756) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Rewrote the MySQL compatibility server to prevent loading the full packet payload in memory. Decreased memory consumption for each connection to approximately `2*DBMS_DEFAULT_BUFFER_SIZE` (read/write buffers). [#5811](https://github.com/yandex/ClickHouse/pull/5811) ([Yuriy Baranov](https://github.com/yurriy)) +* Move AST alias interpreting logic out of the parser, which doesn't have to know anything about query semantics. [#6108](https://github.com/yandex/ClickHouse/pull/6108) ([Artem Zuikov](https://github.com/4ertus2)) +* Slightly safer parsing of `NamesAndTypesList`. [#6408](https://github.com/yandex/ClickHouse/issues/6408). [#6410](https://github.com/yandex/ClickHouse/pull/6410) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* clickhouse-copier: Allow using `where_condition` from config with `partition_key` alias in the query for checking partition existence (earlier it was used only in data-reading queries). [#6577](https://github.com/yandex/ClickHouse/pull/6577) ([proller](https://github.com/proller)) +* Added an optional message argument to `throwIf` (see the example below). ([#5772](https://github.com/yandex/ClickHouse/issues/5772)) [#6329](https://github.com/yandex/ClickHouse/pull/6329) ([Vdimir](https://github.com/Vdimir)) +* A server exception received while sending insertion data is now processed in the client as well. [#5891](https://github.com/yandex/ClickHouse/issues/5891) [#6711](https://github.com/yandex/ClickHouse/pull/6711) ([dimarub2000](https://github.com/dimarub2000)) +* Added a metric `DistributedFilesToInsert` that shows the total number of files in the filesystem that are selected to send to remote servers by Distributed tables. The number is summed across all shards. [#6600](https://github.com/yandex/ClickHouse/pull/6600) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Move most of JOINs prepare logic from `ExpressionAction/ExpressionAnalyzer` to `AnalyzedJoin`. [#6785](https://github.com/yandex/ClickHouse/pull/6785) ([Artem Zuikov](https://github.com/4ertus2))
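A minimal sketch of the optional message argument to `throwIf` mentioned above (the condition and the message text are illustrative only):

```sql
-- Throws an exception with the given custom message when the condition is true.
SELECT throwIf(number = 3, 'number must not be equal to 3') FROM numbers(10);
```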
+* Fix TSan [warning](https://clickhouse-test-reports.s3.yandex.net/6399/c1c1d1daa98e199e620766f1bd06a5921050a00d/functional_stateful_tests_(thread).html) 'lock-order-inversion'. [#6740](https://github.com/yandex/ClickHouse/pull/6740) ([Vasily Nemkov](https://github.com/Enmk)) +* Better information messages about lack of Linux capabilities. Logging fatal errors with "fatal" level, which will make them easier to find in `system.text_log`. [#6441](https://github.com/yandex/ClickHouse/pull/6441) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Better subquery for join creation in `ExpressionAnalyzer`. [#6824](https://github.com/yandex/ClickHouse/pull/6824) ([Artem Zuikov](https://github.com/4ertus2)) +* When dumping temporary data to disk to restrict memory usage during `GROUP BY/SORT` is enabled, the free disk space was not checked. The fix adds a new setting `min_free_disk_space`; when the free disk space is smaller than the threshold, the query will stop and throw `ErrorCodes::NOT_ENOUGH_SPACE`. [#6678](https://github.com/yandex/ClickHouse/pull/6678) ([Weiqing Xu](https://github.com/weiqxu)) [#6691](https://github.com/yandex/ClickHouse/pull/6691) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Removed recursive rwlock by thread. It makes no sense, because threads are reused between queries. A `SELECT` query may acquire a lock in one thread, hold a lock from another thread and exit from the first thread. At the same time, the first thread can be reused by a `DROP` query. This will lead to false "Attempt to acquire exclusive lock recursively" messages. [#6771](https://github.com/yandex/ClickHouse/pull/6771) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Split `ExpressionAnalyzer.appendJoin()`. Prepare a place in `ExpressionAnalyzer` for `MergeJoin`. [#6524](https://github.com/yandex/ClickHouse/pull/6524) ([Artem Zuikov](https://github.com/4ertus2)) +* Added `mysql_native_password` authentication plugin to MySQL compatibility server. [#6194](https://github.com/yandex/ClickHouse/pull/6194) ([Yuriy Baranov](https://github.com/yurriy)) +* Fewer `clock_gettime` calls; fixed ABI compatibility between debug/release in `Allocator` (insignificant issue). [#6197](https://github.com/yandex/ClickHouse/pull/6197) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Move `collectUsedColumns` from `ExpressionAnalyzer` to `SyntaxAnalyzer`. `SyntaxAnalyzer` makes `required_source_columns` itself now. [#6416](https://github.com/yandex/ClickHouse/pull/6416) ([Artem Zuikov](https://github.com/4ertus2)) +* Add setting `joined_subquery_requires_alias` to require aliases for subselects and table functions in `FROM` when more than one table is present (i.e. queries with JOINs); see the example below. [#6733](https://github.com/yandex/ClickHouse/pull/6733) ([Artem Zuikov](https://github.com/4ertus2)) +* Extract `GetAggregatesVisitor` class from `ExpressionAnalyzer`. [#6458](https://github.com/yandex/ClickHouse/pull/6458) ([Artem Zuikov](https://github.com/4ertus2)) +* `system.query_log`: change data type of `type` column to `Enum`. [#6265](https://github.com/yandex/ClickHouse/pull/6265) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) +* Static linking of `sha256_password` authentication plugin. [#6512](https://github.com/yandex/ClickHouse/pull/6512) ([Yuriy Baranov](https://github.com/yurriy)) +* Avoid extra dependency for the setting `compile` to work. In previous versions, the user may get errors like `cannot open crti.o`, `unable to find library -lc` etc. [#6309](https://github.com/yandex/ClickHouse/pull/6309) ([alexey-milovidov](https://github.com/alexey-milovidov))
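To illustrate the `joined_subquery_requires_alias` setting from the entry above, a hedged sketch (the subqueries and values are arbitrary):

```sql
SET joined_subquery_requires_alias = 1;

-- With the setting enabled, subqueries in FROM of a JOIN must carry aliases (here `a` and `b`).
SELECT *
FROM (SELECT 1 AS x) AS a
JOIN (SELECT 1 AS x) AS b USING (x);
```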
+* More validation of the input that may come from a malicious replica. [#6303](https://github.com/yandex/ClickHouse/pull/6303) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Now the `clickhouse-obfuscator` file is available in the `clickhouse-client` package. In previous versions it was available as `clickhouse obfuscator` (with whitespace). [#5816](https://github.com/yandex/ClickHouse/issues/5816) [#6609](https://github.com/yandex/ClickHouse/pull/6609) ([dimarub2000](https://github.com/dimarub2000)) +* Fixed deadlock when we have at least two queries that read at least two tables in different order and another query that performs a DDL operation on one of the tables. Fixed another very rare deadlock. [#6764](https://github.com/yandex/ClickHouse/pull/6764) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added `os_thread_ids` column to `system.processes` and `system.query_log` for better debugging possibilities. [#6763](https://github.com/yandex/ClickHouse/pull/6763) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* A workaround for PHP mysqlnd extension bugs which occur when `sha256_password` is used as a default authentication plugin (described in [#6031](https://github.com/yandex/ClickHouse/issues/6031)). [#6113](https://github.com/yandex/ClickHouse/pull/6113) ([Yuriy Baranov](https://github.com/yurriy)) +* Remove unneeded place with changed nullability columns. [#6693](https://github.com/yandex/ClickHouse/pull/6693) ([Artem Zuikov](https://github.com/4ertus2)) +* Set default value of `queue_max_wait_ms` to zero, because the current value (five seconds) makes no sense. There are rare circumstances when this setting has any use. Added settings `replace_running_query_max_wait_ms`, `kafka_max_wait_ms` and `connection_pool_max_wait_ms` for disambiguation. [#6692](https://github.com/yandex/ClickHouse/pull/6692) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Extract `SelectQueryExpressionAnalyzer` from `ExpressionAnalyzer`. Keep the last one for non-select queries. [#6499](https://github.com/yandex/ClickHouse/pull/6499) ([Artem Zuikov](https://github.com/4ertus2)) +* Removed duplicating input and output formats. [#6239](https://github.com/yandex/ClickHouse/pull/6239) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Allow user to override `poll_interval` and `idle_connection_timeout` settings on connection. [#6230](https://github.com/yandex/ClickHouse/pull/6230) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* `MergeTree` now has an additional option `ttl_only_drop_parts` (disabled by default) to avoid partial pruning of parts, so that they are dropped completely when all the rows in a part are expired (see the example below). [#6191](https://github.com/yandex/ClickHouse/pull/6191) ([Sergi Vladykin](https://github.com/svladykin)) +* Type checks for set index functions. Throw an exception if a function got a wrong type. This fixes the fuzz test with UBSan. [#6511](https://github.com/yandex/ClickHouse/pull/6511) ([Nikita Vasilev](https://github.com/nikvas0)) +* Improve code quality of `LiveView`. [#6619](https://github.com/yandex/ClickHouse/pull/6619) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Make PairNoInit a simple struct. [#6277](https://github.com/yandex/ClickHouse/pull/6277) ([akuzm](https://github.com/akuzm))
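A minimal sketch of the `ttl_only_drop_parts` option from the `MergeTree` entry above (table and column names are hypothetical):

```sql
-- With ttl_only_drop_parts = 1, a part is dropped as a whole once all of its rows are expired,
-- instead of being rewritten to prune individual expired rows.
CREATE TABLE events_ttl (d Date, v UInt32)
ENGINE = MergeTree
ORDER BY d
TTL d + INTERVAL 7 DAY
SETTINGS ttl_only_drop_parts = 1;
```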
+* Get rid of dynamic allocation in `ParsedJson::Iterator`. [#6479](https://github.com/yandex/ClickHouse/pull/6479) ([Vitaly Baranov](https://github.com/vitlibar)) +* Text log simplification. [#6322](https://github.com/yandex/ClickHouse/pull/6322) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Every function in its own file, part 10. [#6321](https://github.com/yandex/ClickHouse/pull/6321) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Metric log rectification. [#6530](https://github.com/yandex/ClickHouse/pull/6530) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Remove doubled const `TABLE_IS_READ_ONLY`. [#6566](https://github.com/yandex/ClickHouse/pull/6566) ([filimonov](https://github.com/filimonov)) +* Formatting changes for `StringHashMap` PR [#5417](https://github.com/yandex/ClickHouse/issues/5417). [#6700](https://github.com/yandex/ClickHouse/pull/6700) ([akuzm](https://github.com/akuzm)) +* Remove a redundant condition (found by PVS Studio). [#6775](https://github.com/yandex/ClickHouse/pull/6775) ([akuzm](https://github.com/akuzm)) +* Separate the hash table interface for `ReverseIndex`. [#6672](https://github.com/yandex/ClickHouse/pull/6672) ([akuzm](https://github.com/akuzm)) +* Refactoring of settings. [#6689](https://github.com/yandex/ClickHouse/pull/6689) ([alesapin](https://github.com/alesapin)) +* Add comments for set index functions. [#6319](https://github.com/yandex/ClickHouse/pull/6319) ([Nikita Vasilev](https://github.com/nikvas0)) + +### Performance Improvement +* Implemented batch variant of updating aggregate function states. It may lead to performance benefits. [#6435](https://github.com/yandex/ClickHouse/pull/6435) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Optimize queries with an `ORDER BY expressions` clause, where `expressions` have a coinciding prefix with the `ORDER` key in `MergeTree` tables. [#6054](https://github.com/yandex/ClickHouse/pull/6054) ([Anton Popov](https://github.com/CurtizJ)) +* Implement 'read in order' optimization with processors. [#6629](https://github.com/yandex/ClickHouse/pull/6629) ([Anton Popov](https://github.com/CurtizJ)) +* Disable consecutive key optimization for `UInt8/16`. [#6298](https://github.com/yandex/ClickHouse/pull/6298) ([akuzm](https://github.com/akuzm)) +* Disable consecutive key optimization for `UInt8/16` LowCardinality columns. [#6701](https://github.com/yandex/ClickHouse/pull/6701) ([akuzm](https://github.com/akuzm)) +* Slightly improve performance of `MemoryTracker`. [#6653](https://github.com/yandex/ClickHouse/pull/6653) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Allow to use multiple threads during parts loading and removal. [#6372](https://github.com/yandex/ClickHouse/issues/6372) [#6074](https://github.com/yandex/ClickHouse/issues/6074) [#6438](https://github.com/yandex/ClickHouse/pull/6438) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Show private symbols in stack traces (this is done via parsing symbol tables of ELF files). Added information about file and line number in stack traces if debug info is present. Sped up symbol name lookup by indexing symbols present in the program. Added new SQL functions for introspection: `demangle` and `addressToLine`. Renamed function `symbolizeAddress` to `addressToSymbol` for consistency. Function `addressToSymbol` will return a mangled name for performance reasons and you have to apply `demangle`.
Added setting `allow_introspection_functions` which is turned off by default. [#6201](https://github.com/yandex/ClickHouse/pull/6201) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Pre-fault pages when allocating memory with `mmap()`. [#6667](https://github.com/yandex/ClickHouse/pull/6667) ([akuzm](https://github.com/akuzm)) +* Fix performance bug in `Decimal` comparison. [#6380](https://github.com/yandex/ClickHouse/pull/6380) ([Artem Zuikov](https://github.com/4ertus2)) + +### Build/Testing/Packaging Improvement +* Best effort for printing stack traces. Also added SIGPROF as a debugging signal to print the stack trace of a running thread. [#6529](https://github.com/yandex/ClickHouse/pull/6529) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* In debug version on Linux, increase OOM score. [#6152](https://github.com/yandex/ClickHouse/pull/6152) ([akuzm](https://github.com/akuzm)) +* HDFS HA now works in debug build. [#6650](https://github.com/yandex/ClickHouse/pull/6650) ([Weiqing Xu](https://github.com/weiqxu)) +* Added a test to `transform_query_for_external_database`. [#6388](https://github.com/yandex/ClickHouse/pull/6388) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Add test for multiple materialized views for Kafka table. [#6509](https://github.com/yandex/ClickHouse/pull/6509) ([Ivan](https://github.com/abyss7)) +* Removed rarely used table function `catBoostPool` and storage `CatBoostPool`. If you have used this table function, please write an email to `clickhouse-feedback@yandex-team.com`. Note that CatBoost integration remains and will be supported. [#6279](https://github.com/yandex/ClickHouse/pull/6279) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Make a better build scheme. [#6500](https://github.com/yandex/ClickHouse/pull/6500) ([Ivan](https://github.com/abyss7)) +* Fixed `test_external_dictionaries` integration in case it was executed under a non-root user. [#6507](https://github.com/yandex/ClickHouse/pull/6507) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* The bug reproduces when total size of written packets exceeds `DBMS_DEFAULT_BUFFER_SIZE`. [#6204](https://github.com/yandex/ClickHouse/pull/6204) ([Yuriy Baranov](https://github.com/yurriy)) +* Added a test for the `RENAME` table race condition. [#6752](https://github.com/yandex/ClickHouse/pull/6752) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Avoid data race on Settings in `KILL QUERY`. [#6753](https://github.com/yandex/ClickHouse/pull/6753) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Add integration test for handling errors by a cache dictionary. [#6755](https://github.com/yandex/ClickHouse/pull/6755) ([Vitaly Baranov](https://github.com/vitlibar)) +* Move `input_format_defaults_for_omitted_fields` to incompatible changes. [#6573](https://github.com/yandex/ClickHouse/pull/6573) ([Artem Zuikov](https://github.com/4ertus2)) +* Disable parsing of ELF object files on Mac OS, because it makes no sense. [#6578](https://github.com/yandex/ClickHouse/pull/6578) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Attempt to make the changelog generator better. [#6327](https://github.com/yandex/ClickHouse/pull/6327) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added the `-Wshadow` switch to GCC. [#6325](https://github.com/yandex/ClickHouse/pull/6325) ([kreuzerkrieg](https://github.com/kreuzerkrieg)) +* Removed obsolete code for `mimalloc` support.
[#6715](https://github.com/yandex/ClickHouse/pull/6715) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* `zlib-ng` determines x86 capabilities and saves this info to global variables. This is done in the deflateInit call, which may be made by different threads simultaneously. To avoid multithreaded writes, do it on library startup. [#6141](https://github.com/yandex/ClickHouse/pull/6141) ([akuzm](https://github.com/akuzm)) +* Regression test for a bug in join which was fixed in [#5192](https://github.com/yandex/ClickHouse/issues/5192). [#6147](https://github.com/yandex/ClickHouse/pull/6147) ([Bakhtiyor Ruziev](https://github.com/theruziev)) +* Fixed MSan report. [#6144](https://github.com/yandex/ClickHouse/pull/6144) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix flapping TTL test. [#6782](https://github.com/yandex/ClickHouse/pull/6782) ([Anton Popov](https://github.com/CurtizJ)) +* Fixed false data race in `MergeTreeDataPart::is_frozen` field. [#6583](https://github.com/yandex/ClickHouse/pull/6583) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed timeouts in fuzz test. In the previous version, it managed to find a false hangup in the query `SELECT * FROM numbers_mt(gccMurmurHash(''))`. [#6582](https://github.com/yandex/ClickHouse/pull/6582) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added debug checks to `static_cast` of columns. [#6581](https://github.com/yandex/ClickHouse/pull/6581) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Support for Oracle Linux in official RPM packages. [#6356](https://github.com/yandex/ClickHouse/issues/6356) [#6585](https://github.com/yandex/ClickHouse/pull/6585) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Changed json perftests from `once` to `loop` type. [#6536](https://github.com/yandex/ClickHouse/pull/6536) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* `odbc-bridge.cpp` defines `main()` so it should not be included in `clickhouse-lib`. [#6538](https://github.com/yandex/ClickHouse/pull/6538) ([Orivej Desh](https://github.com/orivej)) +* Test for crash in `FULL|RIGHT JOIN` with nulls in the right table's keys. [#6362](https://github.com/yandex/ClickHouse/pull/6362) ([Artem Zuikov](https://github.com/4ertus2)) +* Added a test for the limit on expansion of aliases, just in case. [#6442](https://github.com/yandex/ClickHouse/pull/6442) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added previous declaration checks for MySQL 8 integration. [#6569](https://github.com/yandex/ClickHouse/pull/6569) ([Rafael David Tinoco](https://github.com/rafaeldtinoco)) +* Switched from `boost::filesystem` to `std::filesystem` where appropriate. [#6253](https://github.com/yandex/ClickHouse/pull/6253) [#6385](https://github.com/yandex/ClickHouse/pull/6385) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added RPM packages to the website. [#6251](https://github.com/yandex/ClickHouse/pull/6251) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Add a test for the fixed `Unknown identifier` exception in the `IN` section. [#6708](https://github.com/yandex/ClickHouse/pull/6708) ([Artem Zuikov](https://github.com/4ertus2)) +* Added test for the ORC input format. [#6703](https://github.com/yandex/ClickHouse/pull/6703) ([akonyaev90](https://github.com/akonyaev90)) +* Simplify `shared_ptr_helper` because people face difficulties understanding it.
[#6675](https://github.com/yandex/ClickHouse/pull/6675) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added performance tests for fixed Gorilla and DoubleDelta codec. [#6179](https://github.com/yandex/ClickHouse/pull/6179) ([Vasily Nemkov](https://github.com/Enmk)) +* Split the integration test `test_dictionaries` into 4 separate tests. [#6776](https://github.com/yandex/ClickHouse/pull/6776) ([Vitaly Baranov](https://github.com/vitlibar)) +* Fix PVS warning in `PipelineExecutor`. [#6777](https://github.com/yandex/ClickHouse/pull/6777) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Allow to use library dictionary source with ASan. [#6482](https://github.com/yandex/ClickHouse/pull/6482) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added option to generate changelog from a list of PRs. [#6350](https://github.com/yandex/ClickHouse/pull/6350) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Lock the `TinyLog` storage when reading. [#6226](https://github.com/yandex/ClickHouse/pull/6226) ([akuzm](https://github.com/akuzm)) +* Check for broken symlinks in CI. [#6634](https://github.com/yandex/ClickHouse/pull/6634) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Increase timeout for "stack overflow" test because it may take a long time in debug build. [#6637](https://github.com/yandex/ClickHouse/pull/6637) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Remove Compiler (runtime template instantiation) because we've won over its performance. [#6646](https://github.com/yandex/ClickHouse/pull/6646) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added a check for double whitespaces. [#6643](https://github.com/yandex/ClickHouse/pull/6643) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix `new/delete` memory tracking when building with sanitizers. Tracking is not clear. It only prevents memory limit exceptions in tests. [#6450](https://github.com/yandex/ClickHouse/pull/6450) ([Artem Zuikov](https://github.com/4ertus2)) +* Enable back the check of undefined symbols while linking. [#6453](https://github.com/yandex/ClickHouse/pull/6453) ([Ivan](https://github.com/abyss7)) +* Avoid rebuilding `hyperscan` each day. [#6307](https://github.com/yandex/ClickHouse/pull/6307) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added performance test to show degradation of performance in gcc-9 in a more isolated way. [#6302](https://github.com/yandex/ClickHouse/pull/6302) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed UBSan report in `ProtobufWriter`. [#6163](https://github.com/yandex/ClickHouse/pull/6163) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Don't allow to use query profiler with sanitizers because it is not compatible. [#6769](https://github.com/yandex/ClickHouse/pull/6769) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Add test for reloading a dictionary after a failure by timer. [#6114](https://github.com/yandex/ClickHouse/pull/6114) ([Vitaly Baranov](https://github.com/vitlibar)) +* Fix inconsistency in `PipelineExecutor::prepareProcessor` argument type. [#6494](https://github.com/yandex/ClickHouse/pull/6494) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Added a test for bad URIs. [#6493](https://github.com/yandex/ClickHouse/pull/6493) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added more checks to `CAST` function. This should get more information about the segmentation fault in the fuzz test.
[#6346](https://github.com/yandex/ClickHouse/pull/6346) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Added `gcc-9` support to `docker/builder` container that builds the image locally. [#6333](https://github.com/yandex/ClickHouse/pull/6333) ([Gleb Novikov](https://github.com/NanoBjorn)) +* Test for primary index with `lowCardinality(String)`. [#5044](https://github.com/yandex/ClickHouse/issues/5044) [#6219](https://github.com/yandex/ClickHouse/pull/6219) ([dimarub2000](https://github.com/dimarub2000)) +* Using Danila Kutenin's variant to make fastops work. [#6317](https://github.com/yandex/ClickHouse/pull/6317) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed tests affected by slow stack traces printing. [#6315](https://github.com/yandex/ClickHouse/pull/6315) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Add a test case for the crash in `groupUniqArray` fixed in [#6029](https://github.com/yandex/ClickHouse/pull/6029). [#4402](https://github.com/yandex/ClickHouse/issues/4402) [#6129](https://github.com/yandex/ClickHouse/pull/6129) ([akuzm](https://github.com/akuzm)) +* Fixed indices mutations tests. [#6645](https://github.com/yandex/ClickHouse/pull/6645) ([Nikita Vasilev](https://github.com/nikvas0)) +* Attempt to fix performance test. [#6392](https://github.com/yandex/ClickHouse/pull/6392) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* In performance test, do not read query log for queries we didn't run. [#6427](https://github.com/yandex/ClickHouse/pull/6427) ([akuzm](https://github.com/akuzm)) +* A materialized view can now be created with any low cardinality types regardless of the setting about suspicious low cardinality types. [#6428](https://github.com/yandex/ClickHouse/pull/6428) ([Olga Khvostikova](https://github.com/stavrolia)) +* Fixed wrong code in mutations that may lead to memory corruption. Fixed segfault with read of address `0x14c0` that may happen due to concurrent `DROP TABLE` and `SELECT` from `system.parts` or `system.parts_columns`. Fixed race condition in preparation of mutation queries. Fixed deadlock caused by `OPTIMIZE` of Replicated tables and concurrent modification operations like ALTERs. [#6514](https://github.com/yandex/ClickHouse/pull/6514) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Updated tests for `send_logs_level` setting. [#6207](https://github.com/yandex/ClickHouse/pull/6207) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Fix build under gcc-8.2. [#6196](https://github.com/yandex/ClickHouse/pull/6196) ([Max Akhmedov](https://github.com/zlobober)) +* Fix build with internal libc++. [#6724](https://github.com/yandex/ClickHouse/pull/6724) ([Ivan](https://github.com/abyss7)) +* Fix shared build with the `rdkafka` library. [#6101](https://github.com/yandex/ClickHouse/pull/6101) ([Ivan](https://github.com/abyss7)) +* Fixes for Mac OS build. [#6390](https://github.com/yandex/ClickHouse/pull/6390) ([alexey-milovidov](https://github.com/alexey-milovidov)) [#6429](https://github.com/yandex/ClickHouse/pull/6429) ([alex-zaitsev](https://github.com/alex-zaitsev)) +* Fix split build.
[#6618](https://github.com/yandex/ClickHouse/pull/6618) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Other build fixes: [#6186](https://github.com/yandex/ClickHouse/pull/6186) ([Amos Bird](https://github.com/amosbird)) [#6486](https://github.com/yandex/ClickHouse/pull/6486) [#6348](https://github.com/yandex/ClickHouse/pull/6348) ([vxider](https://github.com/Vxider)) [#6744](https://github.com/yandex/ClickHouse/pull/6744) ([Ivan](https://github.com/abyss7)) [#6016](https://github.com/yandex/ClickHouse/pull/6016) [#6421](https://github.com/yandex/ClickHouse/pull/6421) [#6491](https://github.com/yandex/ClickHouse/pull/6491) ([proller](https://github.com/proller)) + +### SQL compatibility +* Correct implementation of ternary logic for `AND/OR`. [#6048](https://github.com/yandex/ClickHouse/pull/6048) ([Alexander Kazakov](https://github.com/Akazz)) + +### Backward Incompatible Change +* Disable `ANY RIGHT JOIN` and `ANY FULL JOIN` by default. Set `any_join_get_any_from_right_table` setting to enable them. [#5126](https://github.com/yandex/ClickHouse/issues/5126) [#6351](https://github.com/yandex/ClickHouse/pull/6351) ([Artem Zuikov](https://github.com/4ertus2)) + ## ClickHouse release 19.13.3.26, 2019-08-22 ### Bug Fix From e1898ca89ac90015b1849b2d65429c7e9e594d76 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Mon, 16 Sep 2019 21:19:10 +0300 Subject: [PATCH 071/102] Make the quadratic Arena::allocContinue less bad. Scenarios that use Arena::allocContinue may waste quadratically many memory and perform quadratically many copying, when the memory range size reaches Arena's linear allocation threshold. To alleviate this, make sure that the next memory chunk allocated by allocContinue is at least linear_growth_threshold bytes bigger than the previous one, so that we don't reallocate and copy that often. --- dbms/src/Common/Arena.h | 115 ++++++++++-------- dbms/src/Common/ArenaAllocator.h | 2 +- ...012_serialize_array_memory_usage.reference | 0 .../01012_serialize_array_memory_usage.sql | 3 + 4 files changed, 68 insertions(+), 52 deletions(-) create mode 100644 dbms/tests/queries/0_stateless/01012_serialize_array_memory_usage.reference create mode 100644 dbms/tests/queries/0_stateless/01012_serialize_array_memory_usage.sql diff --git a/dbms/src/Common/Arena.h b/dbms/src/Common/Arena.h index fcaee6cb07f..feee0de0f76 100644 --- a/dbms/src/Common/Arena.h +++ b/dbms/src/Common/Arena.h @@ -97,13 +97,23 @@ private: size_t size_after_grow = 0; if (head->size() < linear_growth_threshold) - size_after_grow = head->size() * growth_factor; + { + size_after_grow = std::max(min_next_size, head->size() * growth_factor); + } else - size_after_grow = linear_growth_threshold; - - if (size_after_grow < min_next_size) - size_after_grow = min_next_size; + { + // allocContinue() combined with linear growth results in quadratic + // behavior: we append the data by small amounts, and when it + // doesn't fit, we create a new chunk and copy all the previous data + // into it. The number of times we do this is directly proportional + // to the total size of data that is going to be serialized. To make + // the copying happen less often, round the next size up to the + // linear_growth_threshold. 
+ size_after_grow = ((min_next_size + linear_growth_threshold - 1) + / linear_growth_threshold) * linear_growth_threshold; + } + assert(size_after_grow >= min_next_size); return roundUpToPageSize(size_after_grow); } @@ -180,65 +190,68 @@ public: return head->pos; } - /** Begin or expand allocation of contiguous piece of memory without alignment. - * 'begin' - current begin of piece of memory, if it need to be expanded, or nullptr, if it need to be started. - * If there is no space in chunk to expand current piece of memory - then copy all piece to new chunk and change value of 'begin'. - * NOTE This method is usable only for latest allocation. For earlier allocations, see 'realloc' method. + /** Begin or expand a contiguous range of memory. + * 'range_start' is the start of range. If nullptr, a new range is + * allocated. + * If there is no space in the current chunk to expand the range, + * the entire range is copied to a new, bigger memory chunk, and the value + * of 'range_start' is updated. + * If the optional 'start_alignment' is specified, the start of range is + * kept aligned to this value. + * + * NOTE This method is usable only for the last allocation made on this + * Arena. For earlier allocations, see 'realloc' method. */ - char * allocContinue(size_t size, char const *& begin) + char * allocContinue(size_t additional_bytes, char const *& range_start, + size_t start_alignment = 0) { - while (unlikely(head->pos + size > head->end)) + if (!range_start) { - char * prev_end = head->pos; - addChunk(size); + // Start a new memory range. + char * result = start_alignment + ? alignedAlloc(additional_bytes, start_alignment) + : alloc(additional_bytes); - if (begin) - begin = insert(begin, prev_end - begin); - else - break; + range_start = result; + return result; } - char * res = head->pos; - head->pos += size; + // Extend an existing memory range with 'additional_bytes'. - if (!begin) - begin = res; + // This method only works for extending the last allocation. For lack of + // original size, check a weaker condition: that 'begin' is at least in + // the current Chunk. + assert(range_start >= head->begin && range_start < head->end); - ASAN_UNPOISON_MEMORY_REGION(res, size + pad_right); - return res; - } - - char * alignedAllocContinue(size_t size, char const *& begin, size_t alignment) - { - char * res; - - do + if (head->pos + additional_bytes <= head->end) { - void * head_pos = head->pos; - size_t space = head->end - head->pos; + // The new size fits into the last chunk, so just alloc the + // additional size. We can alloc without alignment here, because it + // only applies to the start of the range, and we don't change it. + return alloc(additional_bytes); + } - res = static_cast(std::align(alignment, size, head_pos, space)); - if (res) - { - head->pos = static_cast(head_pos); - head->pos += size; - break; - } + // New range doesn't fit into this chunk, will copy to a new one. + // + // Note: among other things, this method is used to provide a hack-ish + // implementation of realloc over Arenas in ArenaAllocators. It wastes a + // lot of memory -- quadratically so when we reach the linear allocation + // threshold. This deficiency is intentionally left as is, and should be + // solved not by complicating this method, but by rethinking the + // approach to memory management for aggregate function states, so that + // we can provide a proper realloc(). 
+ const size_t existing_bytes = head->pos - range_start; + const size_t new_bytes = existing_bytes + additional_bytes; + const char * old_range = range_start; - char * prev_end = head->pos; - addChunk(size + alignment); + char * new_range = start_alignment + ? alignedAlloc(new_bytes, start_alignment) + : alloc(new_bytes); - if (begin) - begin = alignedInsert(begin, prev_end - begin, alignment); - else - break; - } while (true); + memcpy(new_range, old_range, existing_bytes); - if (!begin) - begin = res; - - ASAN_UNPOISON_MEMORY_REGION(res, size + pad_right); - return res; + range_start = new_range; + return new_range + existing_bytes; } /// NOTE Old memory region is wasted. diff --git a/dbms/src/Common/ArenaAllocator.h b/dbms/src/Common/ArenaAllocator.h index f8ce8f38921..6eb415d5e54 100644 --- a/dbms/src/Common/ArenaAllocator.h +++ b/dbms/src/Common/ArenaAllocator.h @@ -54,7 +54,7 @@ public: if (data + old_size == arena->head->pos) { - arena->alignedAllocContinue(new_size - old_size, data, alignment); + arena->allocContinue(new_size - old_size, data, alignment); return reinterpret_cast(const_cast(data)); } else diff --git a/dbms/tests/queries/0_stateless/01012_serialize_array_memory_usage.reference b/dbms/tests/queries/0_stateless/01012_serialize_array_memory_usage.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/dbms/tests/queries/0_stateless/01012_serialize_array_memory_usage.sql b/dbms/tests/queries/0_stateless/01012_serialize_array_memory_usage.sql new file mode 100644 index 00000000000..3f266dba225 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01012_serialize_array_memory_usage.sql @@ -0,0 +1,3 @@ +-- serialization of big arrays shouldn't use too much memory +set max_memory_usage = 3000000000; +select ignore(x) from (select groupArray(number) x from numbers(33554433)) group by x format Null; From 0cf5bfdf44a17e2ad3371ad0d8e903ad95920a7d Mon Sep 17 00:00:00 2001 From: chertus Date: Wed, 18 Sep 2019 15:24:35 +0300 Subject: [PATCH 072/102] fix typo --- dbms/src/Interpreters/AnalyzedJoin.h | 4 ++-- dbms/src/Interpreters/Join.cpp | 4 ++-- dbms/src/Interpreters/MergeJoin.cpp | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dbms/src/Interpreters/AnalyzedJoin.h b/dbms/src/Interpreters/AnalyzedJoin.h index b04d55490e4..10075ec2792 100644 --- a/dbms/src/Interpreters/AnalyzedJoin.h +++ b/dbms/src/Interpreters/AnalyzedJoin.h @@ -76,8 +76,8 @@ public: ASTTableJoin::Strictness strictness() const { return table_join.strictness; } const SizeLimits & sizeLimits() const { return size_limits; } - bool forceNullabelRight() const { return join_use_nulls && isLeftOrFull(table_join.kind); } - bool forceNullabelLeft() const { return join_use_nulls && isRightOrFull(table_join.kind); } + bool forceNullableRight() const { return join_use_nulls && isLeftOrFull(table_join.kind); } + bool forceNullableLeft() const { return join_use_nulls && isRightOrFull(table_join.kind); } void addUsingKey(const ASTPtr & ast); void addOnKeys(ASTPtr & left_table_ast, ASTPtr & right_table_ast); diff --git a/dbms/src/Interpreters/Join.cpp b/dbms/src/Interpreters/Join.cpp index a0eb8da3ce9..fa77d57ac77 100644 --- a/dbms/src/Interpreters/Join.cpp +++ b/dbms/src/Interpreters/Join.cpp @@ -66,8 +66,8 @@ Join::Join(std::shared_ptr table_join_, const Block & right_sample , strictness(table_join->strictness()) , key_names_right(table_join->keyNamesRight()) , required_right_keys(table_join->requiredRightKeys()) - , nullable_right_side(table_join->forceNullabelRight()) - , 
nullable_left_side(table_join->forceNullabelLeft()) + , nullable_right_side(table_join->forceNullableRight()) + , nullable_left_side(table_join->forceNullableLeft()) , any_take_last_row(any_take_last_row_) , log(&Logger::get("Join")) { diff --git a/dbms/src/Interpreters/MergeJoin.cpp b/dbms/src/Interpreters/MergeJoin.cpp index 93a8cf2e4c5..392c60272fd 100644 --- a/dbms/src/Interpreters/MergeJoin.cpp +++ b/dbms/src/Interpreters/MergeJoin.cpp @@ -202,7 +202,7 @@ void joinInequalsLeft(const Block & left_block, MutableColumns & left_columns, M MergeJoin::MergeJoin(std::shared_ptr table_join_, const Block & right_sample_block) : table_join(table_join_) - , nullable_right_side(table_join->forceNullabelRight()) + , nullable_right_side(table_join->forceNullableRight()) , is_all(table_join->strictness() == ASTTableJoin::Strictness::All) , is_inner(isInner(table_join->kind())) , is_left(isLeft(table_join->kind())) From e3a9863260178302afcfad6d4b3b324034d1d7d9 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Wed, 18 Sep 2019 15:27:03 +0300 Subject: [PATCH 073/102] Use MAP_POPULATE only on Linux. --- dbms/src/Common/Allocator.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/dbms/src/Common/Allocator.h b/dbms/src/Common/Allocator.h index ad5b0318c91..61ac9f44540 100644 --- a/dbms/src/Common/Allocator.h +++ b/dbms/src/Common/Allocator.h @@ -178,8 +178,13 @@ protected: // hash tables, it makes sense to pre-fault the pages by passing // MAP_POPULATE to mmap(). This takes some time, but should be faster // overall than having a hot loop interrupted by page faults. + // It is only supported on Linux. +#if defined(__linux__) static constexpr int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | (mmap_populate ? MAP_POPULATE : 0); +#else + static constexpr int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS; +#endif private: void * allocNoTrack(size_t size, size_t alignment) From dd46c08e90c46b287e7491c3d293e56e7a4c730e Mon Sep 17 00:00:00 2001 From: chertus Date: Wed, 18 Sep 2019 15:46:57 +0300 Subject: [PATCH 074/102] move JoinCommon funcs out of IJoin.h/cpp --- dbms/src/Interpreters/IJoin.h | 23 ------------- dbms/src/Interpreters/Join.cpp | 3 ++ dbms/src/Interpreters/MergeJoin.cpp | 1 + .../{IJoin.cpp => join_common.cpp} | 2 +- dbms/src/Interpreters/join_common.h | 32 +++++++++++++++++++ 5 files changed, 37 insertions(+), 24 deletions(-) rename dbms/src/Interpreters/{IJoin.cpp => join_common.cpp} (99%) create mode 100644 dbms/src/Interpreters/join_common.h diff --git a/dbms/src/Interpreters/IJoin.h b/dbms/src/Interpreters/IJoin.h index 5e16e25c58e..9773f7ae979 100644 --- a/dbms/src/Interpreters/IJoin.h +++ b/dbms/src/Interpreters/IJoin.h @@ -10,10 +10,7 @@ namespace DB { -struct ColumnWithTypeAndName; class Block; -class IColumn; -using ColumnRawPtrs = std::vector; class IJoin { @@ -39,24 +36,4 @@ public: using JoinPtr = std::shared_ptr; - -namespace JoinCommon -{ - -void convertColumnToNullable(ColumnWithTypeAndName & column); -void convertColumnsToNullable(Block & block, size_t starting_pos = 0); -ColumnRawPtrs temporaryMaterializeColumns(const Block & block, const Names & names, Columns & materialized); -void removeLowCardinalityInplace(Block & block); - -/// Split key and other columns by keys name list -ColumnRawPtrs extractKeysForJoin(const Names & key_names_right, const Block & right_sample_block, - Block & sample_block_with_keys, Block & sample_block_with_columns_to_add); - -/// Throw an exception if blocks have different types of key columns. Compare up to Nullability. 
-void checkTypesOfKeys(const Block & block_left, const Names & key_names_left, const Block & block_right, const Names & key_names_right); - -void createMissedColumns(Block & block); - -} - } diff --git a/dbms/src/Interpreters/Join.cpp b/dbms/src/Interpreters/Join.cpp index fa77d57ac77..aa585834412 100644 --- a/dbms/src/Interpreters/Join.cpp +++ b/dbms/src/Interpreters/Join.cpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -249,7 +250,9 @@ size_t Join::getTotalByteCount() const void Join::setSampleBlock(const Block & block) { + /// You have to restore this lock if you call the fuction outside of ctor. //std::unique_lock lock(rwlock); + LOG_DEBUG(log, "setSampleBlock: " << block.dumpStructure()); if (!empty()) diff --git a/dbms/src/Interpreters/MergeJoin.cpp b/dbms/src/Interpreters/MergeJoin.cpp index 392c60272fd..20cb70f7f1c 100644 --- a/dbms/src/Interpreters/MergeJoin.cpp +++ b/dbms/src/Interpreters/MergeJoin.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include diff --git a/dbms/src/Interpreters/IJoin.cpp b/dbms/src/Interpreters/join_common.cpp similarity index 99% rename from dbms/src/Interpreters/IJoin.cpp rename to dbms/src/Interpreters/join_common.cpp index 9e4ded0fb90..4dea22cb453 100644 --- a/dbms/src/Interpreters/IJoin.cpp +++ b/dbms/src/Interpreters/join_common.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include #include diff --git a/dbms/src/Interpreters/join_common.h b/dbms/src/Interpreters/join_common.h new file mode 100644 index 00000000000..d75a25563ad --- /dev/null +++ b/dbms/src/Interpreters/join_common.h @@ -0,0 +1,32 @@ +#pragma once + +#include + +namespace DB +{ + +struct ColumnWithTypeAndName; +class Block; +class IColumn; +using ColumnRawPtrs = std::vector; + +namespace JoinCommon +{ + +void convertColumnToNullable(ColumnWithTypeAndName & column); +void convertColumnsToNullable(Block & block, size_t starting_pos = 0); +ColumnRawPtrs temporaryMaterializeColumns(const Block & block, const Names & names, Columns & materialized); +void removeLowCardinalityInplace(Block & block); + +/// Split key and other columns by keys name list +ColumnRawPtrs extractKeysForJoin(const Names & key_names_right, const Block & right_sample_block, + Block & sample_block_with_keys, Block & sample_block_with_columns_to_add); + +/// Throw an exception if blocks have different types of key columns. Compare up to Nullability. 
+void checkTypesOfKeys(const Block & block_left, const Names & key_names_left, const Block & block_right, const Names & key_names_right); + +void createMissedColumns(Block & block); + +} + +} From aa8ef059556ffe89fe7fae84f8763684d18848ba Mon Sep 17 00:00:00 2001 From: Akazz Date: Wed, 18 Sep 2019 16:14:56 +0300 Subject: [PATCH 075/102] Reworked test 00715_fetch_merged_or_mutated_part_zookeeper --- ..._fetch_merged_or_mutated_part_zookeeper.sh | 52 +++++++++++++++++++ ...fetch_merged_or_mutated_part_zookeeper.sql | 42 --------------- 2 files changed, 52 insertions(+), 42 deletions(-) create mode 100755 dbms/tests/queries/0_stateless/00715_fetch_merged_or_mutated_part_zookeeper.sh delete mode 100644 dbms/tests/queries/0_stateless/00715_fetch_merged_or_mutated_part_zookeeper.sql diff --git a/dbms/tests/queries/0_stateless/00715_fetch_merged_or_mutated_part_zookeeper.sh b/dbms/tests/queries/0_stateless/00715_fetch_merged_or_mutated_part_zookeeper.sh new file mode 100755 index 00000000000..a7cb79908ae --- /dev/null +++ b/dbms/tests/queries/0_stateless/00715_fetch_merged_or_mutated_part_zookeeper.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +. $CURDIR/../shell_config.sh +. $CURDIR/mergetree_mutations.lib + + +${CLICKHOUSE_CLIENT} -n --query=" + DROP TABLE IF EXISTS fetches_r1; + DROP TABLE IF EXISTS fetches_r2" + +${CLICKHOUSE_CLIENT} --query="CREATE TABLE fetches_r1(x UInt32) ENGINE ReplicatedMergeTree('/clickhouse/tables/test/fetches', 'r1') ORDER BY x" +${CLICKHOUSE_CLIENT} --query="CREATE TABLE fetches_r2(x UInt32) ENGINE ReplicatedMergeTree('/clickhouse/tables/test/fetches', 'r2') ORDER BY x \ + SETTINGS prefer_fetch_merged_part_time_threshold=0, \ + prefer_fetch_merged_part_size_threshold=0" + +${CLICKHOUSE_CLIENT} -n --query=" + INSERT INTO fetches_r1 VALUES (1); + INSERT INTO fetches_r1 VALUES (2); + INSERT INTO fetches_r1 VALUES (3)" + +${CLICKHOUSE_CLIENT} --query="SYSTEM SYNC REPLICA fetches_r2" +${CLICKHOUSE_CLIENT} --query="DETACH TABLE fetches_r2" + +${CLICKHOUSE_CLIENT} --query="OPTIMIZE TABLE fetches_r1 PARTITION tuple() FINAL" --replication_alter_partitions_sync=0 +${CLICKHOUSE_CLIENT} --query="SYSTEM SYNC REPLICA fetches_r1" + +# After attach replica r2 should fetch the merged part from r1. +${CLICKHOUSE_CLIENT} --query="ATTACH TABLE fetches_r2" +${CLICKHOUSE_CLIENT} --query="SYSTEM SYNC REPLICA fetches_r2" + +${CLICKHOUSE_CLIENT} --query="SELECT '*** Check data after fetch of merged part ***'" +${CLICKHOUSE_CLIENT} --query="SELECT _part, * FROM fetches_r2 ORDER BY x" + +${CLICKHOUSE_CLIENT} --query="DETACH TABLE fetches_r2" + +# Add mutation that doesn't change data. +${CLICKHOUSE_CLIENT} --query="ALTER TABLE fetches_r1 DELETE WHERE x = 0" --replication_alter_partitions_sync=0 + +wait_for_mutation "fetches_r1" "0000000000" +${CLICKHOUSE_CLIENT} --query="SYSTEM SYNC REPLICA fetches_r1" + +# After attach replica r2 should compare checksums for mutated part and clone the local part. 
+${CLICKHOUSE_CLIENT} --query="ATTACH TABLE fetches_r2" +${CLICKHOUSE_CLIENT} --query="SYSTEM SYNC REPLICA fetches_r2" + +${CLICKHOUSE_CLIENT} --query="SELECT '*** Check data after fetch/clone of mutated part ***'" +${CLICKHOUSE_CLIENT} --query="SELECT _part, * FROM fetches_r2 ORDER BY x" + +${CLICKHOUSE_CLIENT} -n --query=" + DROP TABLE fetches_r1; + DROP TABLE fetches_r2" diff --git a/dbms/tests/queries/0_stateless/00715_fetch_merged_or_mutated_part_zookeeper.sql b/dbms/tests/queries/0_stateless/00715_fetch_merged_or_mutated_part_zookeeper.sql deleted file mode 100644 index 9a1c1b77cae..00000000000 --- a/dbms/tests/queries/0_stateless/00715_fetch_merged_or_mutated_part_zookeeper.sql +++ /dev/null @@ -1,42 +0,0 @@ -DROP TABLE IF EXISTS fetches_r1; -DROP TABLE IF EXISTS fetches_r2; - -CREATE TABLE fetches_r1(x UInt32) ENGINE ReplicatedMergeTree('/clickhouse/tables/test/fetches', 'r1') ORDER BY x; -CREATE TABLE fetches_r2(x UInt32) ENGINE ReplicatedMergeTree('/clickhouse/tables/test/fetches', 'r2') ORDER BY x - SETTINGS prefer_fetch_merged_part_time_threshold=0, - prefer_fetch_merged_part_size_threshold=0; - -INSERT INTO fetches_r1 VALUES (1); -INSERT INTO fetches_r1 VALUES (2); -INSERT INTO fetches_r1 VALUES (3); - -SYSTEM SYNC REPLICA fetches_r2; - -DETACH TABLE fetches_r2; - -SET replication_alter_partitions_sync=0; -OPTIMIZE TABLE fetches_r1 PARTITION tuple() FINAL; -SYSTEM SYNC REPLICA fetches_r1; - --- After attach replica r2 should fetch the merged part from r1. -ATTACH TABLE fetches_r2; -SYSTEM SYNC REPLICA fetches_r2; - -SELECT '*** Check data after fetch of merged part ***'; -SELECT _part, * FROM fetches_r2 ORDER BY x; - -DETACH TABLE fetches_r2; - --- Add mutation that doesn't change data. -ALTER TABLE fetches_r1 DELETE WHERE x = 0; -SYSTEM SYNC REPLICA fetches_r1; - --- After attach replica r2 should compare checksums for mutated part and clone the local part. -ATTACH TABLE fetches_r2; -SYSTEM SYNC REPLICA fetches_r2; - -SELECT '*** Check data after fetch/clone of mutated part ***'; -SELECT _part, * FROM fetches_r2 ORDER BY x; - -DROP TABLE fetches_r1; -DROP TABLE fetches_r2; From edb43371a01649288e59851fa74fa4349599f89b Mon Sep 17 00:00:00 2001 From: Stepan Herold Date: Wed, 18 Sep 2019 16:09:45 +0200 Subject: [PATCH 076/102] Updating max_threads default value in docs. --- docs/en/operations/settings/settings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index fce10da8f9b..46b3e41d7d9 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -473,7 +473,7 @@ The maximum number of query processing threads, excluding threads for retrieving This parameter applies to threads that perform the same stages of the query processing pipeline in parallel. For example, when reading from a table, if it is possible to evaluate expressions with functions, filter with WHERE and pre-aggregate for GROUP BY in parallel using at least 'max_threads' number of threads, then 'max_threads' are used. -Default value: 2. +Default value: half the number of physical CPU cores. If less than one SELECT query is normally run on a server at a time, set this parameter to a value slightly less than the actual number of processor cores. 
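For reference, a minimal way to check what this documented default resolves to on a particular server is to query `system.settings` (an illustrative snippet, not part of the patch above; the reported value depends on the machine's core count):

    -- Show the effective value of max_threads for the current session.
    -- With the updated default this is typically half the number of physical
    -- CPU cores rather than the old fixed value of 2.
    SELECT name, value, changed
    FROM system.settings
    WHERE name = 'max_threads';

    -- The default can still be overridden explicitly, per session or per query:
    SET max_threads = 4;
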
From 194e49e751c7244e8e9b54b09879a746b2d704d5 Mon Sep 17 00:00:00 2001 From: chertus Date: Wed, 18 Sep 2019 19:46:00 +0300 Subject: [PATCH 077/102] fix UB in MergeJoinCursor --- dbms/src/Interpreters/MergeJoin.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/dbms/src/Interpreters/MergeJoin.cpp b/dbms/src/Interpreters/MergeJoin.cpp index 20cb70f7f1c..9cc02d9be7a 100644 --- a/dbms/src/Interpreters/MergeJoin.cpp +++ b/dbms/src/Interpreters/MergeJoin.cpp @@ -56,7 +56,7 @@ public: bool sameNext(size_t lhs_pos) const { - if (impl.isLast()) + if (lhs_pos + 1 >= impl.rows) return false; for (size_t i = 0; i < impl.sort_columns_size; ++i) @@ -71,10 +71,8 @@ public: return 0; size_t pos = impl.pos; - for (; pos < impl.rows; ++pos) - if (!sameNext(pos)) - break; - + while (sameNext(pos)) + ++pos; return pos - impl.pos + 1; } From 6cb5d0046beb37c2146309fda5b022579dd4893e Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 17 Sep 2019 20:08:03 +0300 Subject: [PATCH 078/102] Add missing linking with jemalloc for clickhouse_common_io clickhouse_common_io incudes new_delete.cpp, that uses memory.h, which uses sdallocx (jemalloc). And since there is -Wl,--no-undefined every undefined symbols are not allowed, hence clickhouse_common_io must know about sdallocx symbol. For the default build (-DUNBUNDLED=OFF) everything is good, because jemalloc is static, and clickhouse_common_io linked with libcommon (which is linked with jemalloc) But if jemalloc will be shared, and clickhouse_common_io and libcommon is different shared libraries then clickhouse_common_io should be linked with jemalloc, otherwise you will undefined reference to sdallocx error. This can be reproduced using the following build configuration: -DUSE_STATIC_LIBRARIES=OFF -DCLICKHOUSE_SPLIT_BINARY=ON -DSPLIT_SHARED_LIBRARIES=ON -DUNBUNDLED=ON Provided that you have systemd-wide jemalloc>=4 (see memory.h). Refs: https://github.com/yandex/ClickHouse/pull/6878#discussion_r324902295 v2: do not link jemalloc if it is static --- dbms/CMakeLists.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt index 2c8f99b9738..2add5576b11 100644 --- a/dbms/CMakeLists.txt +++ b/dbms/CMakeLists.txt @@ -414,6 +414,16 @@ endif() if (USE_JEMALLOC) dbms_target_include_directories (SYSTEM BEFORE PRIVATE ${JEMALLOC_INCLUDE_DIR}) # used in Interpreters/AsynchronousMetrics.cpp target_include_directories (clickhouse_common_io SYSTEM BEFORE PRIVATE ${JEMALLOC_INCLUDE_DIR}) # new_delete.cpp + # common/memory.h + if (MAKE_STATIC_LIBRARIES OR NOT SPLIT_SHARED_LIBRARIES) + # skip if we have bundled build, since jemalloc is static in this case + elseif (${JEMALLOC_LIBRARIES} MATCHES "${CMAKE_STATIC_LIBRARY_SUFFIX}$") + # if the library is static we do not need to link with it, + # since in this case it will be in libs/libcommon, + # and we do not want to link with jemalloc multiple times. 
+ else() + target_link_libraries(clickhouse_common_io PRIVATE ${JEMALLOC_LIBRARIES}) + endif() endif () dbms_target_include_directories (PUBLIC ${DBMS_INCLUDE_DIR} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/src/Formats/include) From 6c87f2340a09eedc5850c9a4f8188a3e8fa43ca2 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Wed, 18 Sep 2019 23:13:26 +0300 Subject: [PATCH 079/102] Update CHANGELOG.md --- CHANGELOG.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a229b3c9639..f7ac2c41ac6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -123,6 +123,7 @@ * Fix two vulnerabilities in Codecs in decompression phase. [#6670](https://github.com/yandex/ClickHouse/pull/6670) ([Artem Zuikov](https://github.com/4ertus2)) ### Improvement +* Correct implementation of ternary logic for `AND/OR`. [#6048](https://github.com/yandex/ClickHouse/pull/6048) ([Alexander Kazakov](https://github.com/Akazz)) * Now values and rows with expired TTL will be removed after `OPTIMIZE ... FINAL` query from old parts without TTL infos or with outdated TTL infos, e.g. after `ALTER ... MODIFY TTL` query. Added queries `SYSTEM STOP/START TTL MERGES` to disallow/allow assign merges with TTL and filter expired values in all merges. [#6274](https://github.com/yandex/ClickHouse/pull/6274) ([Anton Popov](https://github.com/CurtizJ)) * Remove `dry_run` flag from `InterpreterSelectQuery`. ... [#6375](https://github.com/yandex/ClickHouse/pull/6375) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) * Support `ASOF JOIN` with `ON` section. [#6211](https://github.com/yandex/ClickHouse/pull/6211) ([Artem Zuikov](https://github.com/4ertus2)) @@ -279,12 +280,10 @@ * Fix splitted build. [#6618](https://github.com/yandex/ClickHouse/pull/6618) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Other build fixes: [#6186](https://github.com/yandex/ClickHouse/pull/6186) ([Amos Bird](https://github.com/amosbird)) [#6486](https://github.com/yandex/ClickHouse/pull/6486) [#6348](https://github.com/yandex/ClickHouse/pull/6348) ([vxider](https://github.com/Vxider)) [#6744](https://github.com/yandex/ClickHouse/pull/6744) ([Ivan](https://github.com/abyss7)) [#6016](https://github.com/yandex/ClickHouse/pull/6016) [#6421](https://github.com/yandex/ClickHouse/pull/6421) [#6491](https://github.com/yandex/ClickHouse/pull/6491) ([proller](https://github.com/proller)) -### SQL compatibility -* Correct implementation of ternary logic for `AND/OR`. [#6048](https://github.com/yandex/ClickHouse/pull/6048) ([Alexander Kazakov](https://github.com/Akazz)) - ### Backward Incompatible Change * Disable `ANY RIGHT JOIN` and `ANY FULL JOIN` by default. Set `any_join_get_any_from_right_table` setting to enable them. [#5126](https://github.com/yandex/ClickHouse/issues/5126) [#6351](https://github.com/yandex/ClickHouse/pull/6351) ([Artem Zuikov](https://github.com/4ertus2)) + ## ClickHouse release 19.13.3.26, 2019-08-22 ### Bug Fix From dfa9b7fa86f13f6ef7bf90944d7bf3e0b3e838b3 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Wed, 18 Sep 2019 23:20:15 +0300 Subject: [PATCH 080/102] Update CHANGELOG.md --- CHANGELOG.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f7ac2c41ac6..337e19fa446 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,16 +1,17 @@ ## ClickHouse release 19.14.3.3, 2019-09-10 ### New Feature +* `WITH FILL` modifier for `ORDER BY`. 
(continuation of [#5069](https://github.com/yandex/ClickHouse/issues/5069)) [#6610](https://github.com/yandex/ClickHouse/pull/6610) ([Anton Popov](https://github.com/CurtizJ)) +* `WITH TIES` modifier for `LIMIT`. (continuation of [#5069](https://github.com/yandex/ClickHouse/issues/5069)) [#6610](https://github.com/yandex/ClickHouse/pull/6610) ([Anton Popov](https://github.com/CurtizJ)) * Support for wildcards in paths of table functions `file` and `hdfs`. If the path contains wildcards, the table will be readonly. Example of usage: `select * from hdfs('hdfs://hdfs1:9000/some_dir/another_dir/*/file{0..9}{0..9}')` and `select * from file('some_dir/{some_file,another_file,yet_another}.tsv', 'TSV', 'value UInt32')`. [#6092](https://github.com/yandex/ClickHouse/pull/6092) ([Olga Khvostikova](https://github.com/stavrolia)) -* WITH TIES modifier for LIMIT and WITH FILL modifier for ORDER BY. (continuation of [#5069](https://github.com/yandex/ClickHouse/issues/5069)) [#6610](https://github.com/yandex/ClickHouse/pull/6610) ([Anton Popov](https://github.com/CurtizJ)) -* Implement support for INSERT-query with Kafka tables. [#6012](https://github.com/yandex/ClickHouse/pull/6012) ([Ivan](https://github.com/abyss7)) +* Implement support for INSERT query with `Kafka` tables. [#6012](https://github.com/yandex/ClickHouse/pull/6012) ([Ivan](https://github.com/abyss7)) +* New `system.metric_log` table which stores values of `system.events` and `system.metrics` with specified time interval. [#6363](https://github.com/yandex/ClickHouse/issues/6363) [#6467](https://github.com/yandex/ClickHouse/pull/6467) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) * Allow to write ClickHouse text logs to `system.text_log` table. [#6037](https://github.com/yandex/ClickHouse/issues/6037) [#6103](https://github.com/yandex/ClickHouse/pull/6103) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) [#6164](https://github.com/yandex/ClickHouse/pull/6164) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Ability to read from VALUES list proposed in [#5984](https://github.com/yandex/ClickHouse/issues/5984). Example: `SELECT * FROM VALUES ('a UInt64, s String', (1, 'one'), (2, 'two'), (3, 'three'))`. Fixed error while parsing of columns list from string if type contained a comma [#6217](https://github.com/yandex/ClickHouse/issues/6217). [#6209](https://github.com/yandex/ClickHouse/pull/6209) ([dimarub2000](https://github.com/dimarub2000)) +* Table function `values` (the name is case-insensitive). It allows to read from `VALUES` list proposed in [#5984](https://github.com/yandex/ClickHouse/issues/5984). Example: `SELECT * FROM VALUES('a UInt64, s String', (1, 'one'), (2, 'two'), (3, 'three'))`. [#6217](https://github.com/yandex/ClickHouse/issues/6217). [#6209](https://github.com/yandex/ClickHouse/pull/6209) ([dimarub2000](https://github.com/dimarub2000)) * Added an ability to alter storage settings. Syntax: `ALTER TABLE
MODIFY SETTING = `. [#6366](https://github.com/yandex/ClickHouse/pull/6366) [#6669](https://github.com/yandex/ClickHouse/pull/6669) [#6685](https://github.com/yandex/ClickHouse/pull/6685) ([alesapin](https://github.com/alesapin)) * Support for removing of detached parts. Syntax: `ALTER TABLE DROP DETACHED PART ''`. [#6158](https://github.com/yandex/ClickHouse/pull/6158) ([tavplubix](https://github.com/tavplubix)) * Table constraints. Allows to add constraint to table definition which will be checked at insert. [#5273](https://github.com/yandex/ClickHouse/pull/5273) ([Gleb Novikov](https://github.com/NanoBjorn)) [#6652](https://github.com/yandex/ClickHouse/pull/6652) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Allow to insert into recursive materialized view. [#6324](https://github.com/yandex/ClickHouse/pull/6324) ([Amos Bird](https://github.com/amosbird)) -* New `system.metric_log` table which stores values of ProfileEvents and CurrentMetrics within specified interval. [#6363](https://github.com/yandex/ClickHouse/issues/6363) [#6467](https://github.com/yandex/ClickHouse/pull/6467) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) * Turn on query profiler by default to sample every query execution thread once a second. [#6283](https://github.com/yandex/ClickHouse/pull/6283) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Input format ORC. [#6454](https://github.com/yandex/ClickHouse/pull/6454) ([akonyaev90](https://github.com/akonyaev90)) * Using `FastOps` library for functions `exp`, `log`, `sigmoid`, `tanh`. Added two new functions: `sigmoid` and `tanh`. FastOps is a fast vector math library from Michael Parakhin (Yandex CTO). Improved performance of `exp` and `log` functions more than 6 times. The functions `exp` and `log` from `Float32` argument will return `Float32` (in previous versions they always return `Float64`). Now `exp(nan)` may return `inf`. The result of `exp` and `log` functions may be not the nearest machine representable number to the true answer. [#6254](https://github.com/yandex/ClickHouse/pull/6254) ([alexey-milovidov](https://github.com/alexey-milovidov)) @@ -118,6 +119,7 @@ * Fix `FormatFactory` behaviour for input streams which are not implemented as processor. [#6495](https://github.com/yandex/ClickHouse/pull/6495) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) * Fixed typo. [#6631](https://github.com/yandex/ClickHouse/pull/6631) ([Alex Ryndin](https://github.com/alexryndin)) * Typo in the error message ( is -> are ). [#6839](https://github.com/yandex/ClickHouse/pull/6839) ([Denis Zhuravlev](https://github.com/den-crane)) +* Fixed error while parsing of columns list from string if type contained a comma (this issue was relevant for `File`, `URL`, `HDFS` storages) [#6217](https://github.com/yandex/ClickHouse/issues/6217). [#6209](https://github.com/yandex/ClickHouse/pull/6209) ([dimarub2000](https://github.com/dimarub2000)) ### Security Fix * Fix two vulnerabilities in Codecs in decompression phase. 
[#6670](https://github.com/yandex/ClickHouse/pull/6670) ([Artem Zuikov](https://github.com/4ertus2)) From 144803d3080ec5508864db6c0555a9312055dd70 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Wed, 18 Sep 2019 23:23:31 +0300 Subject: [PATCH 081/102] Update CHANGELOG.md --- CHANGELOG.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 337e19fa446..46b908c7369 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,18 +11,16 @@ * Added an ability to alter storage settings. Syntax: `ALTER TABLE
MODIFY SETTING = `. [#6366](https://github.com/yandex/ClickHouse/pull/6366) [#6669](https://github.com/yandex/ClickHouse/pull/6669) [#6685](https://github.com/yandex/ClickHouse/pull/6685) ([alesapin](https://github.com/alesapin)) * Support for removing of detached parts. Syntax: `ALTER TABLE DROP DETACHED PART ''`. [#6158](https://github.com/yandex/ClickHouse/pull/6158) ([tavplubix](https://github.com/tavplubix)) * Table constraints. Allows to add constraint to table definition which will be checked at insert. [#5273](https://github.com/yandex/ClickHouse/pull/5273) ([Gleb Novikov](https://github.com/NanoBjorn)) [#6652](https://github.com/yandex/ClickHouse/pull/6652) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Allow to insert into recursive materialized view. [#6324](https://github.com/yandex/ClickHouse/pull/6324) ([Amos Bird](https://github.com/amosbird)) +* Suppport for cascaded materialized views. [#6324](https://github.com/yandex/ClickHouse/pull/6324) ([Amos Bird](https://github.com/amosbird)) * Turn on query profiler by default to sample every query execution thread once a second. [#6283](https://github.com/yandex/ClickHouse/pull/6283) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Input format ORC. [#6454](https://github.com/yandex/ClickHouse/pull/6454) ([akonyaev90](https://github.com/akonyaev90)) +* Input format `ORC`. [#6454](https://github.com/yandex/ClickHouse/pull/6454) ([akonyaev90](https://github.com/akonyaev90)) * Using `FastOps` library for functions `exp`, `log`, `sigmoid`, `tanh`. Added two new functions: `sigmoid` and `tanh`. FastOps is a fast vector math library from Michael Parakhin (Yandex CTO). Improved performance of `exp` and `log` functions more than 6 times. The functions `exp` and `log` from `Float32` argument will return `Float32` (in previous versions they always return `Float64`). Now `exp(nan)` may return `inf`. The result of `exp` and `log` functions may be not the nearest machine representable number to the true answer. [#6254](https://github.com/yandex/ClickHouse/pull/6254) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Function `hasToken(haystack, token)`, `hasTokenCaseInsensitive(haystack, token)` to check if given token is in haystack. Token is a maximal length substring between two non alphanumeric ASCII characters (or boundaries of haystack). Token must be a constant string. Supported by tokenbf_v1 index specialization. [#6596](https://github.com/yandex/ClickHouse/pull/6596), [#6662](https://github.com/yandex/ClickHouse/pull/6662) ([Vasily Nemkov](https://github.com/Enmk)) -* New function `neighbor(value, offset[, default_value])`. Allows to reach prev/next value within column. [#5925](https://github.com/yandex/ClickHouse/pull/5925) ([Alex Krash](https://github.com/alex-krash)) [6685365ab8c5b74f9650492c88a012596eb1b0c6](https://github.com/yandex/ClickHouse/commit/6685365ab8c5b74f9650492c88a012596eb1b0c6) [341e2e4587a18065c2da1ca888c73389f48ce36c](https://github.com/yandex/ClickHouse/commit/341e2e4587a18065c2da1ca888c73389f48ce36c) [Alexey Milovidov](https://github.com/alexey-milovidov) +* New function `neighbor(value, offset[, default_value])`. Allows to reach prev/next value within column in a block of data. 
[#5925](https://github.com/yandex/ClickHouse/pull/5925) ([Alex Krash](https://github.com/alex-krash)) [6685365ab8c5b74f9650492c88a012596eb1b0c6](https://github.com/yandex/ClickHouse/commit/6685365ab8c5b74f9650492c88a012596eb1b0c6) [341e2e4587a18065c2da1ca888c73389f48ce36c](https://github.com/yandex/ClickHouse/commit/341e2e4587a18065c2da1ca888c73389f48ce36c) [Alexey Milovidov](https://github.com/alexey-milovidov) * Created a function `currentUser()`, returning login of authorized user. Added alias `user()` for compatibility with MySQL. [#6470](https://github.com/yandex/ClickHouse/pull/6470) ([Alex Krash](https://github.com/alex-krash)) * New aggregate functions `quantilesExactInclusive` and `quantilesExactExclusive` which were proposed in [#5885](https://github.com/yandex/ClickHouse/issues/5885). [#6477](https://github.com/yandex/ClickHouse/pull/6477) ([dimarub2000](https://github.com/dimarub2000)) * Function `bitmapRange(bitmap, range_begin, range_end)` which returns new set with specified range (not include the `range_end`). [#6314](https://github.com/yandex/ClickHouse/pull/6314) ([Zhichang Yu](https://github.com/yuzhichang)) * Function `geohashesInBox(longitude_min, latitude_min, longitude_max, latitude_max, precision)` which creates array of precision-long strings of geohash-boxes covering provided area. [#6127](https://github.com/yandex/ClickHouse/pull/6127) ([Vasily Nemkov](https://github.com/Enmk)) -* Added table function `numbers_mt`, which is multithreaded version of `numbers`. Updated performance tests with hash functions. [#6554](https://github.com/yandex/ClickHouse/pull/6554) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Comparison mode in `clickhouse-benchmark` [#6220](https://github.com/yandex/ClickHouse/issues/6220) [#6343](https://github.com/yandex/ClickHouse/pull/6343) ([dimarub2000](https://github.com/dimarub2000)) * Possibility to change the location of ClickHouse history file for client using `CLICKHOUSE_HISTORY_FILE` env. [#6840](https://github.com/yandex/ClickHouse/pull/6840) ([filimonov](https://github.com/filimonov)) * Added support for `_partition` and `_timestamp` virtual columns to Kafka engine. [#6400](https://github.com/yandex/ClickHouse/pull/6400) ([Ivan](https://github.com/abyss7)) * Possibility to remove sensitive data from `query_log`, server logs, process list with regexp-based rules. [#5710](https://github.com/yandex/ClickHouse/pull/5710) ([filimonov](https://github.com/filimonov)) @@ -209,6 +207,8 @@ * Fix performance bug in `Decimal` comparison. [#6380](https://github.com/yandex/ClickHouse/pull/6380) ([Artem Zuikov](https://github.com/4ertus2)) ### Build/Testing/Packaging Improvement +* Added table function `numbers_mt`, which is multithreaded version of `numbers`. Updated performance tests with hash functions. [#6554](https://github.com/yandex/ClickHouse/pull/6554) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Comparison mode in `clickhouse-benchmark` [#6220](https://github.com/yandex/ClickHouse/issues/6220) [#6343](https://github.com/yandex/ClickHouse/pull/6343) ([dimarub2000](https://github.com/dimarub2000)) * Best effort for printing stack traces. Also added SIGPROF as a debugging signal to print stack trace of a running thread. [#6529](https://github.com/yandex/ClickHouse/pull/6529) ([alexey-milovidov](https://github.com/alexey-milovidov)) * In debug version on Linux, increase OOM score. [#6152](https://github.com/yandex/ClickHouse/pull/6152) ([akuzm](https://github.com/akuzm)) * HDFS HA now work in debug build. 
[#6650](https://github.com/yandex/ClickHouse/pull/6650) ([Weiqing Xu](https://github.com/weiqxu)) From d9c99457ee7951d98086bf19a29ec5784cd3f922 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Wed, 18 Sep 2019 23:25:22 +0300 Subject: [PATCH 082/102] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 46b908c7369..32b0c32a58e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,7 +21,6 @@ * New aggregate functions `quantilesExactInclusive` and `quantilesExactExclusive` which were proposed in [#5885](https://github.com/yandex/ClickHouse/issues/5885). [#6477](https://github.com/yandex/ClickHouse/pull/6477) ([dimarub2000](https://github.com/dimarub2000)) * Function `bitmapRange(bitmap, range_begin, range_end)` which returns new set with specified range (not include the `range_end`). [#6314](https://github.com/yandex/ClickHouse/pull/6314) ([Zhichang Yu](https://github.com/yuzhichang)) * Function `geohashesInBox(longitude_min, latitude_min, longitude_max, latitude_max, precision)` which creates array of precision-long strings of geohash-boxes covering provided area. [#6127](https://github.com/yandex/ClickHouse/pull/6127) ([Vasily Nemkov](https://github.com/Enmk)) -* Possibility to change the location of ClickHouse history file for client using `CLICKHOUSE_HISTORY_FILE` env. [#6840](https://github.com/yandex/ClickHouse/pull/6840) ([filimonov](https://github.com/filimonov)) * Added support for `_partition` and `_timestamp` virtual columns to Kafka engine. [#6400](https://github.com/yandex/ClickHouse/pull/6400) ([Ivan](https://github.com/abyss7)) * Possibility to remove sensitive data from `query_log`, server logs, process list with regexp-based rules. [#5710](https://github.com/yandex/ClickHouse/pull/5710) ([filimonov](https://github.com/filimonov)) * Throw an exception if `config.d` file doesn't have the corresponding root element as the config file. [#6123](https://github.com/yandex/ClickHouse/pull/6123) ([dimarub2000](https://github.com/dimarub2000)) @@ -125,6 +124,7 @@ ### Improvement * Correct implementation of ternary logic for `AND/OR`. [#6048](https://github.com/yandex/ClickHouse/pull/6048) ([Alexander Kazakov](https://github.com/Akazz)) * Now values and rows with expired TTL will be removed after `OPTIMIZE ... FINAL` query from old parts without TTL infos or with outdated TTL infos, e.g. after `ALTER ... MODIFY TTL` query. Added queries `SYSTEM STOP/START TTL MERGES` to disallow/allow assign merges with TTL and filter expired values in all merges. [#6274](https://github.com/yandex/ClickHouse/pull/6274) ([Anton Popov](https://github.com/CurtizJ)) +* Possibility to change the location of ClickHouse history file for client using `CLICKHOUSE_HISTORY_FILE` env. [#6840](https://github.com/yandex/ClickHouse/pull/6840) ([filimonov](https://github.com/filimonov)) * Remove `dry_run` flag from `InterpreterSelectQuery`. ... [#6375](https://github.com/yandex/ClickHouse/pull/6375) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) * Support `ASOF JOIN` with `ON` section. [#6211](https://github.com/yandex/ClickHouse/pull/6211) ([Artem Zuikov](https://github.com/4ertus2)) * Better support of skip indexes for mutations and replication. Support for `MATERIALIZE/CLEAR INDEX ... IN PARTITION` query. `UPDATE x = x` recalculates all indices that use column `x`. 
[#5053](https://github.com/yandex/ClickHouse/pull/5053) ([Nikita Vasilev](https://github.com/nikvas0)) From 061bcf2819c95989058bd71d5b22cd528562907c Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Wed, 18 Sep 2019 23:29:32 +0300 Subject: [PATCH 083/102] Update CHANGELOG.md --- CHANGELOG.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 32b0c32a58e..9abe32b6bbc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -119,7 +119,7 @@ * Fixed error while parsing of columns list from string if type contained a comma (this issue was relevant for `File`, `URL`, `HDFS` storages) [#6217](https://github.com/yandex/ClickHouse/issues/6217). [#6209](https://github.com/yandex/ClickHouse/pull/6209) ([dimarub2000](https://github.com/dimarub2000)) ### Security Fix -* Fix two vulnerabilities in Codecs in decompression phase. [#6670](https://github.com/yandex/ClickHouse/pull/6670) ([Artem Zuikov](https://github.com/4ertus2)) +* Fix two vulnerabilities in codecs in decompression phase (malicious user can fabricate compressed data that will lead to buffer overflow in decompression). [#6670](https://github.com/yandex/ClickHouse/pull/6670) ([Artem Zuikov](https://github.com/4ertus2)) ### Improvement * Correct implementation of ternary logic for `AND/OR`. [#6048](https://github.com/yandex/ClickHouse/pull/6048) ([Alexander Kazakov](https://github.com/Akazz)) @@ -195,11 +195,9 @@ * Add Comments for set index functions. [#6319](https://github.com/yandex/ClickHouse/pull/6319) ([Nikita Vasilev](https://github.com/nikvas0)) ### Performance Improvement +* Optimize queries with `ORDER BY expressions` clause, where `expressions` have coinciding prefix with sorting key in `MergeTree` tables. This optimization is enabled with `optimize_read_in_order` setting. [#6054](https://github.com/yandex/ClickHouse/pull/6054) [#6629](https://github.com/yandex/ClickHouse/pull/6629) ([Anton Popov](https://github.com/CurtizJ)) * Implemented batch variant of updating aggregate function states. It may lead to performance benefits. [#6435](https://github.com/yandex/ClickHouse/pull/6435) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Optimize queries with `ORDER BY expressions` clause, where `expressions` have coinciding prefix with`ORDER` key in `MergeTree` tables. [#6054](https://github.com/yandex/ClickHouse/pull/6054) ([Anton Popov](https://github.com/CurtizJ)) -* Implement 'read in order' optimization with processors. [#6629](https://github.com/yandex/ClickHouse/pull/6629) ([Anton Popov](https://github.com/CurtizJ)) -* Disable consecutive key optimization for `UInt8/16`. [#6298](https://github.com/yandex/ClickHouse/pull/6298) ([akuzm](https://github.com/akuzm)) -* Disable consecutive key optimization for `UInt8/16` LowCardinality columns. [#6701](https://github.com/yandex/ClickHouse/pull/6701) ([akuzm](https://github.com/akuzm)) +* Disable consecutive key optimization for `UInt8/16`. [#6298](https://github.com/yandex/ClickHouse/pull/6298) [#6701](https://github.com/yandex/ClickHouse/pull/6701) ([akuzm](https://github.com/akuzm)) * Slightly improve performance of `MemoryTracker`. [#6653](https://github.com/yandex/ClickHouse/pull/6653) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Allow to use multiple threads during parts loading and removal. 
[#6372](https://github.com/yandex/ClickHouse/issues/6372) [#6074](https://github.com/yandex/ClickHouse/issues/6074) [#6438](https://github.com/yandex/ClickHouse/pull/6438) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Show private symbols in stack traces (this is done via parsing symbol tables of ELF files). Added information about file and line number in stack traces if debug info is present. Speedup symbol name lookup with indexing symbols present in program. Added new SQL functions for introspection: `demangle` and `addressToLine`. Renamed function `symbolizeAddress` to `addressToSymbol` for consistency. Function `addressToSymbol` will return mangled name for performance reasons and you have to apply `demangle`. Added setting `allow_introspection_functions` which is turned off by default. [#6201](https://github.com/yandex/ClickHouse/pull/6201) ([alexey-milovidov](https://github.com/alexey-milovidov)) From e17f54f5b5b0eed2f95792e913c4fc92cec3e58a Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Wed, 18 Sep 2019 23:31:26 +0300 Subject: [PATCH 084/102] Update CHANGELOG.md --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9abe32b6bbc..ab111fb36cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ * Suppport for cascaded materialized views. [#6324](https://github.com/yandex/ClickHouse/pull/6324) ([Amos Bird](https://github.com/amosbird)) * Turn on query profiler by default to sample every query execution thread once a second. [#6283](https://github.com/yandex/ClickHouse/pull/6283) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Input format `ORC`. [#6454](https://github.com/yandex/ClickHouse/pull/6454) ([akonyaev90](https://github.com/akonyaev90)) -* Using `FastOps` library for functions `exp`, `log`, `sigmoid`, `tanh`. Added two new functions: `sigmoid` and `tanh`. FastOps is a fast vector math library from Michael Parakhin (Yandex CTO). Improved performance of `exp` and `log` functions more than 6 times. The functions `exp` and `log` from `Float32` argument will return `Float32` (in previous versions they always return `Float64`). Now `exp(nan)` may return `inf`. The result of `exp` and `log` functions may be not the nearest machine representable number to the true answer. [#6254](https://github.com/yandex/ClickHouse/pull/6254) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added two new functions: `sigmoid` and `tanh` (that are useful for machine learning applications). [#6254](https://github.com/yandex/ClickHouse/pull/6254) ([alexey-milovidov] * Function `hasToken(haystack, token)`, `hasTokenCaseInsensitive(haystack, token)` to check if given token is in haystack. Token is a maximal length substring between two non alphanumeric ASCII characters (or boundaries of haystack). Token must be a constant string. Supported by tokenbf_v1 index specialization. [#6596](https://github.com/yandex/ClickHouse/pull/6596), [#6662](https://github.com/yandex/ClickHouse/pull/6662) ([Vasily Nemkov](https://github.com/Enmk)) * New function `neighbor(value, offset[, default_value])`. Allows to reach prev/next value within column in a block of data. 
[#5925](https://github.com/yandex/ClickHouse/pull/5925) ([Alex Krash](https://github.com/alex-krash)) [6685365ab8c5b74f9650492c88a012596eb1b0c6](https://github.com/yandex/ClickHouse/commit/6685365ab8c5b74f9650492c88a012596eb1b0c6) [341e2e4587a18065c2da1ca888c73389f48ce36c](https://github.com/yandex/ClickHouse/commit/341e2e4587a18065c2da1ca888c73389f48ce36c) [Alexey Milovidov](https://github.com/alexey-milovidov) * Created a function `currentUser()`, returning login of authorized user. Added alias `user()` for compatibility with MySQL. [#6470](https://github.com/yandex/ClickHouse/pull/6470) ([Alex Krash](https://github.com/alex-krash)) @@ -197,6 +197,7 @@ ### Performance Improvement * Optimize queries with `ORDER BY expressions` clause, where `expressions` have coinciding prefix with sorting key in `MergeTree` tables. This optimization is enabled with `optimize_read_in_order` setting. [#6054](https://github.com/yandex/ClickHouse/pull/6054) [#6629](https://github.com/yandex/ClickHouse/pull/6629) ([Anton Popov](https://github.com/CurtizJ)) * Implemented batch variant of updating aggregate function states. It may lead to performance benefits. [#6435](https://github.com/yandex/ClickHouse/pull/6435) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Using `FastOps` library for functions `exp`, `log`, `sigmoid`, `tanh`. FastOps is a fast vector math library from Michael Parakhin (Yandex CTO). Improved performance of `exp` and `log` functions more than 6 times. The functions `exp` and `log` from `Float32` argument will return `Float32` (in previous versions they always return `Float64`). Now `exp(nan)` may return `inf`. The result of `exp` and `log` functions may be not the nearest machine representable number to the true answer. [#6254](https://github.com/yandex/ClickHouse/pull/6254) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Disable consecutive key optimization for `UInt8/16`. [#6298](https://github.com/yandex/ClickHouse/pull/6298) [#6701](https://github.com/yandex/ClickHouse/pull/6701) ([akuzm](https://github.com/akuzm)) * Slightly improve performance of `MemoryTracker`. [#6653](https://github.com/yandex/ClickHouse/pull/6653) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Allow to use multiple threads during parts loading and removal. [#6372](https://github.com/yandex/ClickHouse/issues/6372) [#6074](https://github.com/yandex/ClickHouse/issues/6074) [#6438](https://github.com/yandex/ClickHouse/pull/6438) ([alexey-milovidov](https://github.com/alexey-milovidov)) From 36075fd7c64b210af422fc44aa5ec47409a69e9c Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Wed, 18 Sep 2019 23:32:10 +0300 Subject: [PATCH 085/102] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ab111fb36cd..ecb34884a7b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -195,7 +195,7 @@ * Add Comments for set index functions. [#6319](https://github.com/yandex/ClickHouse/pull/6319) ([Nikita Vasilev](https://github.com/nikvas0)) ### Performance Improvement -* Optimize queries with `ORDER BY expressions` clause, where `expressions` have coinciding prefix with sorting key in `MergeTree` tables. This optimization is enabled with `optimize_read_in_order` setting. 
[#6054](https://github.com/yandex/ClickHouse/pull/6054) [#6629](https://github.com/yandex/ClickHouse/pull/6629) ([Anton Popov](https://github.com/CurtizJ)) +* Optimize queries with `ORDER BY expressions` clause, where `expressions` have coinciding prefix with sorting key in `MergeTree` tables. This optimization is controlled by `optimize_read_in_order` setting. [#6054](https://github.com/yandex/ClickHouse/pull/6054) [#6629](https://github.com/yandex/ClickHouse/pull/6629) ([Anton Popov](https://github.com/CurtizJ)) * Implemented batch variant of updating aggregate function states. It may lead to performance benefits. [#6435](https://github.com/yandex/ClickHouse/pull/6435) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Using `FastOps` library for functions `exp`, `log`, `sigmoid`, `tanh`. FastOps is a fast vector math library from Michael Parakhin (Yandex CTO). Improved performance of `exp` and `log` functions more than 6 times. The functions `exp` and `log` from `Float32` argument will return `Float32` (in previous versions they always return `Float64`). Now `exp(nan)` may return `inf`. The result of `exp` and `log` functions may be not the nearest machine representable number to the true answer. [#6254](https://github.com/yandex/ClickHouse/pull/6254) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Disable consecutive key optimization for `UInt8/16`. [#6298](https://github.com/yandex/ClickHouse/pull/6298) [#6701](https://github.com/yandex/ClickHouse/pull/6701) ([akuzm](https://github.com/akuzm)) From 572583715d833aae467c21f8b28e91ec075cd054 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Wed, 18 Sep 2019 23:33:17 +0300 Subject: [PATCH 086/102] Update CHANGELOG.md --- CHANGELOG.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ecb34884a7b..2bb1afc56e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ * `WITH TIES` modifier for `LIMIT`. (continuation of [#5069](https://github.com/yandex/ClickHouse/issues/5069)) [#6610](https://github.com/yandex/ClickHouse/pull/6610) ([Anton Popov](https://github.com/CurtizJ)) * Support for wildcards in paths of table functions `file` and `hdfs`. If the path contains wildcards, the table will be readonly. Example of usage: `select * from hdfs('hdfs://hdfs1:9000/some_dir/another_dir/*/file{0..9}{0..9}')` and `select * from file('some_dir/{some_file,another_file,yet_another}.tsv', 'TSV', 'value UInt32')`. [#6092](https://github.com/yandex/ClickHouse/pull/6092) ([Olga Khvostikova](https://github.com/stavrolia)) * Implement support for INSERT query with `Kafka` tables. [#6012](https://github.com/yandex/ClickHouse/pull/6012) ([Ivan](https://github.com/abyss7)) -* New `system.metric_log` table which stores values of `system.events` and `system.metrics` with specified time interval. [#6363](https://github.com/yandex/ClickHouse/issues/6363) [#6467](https://github.com/yandex/ClickHouse/pull/6467) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) +* New `system.metric_log` table which stores values of `system.events` and `system.metrics` with specified time interval. [#6363](https://github.com/yandex/ClickHouse/issues/6363) [#6467](https://github.com/yandex/ClickHouse/pull/6467) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) [#6530](https://github.com/yandex/ClickHouse/pull/6530) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Allow to write ClickHouse text logs to `system.text_log` table. 
[#6037](https://github.com/yandex/ClickHouse/issues/6037) [#6103](https://github.com/yandex/ClickHouse/pull/6103) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) [#6164](https://github.com/yandex/ClickHouse/pull/6164) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Table function `values` (the name is case-insensitive). It allows to read from `VALUES` list proposed in [#5984](https://github.com/yandex/ClickHouse/issues/5984). Example: `SELECT * FROM VALUES('a UInt64, s String', (1, 'one'), (2, 'two'), (3, 'three'))`. [#6217](https://github.com/yandex/ClickHouse/issues/6217). [#6209](https://github.com/yandex/ClickHouse/pull/6209) ([dimarub2000](https://github.com/dimarub2000)) * Added an ability to alter storage settings. Syntax: `ALTER TABLE
MODIFY SETTING = `. [#6366](https://github.com/yandex/ClickHouse/pull/6366) [#6669](https://github.com/yandex/ClickHouse/pull/6669) [#6685](https://github.com/yandex/ClickHouse/pull/6685) ([alesapin](https://github.com/alesapin)) @@ -186,7 +186,6 @@ * Get rid of dynamic allocation in `ParsedJson::Iterator`. [#6479](https://github.com/yandex/ClickHouse/pull/6479) ([Vitaly Baranov](https://github.com/vitlibar)) * Text log simplification. [#6322](https://github.com/yandex/ClickHouse/pull/6322) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Every function in its own file, part 10. [#6321](https://github.com/yandex/ClickHouse/pull/6321) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Metric log rectification. [#6530](https://github.com/yandex/ClickHouse/pull/6530) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Remove doubled const `TABLE_IS_READ_ONLY`. [#6566](https://github.com/yandex/ClickHouse/pull/6566) ([filimonov](https://github.com/filimonov)) * Formatting changes for `StringHashMap` PR [#5417](https://github.com/yandex/ClickHouse/issues/5417). [#6700](https://github.com/yandex/ClickHouse/pull/6700) ([akuzm](https://github.com/akuzm)) * Remove a redundant condition (found by PVS Studio). [#6775](https://github.com/yandex/ClickHouse/pull/6775) ([akuzm](https://github.com/akuzm)) From 7c62336504d1d05faab4383f902278c9563ac91f Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Wed, 18 Sep 2019 23:35:54 +0300 Subject: [PATCH 087/102] Update CHANGELOG.md --- CHANGELOG.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2bb1afc56e3..05c8c96426a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ### New Feature * `WITH FILL` modifier for `ORDER BY`. (continuation of [#5069](https://github.com/yandex/ClickHouse/issues/5069)) [#6610](https://github.com/yandex/ClickHouse/pull/6610) ([Anton Popov](https://github.com/CurtizJ)) * `WITH TIES` modifier for `LIMIT`. (continuation of [#5069](https://github.com/yandex/ClickHouse/issues/5069)) [#6610](https://github.com/yandex/ClickHouse/pull/6610) ([Anton Popov](https://github.com/CurtizJ)) +* Consider unquoted `NULL` literal as `\N` (if setting `format_csv_unquoted_null_literal_as_null=1`). Initialize null fields with default values if data type of this field is not nullable (if setting `input_format_null_as_default=1`). [#5990](https://github.com/yandex/ClickHouse/issues/5990) [#6055](https://github.com/yandex/ClickHouse/pull/6055) ([tavplubix](https://github.com/tavplubix)) * Support for wildcards in paths of table functions `file` and `hdfs`. If the path contains wildcards, the table will be readonly. Example of usage: `select * from hdfs('hdfs://hdfs1:9000/some_dir/another_dir/*/file{0..9}{0..9}')` and `select * from file('some_dir/{some_file,another_file,yet_another}.tsv', 'TSV', 'value UInt32')`. [#6092](https://github.com/yandex/ClickHouse/pull/6092) ([Olga Khvostikova](https://github.com/stavrolia)) * Implement support for INSERT query with `Kafka` tables. [#6012](https://github.com/yandex/ClickHouse/pull/6012) ([Ivan](https://github.com/abyss7)) * New `system.metric_log` table which stores values of `system.events` and `system.metrics` with specified time interval. 
[#6363](https://github.com/yandex/ClickHouse/issues/6363) [#6467](https://github.com/yandex/ClickHouse/pull/6467) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) [#6530](https://github.com/yandex/ClickHouse/pull/6530) ([alexey-milovidov](https://github.com/alexey-milovidov)) @@ -23,8 +24,6 @@ * Function `geohashesInBox(longitude_min, latitude_min, longitude_max, latitude_max, precision)` which creates array of precision-long strings of geohash-boxes covering provided area. [#6127](https://github.com/yandex/ClickHouse/pull/6127) ([Vasily Nemkov](https://github.com/Enmk)) * Added support for `_partition` and `_timestamp` virtual columns to Kafka engine. [#6400](https://github.com/yandex/ClickHouse/pull/6400) ([Ivan](https://github.com/abyss7)) * Possibility to remove sensitive data from `query_log`, server logs, process list with regexp-based rules. [#5710](https://github.com/yandex/ClickHouse/pull/5710) ([filimonov](https://github.com/filimonov)) -* Throw an exception if `config.d` file doesn't have the corresponding root element as the config file. [#6123](https://github.com/yandex/ClickHouse/pull/6123) ([dimarub2000](https://github.com/dimarub2000)) -* Print extra info in exception message for `no space left on device`. [#6182](https://github.com/yandex/ClickHouse/issues/6182), [#6252](https://github.com/yandex/ClickHouse/issues/6252) [#6352](https://github.com/yandex/ClickHouse/pull/6352) ([tavplubix](https://github.com/tavplubix)) ### Experimental features * Input and output data format `Template`. Template format allows specify custom format string for input and output. [#4354](https://github.com/yandex/ClickHouse/issues/4354) [#6727](https://github.com/yandex/ClickHouse/pull/6727) ([tavplubix](https://github.com/tavplubix)) @@ -131,7 +130,8 @@ * Allow to `ATTACH` live views (for example, at the server startup) regardless to `allow_experimental_live_view` setting. [#6754](https://github.com/yandex/ClickHouse/pull/6754) ([alexey-milovidov](https://github.com/alexey-milovidov)) * For stack traces gathered by query profiler, do not include stack frames generated by the query profiler itself. [#6250](https://github.com/yandex/ClickHouse/pull/6250) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Now table functions `values`, `file`, `url`, `hdfs` have support for ALIAS columns. [#6255](https://github.com/yandex/ClickHouse/pull/6255) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Consider unquoted `NULL` literal as `\N` (if setting `format_csv_unquoted_null_literal_as_null=1`). Initialize null fields with default values if data type of this field is not nullable (if setting `input_format_null_as_default=1`). [#5990](https://github.com/yandex/ClickHouse/issues/5990) [#6055](https://github.com/yandex/ClickHouse/pull/6055) ([tavplubix](https://github.com/tavplubix)) +* Throw an exception if `config.d` file doesn't have the corresponding root element as the config file. [#6123](https://github.com/yandex/ClickHouse/pull/6123) ([dimarub2000](https://github.com/dimarub2000)) +* Print extra info in exception message for `no space left on device`. 
[#6182](https://github.com/yandex/ClickHouse/issues/6182), [#6252](https://github.com/yandex/ClickHouse/issues/6252) [#6352](https://github.com/yandex/ClickHouse/pull/6352) ([tavplubix](https://github.com/tavplubix)) * When determining shards of a `Distributed` table to be covered by a read query (for `optimize_skip_unused_shards` = 1) ClickHouse now checks conditions from both `prewhere` and `where` clauses of select statement. [#6521](https://github.com/yandex/ClickHouse/pull/6521) ([Alexander Kazakov](https://github.com/Akazz)) * Enabled `SIMDJSON` for machines without AVX2 but with SSE 4.2 and PCLMUL instruction set. [#6285](https://github.com/yandex/ClickHouse/issues/6285) [#6320](https://github.com/yandex/ClickHouse/pull/6320) ([alexey-milovidov](https://github.com/alexey-milovidov)) * ClickHouse can work on filesystems without `O_DIRECT` support (such as ZFS and BtrFS) without additional tuning. [#4449](https://github.com/yandex/ClickHouse/issues/4449) [#6730](https://github.com/yandex/ClickHouse/pull/6730) ([alexey-milovidov](https://github.com/alexey-milovidov)) From fd6ecaa1ef80ecd3fb460287c7ce9dbedc75bc2c Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Thu, 19 Sep 2019 00:05:04 +0300 Subject: [PATCH 088/102] Update CHANGELOG.md --- CHANGELOG.md | 113 ++++++++++++++++++++++++--------------------------- 1 file changed, 52 insertions(+), 61 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 05c8c96426a..a28e8965317 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,16 +5,16 @@ * `WITH TIES` modifier for `LIMIT`. (continuation of [#5069](https://github.com/yandex/ClickHouse/issues/5069)) [#6610](https://github.com/yandex/ClickHouse/pull/6610) ([Anton Popov](https://github.com/CurtizJ)) * Consider unquoted `NULL` literal as `\N` (if setting `format_csv_unquoted_null_literal_as_null=1`). Initialize null fields with default values if data type of this field is not nullable (if setting `input_format_null_as_default=1`). [#5990](https://github.com/yandex/ClickHouse/issues/5990) [#6055](https://github.com/yandex/ClickHouse/pull/6055) ([tavplubix](https://github.com/tavplubix)) * Support for wildcards in paths of table functions `file` and `hdfs`. If the path contains wildcards, the table will be readonly. Example of usage: `select * from hdfs('hdfs://hdfs1:9000/some_dir/another_dir/*/file{0..9}{0..9}')` and `select * from file('some_dir/{some_file,another_file,yet_another}.tsv', 'TSV', 'value UInt32')`. [#6092](https://github.com/yandex/ClickHouse/pull/6092) ([Olga Khvostikova](https://github.com/stavrolia)) -* Implement support for INSERT query with `Kafka` tables. [#6012](https://github.com/yandex/ClickHouse/pull/6012) ([Ivan](https://github.com/abyss7)) * New `system.metric_log` table which stores values of `system.events` and `system.metrics` with specified time interval. [#6363](https://github.com/yandex/ClickHouse/issues/6363) [#6467](https://github.com/yandex/ClickHouse/pull/6467) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) [#6530](https://github.com/yandex/ClickHouse/pull/6530) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Allow to write ClickHouse text logs to `system.text_log` table. [#6037](https://github.com/yandex/ClickHouse/issues/6037) [#6103](https://github.com/yandex/ClickHouse/pull/6103) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) [#6164](https://github.com/yandex/ClickHouse/pull/6164) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Allow to write ClickHouse text logs to `system.text_log` table. 
[#6037](https://github.com/yandex/ClickHouse/issues/6037) [#6103](https://github.com/yandex/ClickHouse/pull/6103) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) [#6164](https://github.com/yandex/ClickHouse/pull/6164) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Show private symbols in stack traces (this is done via parsing symbol tables of ELF files). Added information about file and line number in stack traces if debug info is present. Speedup symbol name lookup with indexing symbols present in program. Added new SQL functions for introspection: `demangle` and `addressToLine`. Renamed function `symbolizeAddress` to `addressToSymbol` for consistency. Function `addressToSymbol` will return mangled name for performance reasons and you have to apply `demangle`. Added setting `allow_introspection_functions` which is turned off by default. [#6201](https://github.com/yandex/ClickHouse/pull/6201) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Table function `values` (the name is case-insensitive). It allows to read from `VALUES` list proposed in [#5984](https://github.com/yandex/ClickHouse/issues/5984). Example: `SELECT * FROM VALUES('a UInt64, s String', (1, 'one'), (2, 'two'), (3, 'three'))`. [#6217](https://github.com/yandex/ClickHouse/issues/6217). [#6209](https://github.com/yandex/ClickHouse/pull/6209) ([dimarub2000](https://github.com/dimarub2000)) * Added an ability to alter storage settings. Syntax: `ALTER TABLE
<table> MODIFY SETTING <setting> = <value>`. [#6366](https://github.com/yandex/ClickHouse/pull/6366) [#6669](https://github.com/yandex/ClickHouse/pull/6669) [#6685](https://github.com/yandex/ClickHouse/pull/6685) ([alesapin](https://github.com/alesapin))
* Support for removing of detached parts. Syntax: `ALTER TABLE <table_name> DROP DETACHED PART '<part_name>'`. [#6158](https://github.com/yandex/ClickHouse/pull/6158) ([tavplubix](https://github.com/tavplubix))
* Table constraints. Allows to add constraint to table definition which will be checked at insert. [#5273](https://github.com/yandex/ClickHouse/pull/5273) ([Gleb Novikov](https://github.com/NanoBjorn)) [#6652](https://github.com/yandex/ClickHouse/pull/6652) ([alexey-milovidov](https://github.com/alexey-milovidov))
* Support for cascaded materialized views. [#6324](https://github.com/yandex/ClickHouse/pull/6324) ([Amos Bird](https://github.com/amosbird))
* Turn on query profiler by default to sample every query execution thread once a second. [#6283](https://github.com/yandex/ClickHouse/pull/6283) ([alexey-milovidov](https://github.com/alexey-milovidov))
* Input format `ORC`. [#6454](https://github.com/yandex/ClickHouse/pull/6454) [#6703](https://github.com/yandex/ClickHouse/pull/6703) ([akonyaev90](https://github.com/akonyaev90))
* Added two new functions: `sigmoid` and `tanh` (that are useful for machine learning applications). [#6254](https://github.com/yandex/ClickHouse/pull/6254) ([alexey-milovidov](https://github.com/alexey-milovidov))
* Function `hasToken(haystack, token)`, `hasTokenCaseInsensitive(haystack, token)` to check if given token is in haystack. Token is a maximal length substring between two non-alphanumeric ASCII characters (or boundaries of haystack). Token must be a constant string. Supported by tokenbf_v1 index specialization. [#6596](https://github.com/yandex/ClickHouse/pull/6596), [#6662](https://github.com/yandex/ClickHouse/pull/6662) ([Vasily Nemkov](https://github.com/Enmk))
* New function `neighbor(value, offset[, default_value])`. Allows to reach prev/next value within column in a block of data. [#5925](https://github.com/yandex/ClickHouse/pull/5925) ([Alex Krash](https://github.com/alex-krash)) [6685365ab8c5b74f9650492c88a012596eb1b0c6](https://github.com/yandex/ClickHouse/commit/6685365ab8c5b74f9650492c88a012596eb1b0c6) [341e2e4587a18065c2da1ca888c73389f48ce36c](https://github.com/yandex/ClickHouse/commit/341e2e4587a18065c2da1ca888c73389f48ce36c) [Alexey Milovidov](https://github.com/alexey-milovidov)
@@ -22,28 +22,29 @@
* New aggregate functions `quantilesExactInclusive` and `quantilesExactExclusive` which were proposed in [#5885](https://github.com/yandex/ClickHouse/issues/5885). [#6477](https://github.com/yandex/ClickHouse/pull/6477) ([dimarub2000](https://github.com/dimarub2000))
* Function `bitmapRange(bitmap, range_begin, range_end)` which returns new set with specified range (not including the `range_end`). [#6314](https://github.com/yandex/ClickHouse/pull/6314) ([Zhichang Yu](https://github.com/yuzhichang))
* Function `geohashesInBox(longitude_min, latitude_min, longitude_max, latitude_max, precision)` which creates array of precision-long strings of geohash-boxes covering provided area. [#6127](https://github.com/yandex/ClickHouse/pull/6127) ([Vasily Nemkov](https://github.com/Enmk))
+* Implement support for INSERT query with `Kafka` tables.
[#6012](https://github.com/yandex/ClickHouse/pull/6012) ([Ivan](https://github.com/abyss7)) * Added support for `_partition` and `_timestamp` virtual columns to Kafka engine. [#6400](https://github.com/yandex/ClickHouse/pull/6400) ([Ivan](https://github.com/abyss7)) * Possibility to remove sensitive data from `query_log`, server logs, process list with regexp-based rules. [#5710](https://github.com/yandex/ClickHouse/pull/5710) ([filimonov](https://github.com/filimonov)) -### Experimental features -* Input and output data format `Template`. Template format allows specify custom format string for input and output. [#4354](https://github.com/yandex/ClickHouse/issues/4354) [#6727](https://github.com/yandex/ClickHouse/pull/6727) ([tavplubix](https://github.com/tavplubix)) -* Implementation of LIVE VIEW tables that were originally proposed in [#3925](https://github.com/yandex/ClickHouse/issues/3925), and then updated in [#5541](https://github.com/yandex/ClickHouse/issues/5541). Note that currently only LIVE VIEW tables are supported. See [#5541](https://github.com/yandex/ClickHouse/issues/5541) for detailed description. [#5541](https://github.com/yandex/ClickHouse/issues/5541) ([vzakaznikov](https://github.com/vzakaznikov)) [#6425](https://github.com/yandex/ClickHouse/pull/6425) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +### Experimental Feature +* Input and output data format `Template`. It allows to specify custom format string for input and output. [#4354](https://github.com/yandex/ClickHouse/issues/4354) [#6727](https://github.com/yandex/ClickHouse/pull/6727) ([tavplubix](https://github.com/tavplubix)) +* Implementation of `LIVE VIEW` tables that were originally proposed in [#3925](https://github.com/yandex/ClickHouse/issues/3925), and then updated in [#5541](https://github.com/yandex/ClickHouse/issues/5541). See [#5541](https://github.com/yandex/ClickHouse/issues/5541) for detailed description. [#5541](https://github.com/yandex/ClickHouse/issues/5541) ([vzakaznikov](https://github.com/vzakaznikov)) [#6425](https://github.com/yandex/ClickHouse/pull/6425) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) [#6656](https://github.com/yandex/ClickHouse/pull/6656) ([vzakaznikov](https://github.com/vzakaznikov)) Note that `LIVE VIEW` feature may be removed in next versions. ### Bug Fix * Fix segmentation fault when the table has skip indices and vertical merge happens. [#6723](https://github.com/yandex/ClickHouse/pull/6723) ([alesapin](https://github.com/alesapin)) -* Fix `Key expression contains comparison between inconvertible types` exception in `bitmapContains` function. [#6136](https://github.com/yandex/ClickHouse/issues/6136) [#6146](https://github.com/yandex/ClickHouse/issues/6146) [#6156](https://github.com/yandex/ClickHouse/pull/6156) ([dimarub2000](https://github.com/dimarub2000)) -* Fix column TTL with user defaults. Previously in case of force TTL merge with `OPTIMIZE ... FINAL` query, expired values was replaced by type defaults instead of user defaults. [#6796](https://github.com/yandex/ClickHouse/pull/6796) ([Anton Popov](https://github.com/CurtizJ)) +* Fix per-column TTL with non-trivial column defaults. Previously in case of force TTL merge with `OPTIMIZE ... FINAL` query, expired values was replaced by type defaults instead of user-specified column defaults. [#6796](https://github.com/yandex/ClickHouse/pull/6796) ([Anton Popov](https://github.com/CurtizJ)) * Fix Kafka messages duplication problem on normal server restart. 
[#6597](https://github.com/yandex/ClickHouse/pull/6597) ([Ivan](https://github.com/abyss7)) +* Fixed infinite loop when reading Kafka messages. Do not pause/resume consumer on subscription at all - otherwise it may get paused indefinitely in some scenarios. [#6354](https://github.com/yandex/ClickHouse/pull/6354) ([Ivan](https://github.com/abyss7)) +* Fix `Key expression contains comparison between inconvertible types` exception in `bitmapContains` function. [#6136](https://github.com/yandex/ClickHouse/issues/6136) [#6146](https://github.com/yandex/ClickHouse/issues/6146) [#6156](https://github.com/yandex/ClickHouse/pull/6156) ([dimarub2000](https://github.com/dimarub2000)) * Fix segfault with enabled `optimize_skip_unused_shards` and missing sharding key. [#6384](https://github.com/yandex/ClickHouse/pull/6384) ([Anton Popov](https://github.com/CurtizJ)) +* Fixed wrong code in mutations that may lead to memory corruption. Fixed segfault with read of address `0x14c0` that may happed due to concurrent `DROP TABLE` and `SELECT` from `system.parts` or `system.parts_columns`. Fixed race condition in preparation of mutation queries. Fixed deadlock caused by `OPTIMIZE` of Replicated tables and concurrent modification operations like ALTERs. [#6514](https://github.com/yandex/ClickHouse/pull/6514) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Fix bug introduced in query profiler which leads to endless recv from socket. [#6386](https://github.com/yandex/ClickHouse/pull/6386) ([alesapin](https://github.com/alesapin)) -* Removed extra verbose logging from MySQL handler [#6389](https://github.com/yandex/ClickHouse/pull/6389) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Return ability to parse boolean settings from 'true' and 'false'. [#6278](https://github.com/yandex/ClickHouse/pull/6278) ([alesapin](https://github.com/alesapin)) -* Fix crash in `median` function over `Nullable(Decimal128)`. [#6378](https://github.com/yandex/ClickHouse/pull/6378) ([Artem Zuikov](https://github.com/4ertus2)) -* Fixed possible incomplete result returned by `SELECT` query with `WHERE` condition on primary key contained conversion to float type. It was caused by incorrect checking of monotonicity in `toFloat` function. [#6248](https://github.com/yandex/ClickHouse/issues/6248) [#6374](https://github.com/yandex/ClickHouse/pull/6374) ([dimarub2000](https://github.com/dimarub2000)) -* Fixed the possibility of a fabricated query to cause server crash due to stack overflow in SQL parser. Fixed the possibility of stack overflow in Merge and Distributed tables, materialized views and conditions for row-level security that involve subqueries. [#6433](https://github.com/yandex/ClickHouse/pull/6433) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Removed extra verbose logging in MySQL interface [#6389](https://github.com/yandex/ClickHouse/pull/6389) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Return ability to parse boolean settings from 'true' and 'false' in configuration file. [#6278](https://github.com/yandex/ClickHouse/pull/6278) ([alesapin](https://github.com/alesapin)) +* Fix crash in `quantile` and `median` function over `Nullable(Decimal128)`. [#6378](https://github.com/yandex/ClickHouse/pull/6378) ([Artem Zuikov](https://github.com/4ertus2)) +* Fixed possible incomplete result returned by `SELECT` query with `WHERE` condition on primary key contained conversion to Float type. It was caused by incorrect checking of monotonicity in `toFloat` function. 
[#6248](https://github.com/yandex/ClickHouse/issues/6248) [#6374](https://github.com/yandex/ClickHouse/pull/6374) ([dimarub2000](https://github.com/dimarub2000)) * Check `max_expanded_ast_elements` setting for mutations. Clear mutations after `TRUNCATE TABLE`. [#6205](https://github.com/yandex/ClickHouse/pull/6205) ([Winter Zhang](https://github.com/zhang2014)) * Fix excessive CPU usage while executing `JSONExtractRaw` function over a boolean value. [#6208](https://github.com/yandex/ClickHouse/pull/6208) ([Vitaly Baranov](https://github.com/vitlibar)) -* Fix kafka tests. [#6805](https://github.com/yandex/ClickHouse/pull/6805) ([Ivan](https://github.com/abyss7)) * Fixed an issue when long `ALTER UPDATE` or `ALTER DELETE` may prevent regular merges to run. Prevent mutations from executing if there is no enough free threads available. [#6502](https://github.com/yandex/ClickHouse/issues/6502) [#6617](https://github.com/yandex/ClickHouse/pull/6617) ([tavplubix](https://github.com/tavplubix)) * Fix JOIN results for key columns when used with `join_use_nulls`. Attach Nulls instead of columns defaults. [#6249](https://github.com/yandex/ClickHouse/pull/6249) ([Artem Zuikov](https://github.com/4ertus2)) * Fix `JSONExtract` function while extracting a `Tuple` from JSON. [#6718](https://github.com/yandex/ClickHouse/pull/6718) ([Vitaly Baranov](https://github.com/vitlibar)) @@ -53,7 +54,6 @@ * Fixed wrong behaviour of `nullIf` function for constant arguments. [#6518](https://github.com/yandex/ClickHouse/pull/6518) ([Guillaume Tassery](https://github.com/YiuRULE)) [#6580](https://github.com/yandex/ClickHouse/pull/6580) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Fixed bug in conversion of `LowCardinality` types in `AggregateFunctionFactory`. This fixes [#6257](https://github.com/yandex/ClickHouse/issues/6257). [#6281](https://github.com/yandex/ClickHouse/pull/6281) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) * Fixed possible data loss after `ALTER DELETE` query on table with skipping index. [#6224](https://github.com/yandex/ClickHouse/issues/6224) [#6282](https://github.com/yandex/ClickHouse/pull/6282) ([Nikita Vasilev](https://github.com/nikvas0)) -* Do not expose virtual columns in `system.columns` table. This is required for backward compatibility. [#6406](https://github.com/yandex/ClickHouse/pull/6406) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Fix wrong behavior and possible segfaults in `topK` and `topKWeighted` aggregated functions. [#6404](https://github.com/yandex/ClickHouse/pull/6404) ([Anton Popov](https://github.com/CurtizJ)) * Fixed unsafe code around `getIdentifier` function. [#6401](https://github.com/yandex/ClickHouse/issues/6401) [#6409](https://github.com/yandex/ClickHouse/pull/6409) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Fixed bug in MySQL wire protocol (is used while connecting to ClickHouse form MySQL client). Caused by heap buffer overflow in `PacketPayloadWriteBuffer`. [#6212](https://github.com/yandex/ClickHouse/pull/6212) ([Yuriy Baranov](https://github.com/yurriy)) @@ -64,22 +64,19 @@ * Fix rare bug with wrong memory allocation/deallocation in complex key cache dictionaries with string fields which leads to infinite memory consumption (looks like memory leak). Bug reproduces when string size was a power of two starting from eight (8, 16, 32, etc). 
[#6447](https://github.com/yandex/ClickHouse/pull/6447) ([alesapin](https://github.com/alesapin)) * Fixed Gorilla encoding on small sequences which caused exception `Cannot write after end of buffer`. [#6398](https://github.com/yandex/ClickHouse/issues/6398) [#6444](https://github.com/yandex/ClickHouse/pull/6444) ([Vasily Nemkov](https://github.com/Enmk)) * Fixed error with processing "timezone" in server configuration file. [#6709](https://github.com/yandex/ClickHouse/pull/6709) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Allow to use not nullable types in JOINs with ```join_use_nulls``` enabled. [#6705](https://github.com/yandex/ClickHouse/pull/6705) ([Artem Zuikov](https://github.com/4ertus2)) +* Allow to use not nullable types in JOINs with `join_use_nulls` enabled. [#6705](https://github.com/yandex/ClickHouse/pull/6705) ([Artem Zuikov](https://github.com/4ertus2)) * Disable `Poco::AbstractConfiguration` substitutions in query in `clickhouse-client`. [#6706](https://github.com/yandex/ClickHouse/pull/6706) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Fixed mismatched header in streams happened in case of reading from empty distributed table with sample and prewhere. [#6167](https://github.com/yandex/ClickHouse/issues/6167) ([Lixiang Qian](https://github.com/fancyqlx)) [#6823](https://github.com/yandex/ClickHouse/pull/6823) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) * Avoid deadlock in `REPLACE PARTITION`. [#6677](https://github.com/yandex/ClickHouse/pull/6677) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Query transformation for `MySQL`, `ODBC`, `JDBC` table functions now works properly for `SELECT WHERE` queries with multiple `AND` subqueries. [#6381](https://github.com/yandex/ClickHouse/issues/6381) [#6676](https://github.com/yandex/ClickHouse/pull/6676) ([dimarub2000](https://github.com/dimarub2000)) +* Query transformation for `MySQL`, `ODBC`, `JDBC` table functions now works properly for `SELECT WHERE` queries with multiple `AND` expressions. [#6381](https://github.com/yandex/ClickHouse/issues/6381) [#6676](https://github.com/yandex/ClickHouse/pull/6676) ([dimarub2000](https://github.com/dimarub2000)) * Fixed bug in function `arrayEnumerateUniqRanked`. [#6779](https://github.com/yandex/ClickHouse/pull/6779) ([proller](https://github.com/proller)) -* Fixed deserialization in `DataTypeAggregateFunction`. [6575](https://github.com/yandex/ClickHouse/issues/6575) [#6773](https://github.com/yandex/ClickHouse/pull/6773) ([Zhichang Yu](https://github.com/yuzhichang)) +* Fixed parsing of `AggregateFunction` values embedded in query. [6575](https://github.com/yandex/ClickHouse/issues/6575) [#6773](https://github.com/yandex/ClickHouse/pull/6773) ([Zhichang Yu](https://github.com/yuzhichang)) * Using `arrayReduce` for constant arguments may lead to segfault. [#6242](https://github.com/yandex/ClickHouse/issues/6242) [#6326](https://github.com/yandex/ClickHouse/pull/6326) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Fix inconsistent parts which can appear if replica was restored after `DROP PARTITION`. [#6522](https://github.com/yandex/ClickHouse/issues/6522) [#6523](https://github.com/yandex/ClickHouse/pull/6523) ([tavplubix](https://github.com/tavplubix)) -* Fixed infinite loop when reading Kafka messages. Do not pause/resume consumer on subscription at all - otherwise it may get paused indefinitely in some scenarios. 
[#6354](https://github.com/yandex/ClickHouse/pull/6354) ([Ivan](https://github.com/abyss7)) -* Fix crash when casting types to Decimal that do not support it. Throw exception instead. [#6297](https://github.com/yandex/ClickHouse/pull/6297) ([Artem Zuikov](https://github.com/4ertus2)) -* Security issue. If the attacker has write access to ZooKeeper and is able to run custom server available from the network where ClickHouse run, it can create custom-built malicious server that will act as ClickHouse replica and register it in ZooKeeper. When another replica will fetch data part from malicious replica, it can force clickhouse-server to write to arbitrary path on filesystem. Found by Eldar Zaitov, information security team at Yandex. [#6247](https://github.com/yandex/ClickHouse/pull/6247) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix crash when casting types to `Decimal` that do not support it. Throw exception instead. [#6297](https://github.com/yandex/ClickHouse/pull/6297) ([Artem Zuikov](https://github.com/4ertus2)) * Fixed hang in `JSONExtractRaw` function. [#6195](https://github.com/yandex/ClickHouse/issues/6195) [#6198](https://github.com/yandex/ClickHouse/pull/6198) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Fixed crash when using `IN` clause with a subquery with a tuple. [#6125](https://github.com/yandex/ClickHouse/issues/6125) [#6550](https://github.com/yandex/ClickHouse/pull/6550) ([tavplubix](https://github.com/tavplubix)) * Fixes the regression while pushing to materialized view. [#6415](https://github.com/yandex/ClickHouse/pull/6415) ([Ivan](https://github.com/abyss7)) -* Fix `CSV` parser. [#6426](https://github.com/yandex/ClickHouse/issues/6426) [#6559](https://github.com/yandex/ClickHouse/pull/6559) ([tavplubix](https://github.com/tavplubix)) * Fixed possible inconsistent state of table while executing `DROP` query for replicated table while zookeeper is not accessible. [#6045](https://github.com/yandex/ClickHouse/issues/6045) [#6413](https://github.com/yandex/ClickHouse/pull/6413) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) * Fix bug with incorrect skip indices serialization and aggregation with adaptive granularity. [#6594](https://github.com/yandex/ClickHouse/issues/6594). [#6748](https://github.com/yandex/ClickHouse/pull/6748) ([alesapin](https://github.com/alesapin)) * Fix `WITH ROLLUP` and `WITH CUBE` modifiers of `GROUP BY` with two-level aggregation. [#6225](https://github.com/yandex/ClickHouse/pull/6225) ([Anton Popov](https://github.com/CurtizJ)) @@ -109,7 +106,6 @@ * Fix bug opened by [#4405](https://github.com/yandex/ClickHouse/pull/4405) (since 19.4.0). Reproduces in queries to Distributed tables over MergeTree tables when we doesn't query any columns (`SELECT 1`). [#6236](https://github.com/yandex/ClickHouse/pull/6236) ([alesapin](https://github.com/alesapin)) * Fixed overflow in integer division of signed type to unsigned type. The behaviour was exactly as in C or C++ language (integer promotion rules) that may be surprising. Please note that the overflow is still possible when dividing large signed number to large unsigned number or vice-versa (but that case is less usual). The issue existed in all server versions. [#6214](https://github.com/yandex/ClickHouse/issues/6214) [#6233](https://github.com/yandex/ClickHouse/pull/6233) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Limit maximum sleep time for throttling when `max_execution_speed` or `max_execution_speed_bytes` is set. 
Fixed false errors like `Estimated query execution time (inf seconds) is too long`. [#5547](https://github.com/yandex/ClickHouse/issues/5547) [#6232](https://github.com/yandex/ClickHouse/pull/6232) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix TSan crash in `LiveView` no users thread. [#6656](https://github.com/yandex/ClickHouse/pull/6656) ([vzakaznikov](https://github.com/vzakaznikov)) * Fix useless `AST` check in Set index. [#6510](https://github.com/yandex/ClickHouse/issues/6510) [#6651](https://github.com/yandex/ClickHouse/pull/6651) ([Nikita Vasilev](https://github.com/nikvas0)) * Fixed issues about using `MATERIALIZED` columns and aliases in `MaterializedView`. [#448](https://github.com/yandex/ClickHouse/issues/448) [#3484](https://github.com/yandex/ClickHouse/issues/3484) [#3450](https://github.com/yandex/ClickHouse/issues/3450) [#2878](https://github.com/yandex/ClickHouse/issues/2878) [#2285](https://github.com/yandex/ClickHouse/issues/2285) [#3796](https://github.com/yandex/ClickHouse/pull/3796) ([Amos Bird](https://github.com/amosbird)) [#6316](https://github.com/yandex/ClickHouse/pull/6316) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Fix `FormatFactory` behaviour for input streams which are not implemented as processor. [#6495](https://github.com/yandex/ClickHouse/pull/6495) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) @@ -119,6 +115,8 @@ ### Security Fix * Fix two vulnerabilities in codecs in decompression phase (malicious user can fabricate compressed data that will lead to buffer overflow in decompression). [#6670](https://github.com/yandex/ClickHouse/pull/6670) ([Artem Zuikov](https://github.com/4ertus2)) +* If the attacker has write access to ZooKeeper and is able to run custom server available from the network where ClickHouse run, it can create custom-built malicious server that will act as ClickHouse replica and register it in ZooKeeper. When another replica will fetch data part from malicious replica, it can force clickhouse-server to write to arbitrary path on filesystem. Found by Eldar Zaitov, information security team at Yandex. [#6247](https://github.com/yandex/ClickHouse/pull/6247) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed the possibility of a fabricated query to cause server crash due to stack overflow in SQL parser. Fixed the possibility of stack overflow in Merge and Distributed tables, materialized views and conditions for row-level security that involve subqueries. [#6433](https://github.com/yandex/ClickHouse/pull/6433) ([alexey-milovidov](https://github.com/alexey-milovidov)) ### Improvement * Correct implementation of ternary logic for `AND/OR`. [#6048](https://github.com/yandex/ClickHouse/pull/6048) ([Alexander Kazakov](https://github.com/Akazz)) @@ -147,18 +145,17 @@ * Move Join object from `ExpressionAction` into `AnalyzedJoin`. `ExpressionAnalyzer` and `ExpressionAction` do not know about `Join` class anymore. Its logic is hidden by `AnalyzedJoin` iface. [#6801](https://github.com/yandex/ClickHouse/pull/6801) ([Artem Zuikov](https://github.com/4ertus2)) * Fixed possible deadlock of distributed queries when one of shards is localhost but the query is sent via network connection. [#6759](https://github.com/yandex/ClickHouse/pull/6759) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Changed semantic of multiple tables `RENAME` to avoid possible deadlocks. [#6757](https://github.com/yandex/ClickHouse/issues/6757). 
[#6756](https://github.com/yandex/ClickHouse/pull/6756) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Rewritten MySQL compatibility server to prevent loading full packet payload in memory. Decreased memory consumption for each connection to approximately `2*DBMS_DEFAULT_BUFFER_SIZE` (read/write buffers). [#5811](https://github.com/yandex/ClickHouse/pull/5811) ([Yuriy Baranov](https://github.com/yurriy)) +* Rewritten MySQL compatibility server to prevent loading full packet payload in memory. Decreased memory consumption for each connection to approximately `2 * DBMS_DEFAULT_BUFFER_SIZE` (read/write buffers). [#5811](https://github.com/yandex/ClickHouse/pull/5811) ([Yuriy Baranov](https://github.com/yurriy)) * Move AST alias interpreting logic out of parser that doesn't have to know anything about query semantics. [#6108](https://github.com/yandex/ClickHouse/pull/6108) ([Artem Zuikov](https://github.com/4ertus2)) * Slightly more safe parsing of `NamesAndTypesList`. [#6408](https://github.com/yandex/ClickHouse/issues/6408). [#6410](https://github.com/yandex/ClickHouse/pull/6410) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* clickhouse-copier: Allow use `where_condition` from config with `partition_key` alias in query for checking partition existence (Earlier it was used only in reading data queries). [#6577](https://github.com/yandex/ClickHouse/pull/6577) ([proller](https://github.com/proller)) +* `clickhouse-copier`: Allow use `where_condition` from config with `partition_key` alias in query for checking partition existence (Earlier it was used only in reading data queries). [#6577](https://github.com/yandex/ClickHouse/pull/6577) ([proller](https://github.com/proller)) * Added optional message argument in `throwIf`. ([#5772](https://github.com/yandex/ClickHouse/issues/5772)) [#6329](https://github.com/yandex/ClickHouse/pull/6329) ([Vdimir](https://github.com/Vdimir)) -* Server exception got while sending insertion data by is now being processed in client as well. [#5891](https://github.com/yandex/ClickHouse/issues/5891) [#6711](https://github.com/yandex/ClickHouse/pull/6711) ([dimarub2000](https://github.com/dimarub2000)) +* Server exception got while sending insertion data is now being processed in client as well. [#5891](https://github.com/yandex/ClickHouse/issues/5891) [#6711](https://github.com/yandex/ClickHouse/pull/6711) ([dimarub2000](https://github.com/dimarub2000)) * Added a metric `DistributedFilesToInsert` that shows the total number of files in filesystem that are selected to send to remote servers by Distributed tables. The number is summed across all shards. [#6600](https://github.com/yandex/ClickHouse/pull/6600) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Move most of JOINs prepare logic from `ExpressionAction/ExpressionAnalyzer` to `AnalyzedJoin`. [#6785](https://github.com/yandex/ClickHouse/pull/6785) ([Artem Zuikov](https://github.com/4ertus2)) * Fix TSan [warning](https://clickhouse-test-reports.s3.yandex.net/6399/c1c1d1daa98e199e620766f1bd06a5921050a00d/functional_stateful_tests_(thread).html) 'lock-order-inversion'. [#6740](https://github.com/yandex/ClickHouse/pull/6740) ([Vasily Nemkov](https://github.com/Enmk)) * Better information messages about lack of Linux capabilities. Logging fatal errors with "fatal" level, that will make it easier to find in `system.text_log`. 
[#6441](https://github.com/yandex/ClickHouse/pull/6441) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Better subquery for join creation in `ExpressionAnalyzer`. [#6824](https://github.com/yandex/ClickHouse/pull/6824) ([Artem Zuikov](https://github.com/4ertus2)) -* When enable dumping temporary data to the disk to restrict memory usage during `GROUP BY/SORT`, it didn't check the free disk space. The fix add a new setting `min_free_disk_space`, when the free disk space it smaller then the threshold, the query will stop and throw `ErrorCodes::NOT_ENOUGH_SPACE`. [#6678](https://github.com/yandex/ClickHouse/pull/6678) ([Weiqing Xu](https://github.com/weiqxu)) [#6691](https://github.com/yandex/ClickHouse/pull/6691) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* When enable dumping temporary data to the disk to restrict memory usage during `GROUP BY`, `ORDER BY`, it didn't check the free disk space. The fix add a new setting `min_free_disk_space`, when the free disk space it smaller then the threshold, the query will stop and throw `ErrorCodes::NOT_ENOUGH_SPACE`. [#6678](https://github.com/yandex/ClickHouse/pull/6678) ([Weiqing Xu](https://github.com/weiqxu)) [#6691](https://github.com/yandex/ClickHouse/pull/6691) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Removed recursive rwlock by thread. It makes no sense, because threads are reused between queries. `SELECT` query may acquire a lock in one thread, hold a lock from another thread and exit from first thread. In the same time, first thread can be reused by `DROP` query. This will lead to false "Attempt to acquire exclusive lock recursively" messages. [#6771](https://github.com/yandex/ClickHouse/pull/6771) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Split `ExpressionAnalyzer.appendJoin()`. Prepare a place in `ExpressionAnalyzer` for `MergeJoin`. [#6524](https://github.com/yandex/ClickHouse/pull/6524) ([Artem Zuikov](https://github.com/4ertus2)) * Added `mysql_native_password` authentication plugin to MySQL compatibility server. [#6194](https://github.com/yandex/ClickHouse/pull/6194) ([Yuriy Baranov](https://github.com/yurriy)) @@ -181,38 +178,35 @@ * Allow user to override `poll_interval` and `idle_connection_timeout` settings on connection. [#6230](https://github.com/yandex/ClickHouse/pull/6230) ([alexey-milovidov](https://github.com/alexey-milovidov)) * `MergeTree` now has an additional option `ttl_only_drop_parts` (disabled by default) to avoid partial pruning of parts, so that they dropped completely when all the rows in a part are expired. [#6191](https://github.com/yandex/ClickHouse/pull/6191) ([Sergi Vladykin](https://github.com/svladykin)) * Type checks for set index functions. Throw exception if function got a wrong type. This fixes fuzz test with UBSan. [#6511](https://github.com/yandex/ClickHouse/pull/6511) ([Nikita Vasilev](https://github.com/nikvas0)) -* Improve code quality of `LiveView`. [#6619](https://github.com/yandex/ClickHouse/pull/6619) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Make PairNoInit a simple struct. [#6277](https://github.com/yandex/ClickHouse/pull/6277) ([akuzm](https://github.com/akuzm)) -* Get rid of dynamic allocation in `ParsedJson::Iterator`. [#6479](https://github.com/yandex/ClickHouse/pull/6479) ([Vitaly Baranov](https://github.com/vitlibar)) -* Text log simplification. [#6322](https://github.com/yandex/ClickHouse/pull/6322) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Every function in its own file, part 10. 
[#6321](https://github.com/yandex/ClickHouse/pull/6321) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Remove doubled const `TABLE_IS_READ_ONLY`. [#6566](https://github.com/yandex/ClickHouse/pull/6566) ([filimonov](https://github.com/filimonov)) -* Formatting changes for `StringHashMap` PR [#5417](https://github.com/yandex/ClickHouse/issues/5417). [#6700](https://github.com/yandex/ClickHouse/pull/6700) ([akuzm](https://github.com/akuzm)) -* Remove a redundant condition (found by PVS Studio). [#6775](https://github.com/yandex/ClickHouse/pull/6775) ([akuzm](https://github.com/akuzm)) -* Separate the hash table interface for `ReverseIndex`. [#6672](https://github.com/yandex/ClickHouse/pull/6672) ([akuzm](https://github.com/akuzm)) -* Refactoring of settings. [#6689](https://github.com/yandex/ClickHouse/pull/6689) ([alesapin](https://github.com/alesapin)) -* Add Comments for set index functions. [#6319](https://github.com/yandex/ClickHouse/pull/6319) ([Nikita Vasilev](https://github.com/nikvas0)) ### Performance Improvement * Optimize queries with `ORDER BY expressions` clause, where `expressions` have coinciding prefix with sorting key in `MergeTree` tables. This optimization is controlled by `optimize_read_in_order` setting. [#6054](https://github.com/yandex/ClickHouse/pull/6054) [#6629](https://github.com/yandex/ClickHouse/pull/6629) ([Anton Popov](https://github.com/CurtizJ)) -* Implemented batch variant of updating aggregate function states. It may lead to performance benefits. [#6435](https://github.com/yandex/ClickHouse/pull/6435) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Using `FastOps` library for functions `exp`, `log`, `sigmoid`, `tanh`. FastOps is a fast vector math library from Michael Parakhin (Yandex CTO). Improved performance of `exp` and `log` functions more than 6 times. The functions `exp` and `log` from `Float32` argument will return `Float32` (in previous versions they always return `Float64`). Now `exp(nan)` may return `inf`. The result of `exp` and `log` functions may be not the nearest machine representable number to the true answer. [#6254](https://github.com/yandex/ClickHouse/pull/6254) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Disable consecutive key optimization for `UInt8/16`. [#6298](https://github.com/yandex/ClickHouse/pull/6298) [#6701](https://github.com/yandex/ClickHouse/pull/6701) ([akuzm](https://github.com/akuzm)) -* Slightly improve performance of `MemoryTracker`. [#6653](https://github.com/yandex/ClickHouse/pull/6653) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Allow to use multiple threads during parts loading and removal. [#6372](https://github.com/yandex/ClickHouse/issues/6372) [#6074](https://github.com/yandex/ClickHouse/issues/6074) [#6438](https://github.com/yandex/ClickHouse/pull/6438) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Show private symbols in stack traces (this is done via parsing symbol tables of ELF files). Added information about file and line number in stack traces if debug info is present. Speedup symbol name lookup with indexing symbols present in program. Added new SQL functions for introspection: `demangle` and `addressToLine`. Renamed function `symbolizeAddress` to `addressToSymbol` for consistency. Function `addressToSymbol` will return mangled name for performance reasons and you have to apply `demangle`. Added setting `allow_introspection_functions` which is turned off by default. 
[#6201](https://github.com/yandex/ClickHouse/pull/6201) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Implemented batch variant of updating aggregate function states. It may lead to performance benefits. [#6435](https://github.com/yandex/ClickHouse/pull/6435) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Using `FastOps` library for functions `exp`, `log`, `sigmoid`, `tanh`. FastOps is a fast vector math library from Michael Parakhin (Yandex CTO). Improved performance of `exp` and `log` functions more than 6 times. The functions `exp` and `log` from `Float32` argument will return `Float32` (in previous versions they always return `Float64`). Now `exp(nan)` may return `inf`. The result of `exp` and `log` functions may be not the nearest machine representable number to the true answer. [#6254](https://github.com/yandex/ClickHouse/pull/6254) ([alexey-milovidov](https://github.com/alexey-milovidov)) Using Danila Kutenin variant to make fastops working [#6317](https://github.com/yandex/ClickHouse/pull/6317) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Disable consecutive key optimization for `UInt8/16`. [#6298](https://github.com/yandex/ClickHouse/pull/6298) [#6701](https://github.com/yandex/ClickHouse/pull/6701) ([akuzm](https://github.com/akuzm)) +* Improved performance of `simdjson` library by getting rid of dynamic allocation in `ParsedJson::Iterator`. [#6479](https://github.com/yandex/ClickHouse/pull/6479) ([Vitaly Baranov](https://github.com/vitlibar)) * Pre-fault pages when allocating memory with `mmap()`. [#6667](https://github.com/yandex/ClickHouse/pull/6667) ([akuzm](https://github.com/akuzm)) * Fix performance bug in `Decimal` comparison. [#6380](https://github.com/yandex/ClickHouse/pull/6380) ([Artem Zuikov](https://github.com/4ertus2)) ### Build/Testing/Packaging Improvement +* Remove Compiler (runtime template instantiation) because we've win over it's performance. [#6646](https://github.com/yandex/ClickHouse/pull/6646) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added performance test to show degradation of performance in gcc-9 in more isolated way. [#6302](https://github.com/yandex/ClickHouse/pull/6302) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Added table function `numbers_mt`, which is multithreaded version of `numbers`. Updated performance tests with hash functions. [#6554](https://github.com/yandex/ClickHouse/pull/6554) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) * Comparison mode in `clickhouse-benchmark` [#6220](https://github.com/yandex/ClickHouse/issues/6220) [#6343](https://github.com/yandex/ClickHouse/pull/6343) ([dimarub2000](https://github.com/dimarub2000)) -* Best effort for printing stack traces. Also added SIGPROF as a debugging signal to print stack trace of a running thread. [#6529](https://github.com/yandex/ClickHouse/pull/6529) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* In debug version on Linux, increase OOM score. [#6152](https://github.com/yandex/ClickHouse/pull/6152) ([akuzm](https://github.com/akuzm)) +* Best effort for printing stack traces. Also added `SIGPROF` as a debugging signal to print stack trace of a running thread. [#6529](https://github.com/yandex/ClickHouse/pull/6529) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Every function in its own file, part 10. [#6321](https://github.com/yandex/ClickHouse/pull/6321) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Remove doubled const `TABLE_IS_READ_ONLY`. 
[#6566](https://github.com/yandex/ClickHouse/pull/6566) ([filimonov](https://github.com/filimonov)) +* Formatting changes for `StringHashMap` PR [#5417](https://github.com/yandex/ClickHouse/issues/5417). [#6700](https://github.com/yandex/ClickHouse/pull/6700) ([akuzm](https://github.com/akuzm)) +* Better subquery for join creation in `ExpressionAnalyzer`. [#6824](https://github.com/yandex/ClickHouse/pull/6824) ([Artem Zuikov](https://github.com/4ertus2)) +* Remove a redundant condition (found by PVS Studio). [#6775](https://github.com/yandex/ClickHouse/pull/6775) ([akuzm](https://github.com/akuzm)) +* Separate the hash table interface for `ReverseIndex`. [#6672](https://github.com/yandex/ClickHouse/pull/6672) ([akuzm](https://github.com/akuzm)) +* Refactoring of settings. [#6689](https://github.com/yandex/ClickHouse/pull/6689) ([alesapin](https://github.com/alesapin)) +* Add comments for `set` index functions. [#6319](https://github.com/yandex/ClickHouse/pull/6319) ([Nikita Vasilev](https://github.com/nikvas0)) +* Increase OOM score in debug version on Linux. [#6152](https://github.com/yandex/ClickHouse/pull/6152) ([akuzm](https://github.com/akuzm)) * HDFS HA now work in debug build. [#6650](https://github.com/yandex/ClickHouse/pull/6650) ([Weiqing Xu](https://github.com/weiqxu)) * Added a test to `transform_query_for_external_database`. [#6388](https://github.com/yandex/ClickHouse/pull/6388) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Add test for multiple materialized views for Kafka table. [#6509](https://github.com/yandex/ClickHouse/pull/6509) ([Ivan](https://github.com/abyss7)) -* Removed rarely used table function `catBoostPool` and storage `CatBoostPool`. If you have used this table function, please write email to `clickhouse-feedback@yandex-team.com`. Note that CatBoost integration remains and will be supported. [#6279](https://github.com/yandex/ClickHouse/pull/6279) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Make a better build scheme. [#6500](https://github.com/yandex/ClickHouse/pull/6500) ([Ivan](https://github.com/abyss7)) * Fixed `test_external_dictionaries` integration in case it was executed under non root user. [#6507](https://github.com/yandex/ClickHouse/pull/6507) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) * The bug reproduces when total size of written packets exceeds `DBMS_DEFAULT_BUFFER_SIZE`. [#6204](https://github.com/yandex/ClickHouse/pull/6204) ([Yuriy Baranov](https://github.com/yurriy)) @@ -240,22 +234,19 @@ * Switched from `boost::filesystem` to `std::filesystem` where appropriate. [#6253](https://github.com/yandex/ClickHouse/pull/6253) [#6385](https://github.com/yandex/ClickHouse/pull/6385) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Added RPM packages to website. [#6251](https://github.com/yandex/ClickHouse/pull/6251) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Add a test for fixed `Unknown identifier` exception in `IN` section. [#6708](https://github.com/yandex/ClickHouse/pull/6708) ([Artem Zuikov](https://github.com/4ertus2)) -* Added test for ORC input format. [#6703](https://github.com/yandex/ClickHouse/pull/6703) ([akonyaev90](https://github.com/akonyaev90)) * Simplify `shared_ptr_helper` because people facing difficulties understanding it. [#6675](https://github.com/yandex/ClickHouse/pull/6675) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Added performance tests for fixed Gorilla and DoubleDelta codec. 
[#6179](https://github.com/yandex/ClickHouse/pull/6179) ([Vasily Nemkov](https://github.com/Enmk)) * Split the integration test `test_dictionaries` into 4 separate tests. [#6776](https://github.com/yandex/ClickHouse/pull/6776) ([Vitaly Baranov](https://github.com/vitlibar)) -* Fix PVS warning in `PipelineExecutor`. [#6777](https://github.com/yandex/ClickHouse/pull/6777) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Allow to use library dictionary source with ASan. [#6482](https://github.com/yandex/ClickHouse/pull/6482) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix PVS-Studio warning in `PipelineExecutor`. [#6777](https://github.com/yandex/ClickHouse/pull/6777) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Allow to use `library` dictionary source with ASan. [#6482](https://github.com/yandex/ClickHouse/pull/6482) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Added option to generate changelog from a list of PRs. [#6350](https://github.com/yandex/ClickHouse/pull/6350) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Lock the `TinyLog` storage when reading. [#6226](https://github.com/yandex/ClickHouse/pull/6226) ([akuzm](https://github.com/akuzm)) * Check for broken symlinks in CI. [#6634](https://github.com/yandex/ClickHouse/pull/6634) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Increase timeout for "stack overflow" test because it may take a long time in debug build. [#6637](https://github.com/yandex/ClickHouse/pull/6637) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Remove Compiler (runtime template instantiation) because we've win over it's performance. [#6646](https://github.com/yandex/ClickHouse/pull/6646) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Added a check for double whitespaces. [#6643](https://github.com/yandex/ClickHouse/pull/6643) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix `new/delete` memory tracking then build with sanitizers. Tracking is not clear. It only prevents memory limit exceptions in tests. [#6450](https://github.com/yandex/ClickHouse/pull/6450) ([Artem Zuikov](https://github.com/4ertus2)) +* Fix `new/delete` memory tracking when build with sanitizers. Tracking is not clear. It only prevents memory limit exceptions in tests. [#6450](https://github.com/yandex/ClickHouse/pull/6450) ([Artem Zuikov](https://github.com/4ertus2)) * Enable back the check of undefined symbols while linking. [#6453](https://github.com/yandex/ClickHouse/pull/6453) ([Ivan](https://github.com/abyss7)) -* Avoid rebuilding `hyperscan` each day. [#6307](https://github.com/yandex/ClickHouse/pull/6307) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added performance test to show degradation of performance in gcc-9 in more isolated way. [#6302](https://github.com/yandex/ClickHouse/pull/6302) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Avoid rebuilding `hyperscan` every day. [#6307](https://github.com/yandex/ClickHouse/pull/6307) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Fixed UBSan report in `ProtobufWriter`. [#6163](https://github.com/yandex/ClickHouse/pull/6163) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Don't allow to use query profiler with sanitizers because it is not compatible. [#6769](https://github.com/yandex/ClickHouse/pull/6769) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Add test for reloading a dictionary after fail by timer. 
[#6114](https://github.com/yandex/ClickHouse/pull/6114) ([Vitaly Baranov](https://github.com/vitlibar)) @@ -263,24 +254,24 @@ * Added a test for bad URIs. [#6493](https://github.com/yandex/ClickHouse/pull/6493) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Added more checks to `CAST` function. This should get more information about segmentation fault in fuzzy test. [#6346](https://github.com/yandex/ClickHouse/pull/6346) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) * Added `gcc-9` support to `docker/builder` container that builds image locally. [#6333](https://github.com/yandex/ClickHouse/pull/6333) ([Gleb Novikov](https://github.com/NanoBjorn)) -* Test for primary index with `lowCardinality(String)`. [#5044](https://github.com/yandex/ClickHouse/issues/5044) [#6219](https://github.com/yandex/ClickHouse/pull/6219) ([dimarub2000](https://github.com/dimarub2000)) -* Using Danila Kutenin variant to make fastops working [#6317](https://github.com/yandex/ClickHouse/pull/6317) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Test for primary key with `LowCardinality(String)`. [#5044](https://github.com/yandex/ClickHouse/issues/5044) [#6219](https://github.com/yandex/ClickHouse/pull/6219) ([dimarub2000](https://github.com/dimarub2000)) * Fixed tests affected by slow stack traces printing. [#6315](https://github.com/yandex/ClickHouse/pull/6315) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Add a test case for crash in `groupUniqArray` fixed in [#6029](https://github.com/yandex/ClickHouse/pull/6029). [#4402](https://github.com/yandex/ClickHouse/issues/4402) [#6129](https://github.com/yandex/ClickHouse/pull/6129) ([akuzm](https://github.com/akuzm)) * Fixed indices mutations tests. [#6645](https://github.com/yandex/ClickHouse/pull/6645) ([Nikita Vasilev](https://github.com/nikvas0)) * Attempt to fix performance test. [#6392](https://github.com/yandex/ClickHouse/pull/6392) ([alexey-milovidov](https://github.com/alexey-milovidov)) * In performance test, do not read query log for queries we didn't run. [#6427](https://github.com/yandex/ClickHouse/pull/6427) ([akuzm](https://github.com/akuzm)) * Materialized view now could be created with any low cardinality types regardless to the setting about suspicious low cardinality types. [#6428](https://github.com/yandex/ClickHouse/pull/6428) ([Olga Khvostikova](https://github.com/stavrolia)) -* Fixed wrong code in mutations that may lead to memory corruption. Fixed segfault with read of address `0x14c0` that may happed due to concurrent `DROP TABLE` and `SELECT` from `system.parts` or `system.parts_columns`. Fixed race condition in preparation of mutation queries. Fixed deadlock caused by `OPTIMIZE` of Replicated tables and concurrent modification operations like ALTERs. [#6514](https://github.com/yandex/ClickHouse/pull/6514) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Updated tests for `send_logs_level` setting. [#6207](https://github.com/yandex/ClickHouse/pull/6207) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) * Fix build under gcc-8.2. [#6196](https://github.com/yandex/ClickHouse/pull/6196) ([Max Akhmedov](https://github.com/zlobober)) * Fix build with internal libc++. [#6724](https://github.com/yandex/ClickHouse/pull/6724) ([Ivan](https://github.com/abyss7)) * Fix shared build with `rdkafka` library [#6101](https://github.com/yandex/ClickHouse/pull/6101) ([Ivan](https://github.com/abyss7)) -* Fixes for Mac OS build. 
[#6390](https://github.com/yandex/ClickHouse/pull/6390) ([alexey-milovidov](https://github.com/alexey-milovidov)) [#6429](https://github.com/yandex/ClickHouse/pull/6429) ([alex-zaitsev](https://github.com/alex-zaitsev)) -* Fix splitted build. [#6618](https://github.com/yandex/ClickHouse/pull/6618) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixes for Mac OS build (incomplete). [#6390](https://github.com/yandex/ClickHouse/pull/6390) ([alexey-milovidov](https://github.com/alexey-milovidov)) [#6429](https://github.com/yandex/ClickHouse/pull/6429) ([alex-zaitsev](https://github.com/alex-zaitsev)) +* Fix "splitted" build. [#6618](https://github.com/yandex/ClickHouse/pull/6618) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Other build fixes: [#6186](https://github.com/yandex/ClickHouse/pull/6186) ([Amos Bird](https://github.com/amosbird)) [#6486](https://github.com/yandex/ClickHouse/pull/6486) [#6348](https://github.com/yandex/ClickHouse/pull/6348) ([vxider](https://github.com/Vxider)) [#6744](https://github.com/yandex/ClickHouse/pull/6744) ([Ivan](https://github.com/abyss7)) [#6016](https://github.com/yandex/ClickHouse/pull/6016) [#6421](https://github.com/yandex/ClickHouse/pull/6421) [#6491](https://github.com/yandex/ClickHouse/pull/6491) ([proller](https://github.com/proller)) +* Fix kafka tests. [#6805](https://github.com/yandex/ClickHouse/pull/6805) ([Ivan](https://github.com/abyss7)) ### Backward Incompatible Change +* Removed rarely used table function `catBoostPool` and storage `CatBoostPool`. If you have used this table function, please write email to `clickhouse-feedback@yandex-team.com`. Note that CatBoost integration remains and will be supported. [#6279](https://github.com/yandex/ClickHouse/pull/6279) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Disable `ANY RIGHT JOIN` and `ANY FULL JOIN` by default. Set `any_join_get_any_from_right_table` setting to enable them. [#5126](https://github.com/yandex/ClickHouse/issues/5126) [#6351](https://github.com/yandex/ClickHouse/pull/6351) ([Artem Zuikov](https://github.com/4ertus2)) @@ -297,7 +288,7 @@ ## ClickHouse release 19.13.2.19, 2019-08-14 ### New Feature -* Sampling profiler on query level. [Example](https://gist.github.com/alexey-milovidov/92758583dd41c24c360fdb8d6a4da194). [#4247](https://github.com/yandex/ClickHouse/issues/4247) ([laplab](https://github.com/laplab)) [#6124](https://github.com/yandex/ClickHouse/pull/6124) ([alexey-milovidov](https://github.com/alexey-milovidov)) [#6250](https://github.com/yandex/ClickHouse/pull/6250) [#6283](https://github.com/yandex/ClickHouse/pull/6283) [#6386](https://github.com/yandex/ClickHouse/pull/6386) +* Sampling profiler on query level. [Example](https://gist.github.com/alexey-milovidov/92758583dd41c24c360fdb8d6a4da194). [#4247](https://github.com/yandex/ClickHouse/issues/4247) ([laplab](https://github.com/laplab)) [#6124](https://github.com/yandex/ClickHouse/pull/6124) ([alexey-milovidov](https://github.com/alexey-milovidov)) [#6250](https://github.com/yandex/ClickHouse/pull/6250) [#6283](https://github.com/yandex/ClickHouse/pull/6283) [#6386](https://github.com/yandex/ClickHouse/pull/6386) * Allow to specify a list of columns with `COLUMNS('regexp')` expression that works like a more sophisticated variant of `*` asterisk. 
[#5951](https://github.com/yandex/ClickHouse/pull/5951) ([mfridental](https://github.com/mfridental)), ([alexey-milovidov](https://github.com/alexey-milovidov)) * `CREATE TABLE AS table_function()` is now possible [#6057](https://github.com/yandex/ClickHouse/pull/6057) ([dimarub2000](https://github.com/dimarub2000)) * Adam optimizer for stochastic gradient descent is used by default in `stochasticLinearRegression()` and `stochasticLogisticRegression()` aggregate functions, because it shows good quality without almost any tuning. [#6000](https://github.com/yandex/ClickHouse/pull/6000) ([Quid37](https://github.com/Quid37)) @@ -306,7 +297,7 @@ * Now client receive logs from server with any desired level by setting `send_logs_level` regardless to the log level specified in server settings. [#5964](https://github.com/yandex/ClickHouse/pull/5964) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) ### Backward Incompatible Change -* The setting `input_format_defaults_for_omitted_fields` is enabled by default. Inserts in Distibuted tables need this setting to be the same on cluster (you need to set it before rolling update). It enables calculation of complex default expressions for omitted fields in `JSONEachRow` and `CSV*` formats. It should be the expected behaviour but may lead to negligible performance difference. [#6043](https://github.com/yandex/ClickHouse/pull/6043) ([Artem Zuikov](https://github.com/4ertus2)), [#5625](https://github.com/yandex/ClickHouse/pull/5625) ([akuzm](https://github.com/akuzm)) +* The setting `input_format_defaults_for_omitted_fields` is enabled by default. Inserts in Distibuted tables need this setting to be the same on cluster (you need to set it before rolling update). It enables calculation of complex default expressions for omitted fields in `JSONEachRow` and `CSV*` formats. It should be the expected behaviour but may lead to negligible performance difference. [#6043](https://github.com/yandex/ClickHouse/pull/6043) ([Artem Zuikov](https://github.com/4ertus2)), [#5625](https://github.com/yandex/ClickHouse/pull/5625) ([akuzm](https://github.com/akuzm)) ### Experimental features * New query processing pipeline. Use `experimental_use_processors=1` option to enable it. Use for your own trouble. [#4914](https://github.com/yandex/ClickHouse/pull/4914) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) From 8bda1cabb3b3f1fee8280c6e8f163bbcf63fab7f Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Thu, 19 Sep 2019 00:09:10 +0300 Subject: [PATCH 089/102] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a28e8965317..34151dcdc8b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,7 +28,7 @@ ### Experimental Feature * Input and output data format `Template`. It allows to specify custom format string for input and output. [#4354](https://github.com/yandex/ClickHouse/issues/4354) [#6727](https://github.com/yandex/ClickHouse/pull/6727) ([tavplubix](https://github.com/tavplubix)) -* Implementation of `LIVE VIEW` tables that were originally proposed in [#3925](https://github.com/yandex/ClickHouse/issues/3925), and then updated in [#5541](https://github.com/yandex/ClickHouse/issues/5541). See [#5541](https://github.com/yandex/ClickHouse/issues/5541) for detailed description. 
[#5541](https://github.com/yandex/ClickHouse/issues/5541) ([vzakaznikov](https://github.com/vzakaznikov)) [#6425](https://github.com/yandex/ClickHouse/pull/6425) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) [#6656](https://github.com/yandex/ClickHouse/pull/6656) ([vzakaznikov](https://github.com/vzakaznikov)) Note that `LIVE VIEW` feature may be removed in next versions. +* Implementation of `LIVE VIEW` tables that were originally proposed in [#2898](https://github.com/yandex/ClickHouse/pull/2898), prepared in [#3925](https://github.com/yandex/ClickHouse/issues/3925), and then updated in [#5541](https://github.com/yandex/ClickHouse/issues/5541). See [#5541](https://github.com/yandex/ClickHouse/issues/5541) for detailed description. [#5541](https://github.com/yandex/ClickHouse/issues/5541) ([vzakaznikov](https://github.com/vzakaznikov)) [#6425](https://github.com/yandex/ClickHouse/pull/6425) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) [#6656](https://github.com/yandex/ClickHouse/pull/6656) ([vzakaznikov](https://github.com/vzakaznikov)) Note that `LIVE VIEW` feature may be removed in next versions. ### Bug Fix * Fix segmentation fault when the table has skip indices and vertical merge happens. [#6723](https://github.com/yandex/ClickHouse/pull/6723) ([alesapin](https://github.com/alesapin)) From fd73186a74f38f4b91e17bbb6221f27283208e02 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Thu, 19 Sep 2019 00:10:26 +0300 Subject: [PATCH 090/102] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 34151dcdc8b..4845a9951a5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ ### New Feature * `WITH FILL` modifier for `ORDER BY`. (continuation of [#5069](https://github.com/yandex/ClickHouse/issues/5069)) [#6610](https://github.com/yandex/ClickHouse/pull/6610) ([Anton Popov](https://github.com/CurtizJ)) * `WITH TIES` modifier for `LIMIT`. (continuation of [#5069](https://github.com/yandex/ClickHouse/issues/5069)) [#6610](https://github.com/yandex/ClickHouse/pull/6610) ([Anton Popov](https://github.com/CurtizJ)) -* Consider unquoted `NULL` literal as `\N` (if setting `format_csv_unquoted_null_literal_as_null=1`). Initialize null fields with default values if data type of this field is not nullable (if setting `input_format_null_as_default=1`). [#5990](https://github.com/yandex/ClickHouse/issues/5990) [#6055](https://github.com/yandex/ClickHouse/pull/6055) ([tavplubix](https://github.com/tavplubix)) +* Parse unquoted `NULL` literal as NULL (if setting `format_csv_unquoted_null_literal_as_null=1`). Initialize null fields with default values if data type of this field is not nullable (if setting `input_format_null_as_default=1`). [#5990](https://github.com/yandex/ClickHouse/issues/5990) [#6055](https://github.com/yandex/ClickHouse/pull/6055) ([tavplubix](https://github.com/tavplubix)) * Support for wildcards in paths of table functions `file` and `hdfs`. If the path contains wildcards, the table will be readonly. Example of usage: `select * from hdfs('hdfs://hdfs1:9000/some_dir/another_dir/*/file{0..9}{0..9}')` and `select * from file('some_dir/{some_file,another_file,yet_another}.tsv', 'TSV', 'value UInt32')`. [#6092](https://github.com/yandex/ClickHouse/pull/6092) ([Olga Khvostikova](https://github.com/stavrolia)) * New `system.metric_log` table which stores values of `system.events` and `system.metrics` with specified time interval. 
[#6363](https://github.com/yandex/ClickHouse/issues/6363) [#6467](https://github.com/yandex/ClickHouse/pull/6467) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) [#6530](https://github.com/yandex/ClickHouse/pull/6530) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Allow to write ClickHouse text logs to `system.text_log` table. [#6037](https://github.com/yandex/ClickHouse/issues/6037) [#6103](https://github.com/yandex/ClickHouse/pull/6103) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) [#6164](https://github.com/yandex/ClickHouse/pull/6164) ([alexey-milovidov](https://github.com/alexey-milovidov)) From 1327e71d803d38f36b5aeb395c3d7dccfde2faa8 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Thu, 19 Sep 2019 02:01:06 +0300 Subject: [PATCH 091/102] Changed C-style cast to constructor-style cast. --- .../AggregateFunctionGroupBitmapData.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h b/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h index 6243e60c9c5..220493a918c 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h +++ b/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h @@ -503,15 +503,15 @@ public: for (const auto & x : small) { T val = x.getValue(); - if ((UInt64)val < min_val) + if (UInt64(val) < min_val) { - min_val = (UInt64)val; + min_val = UInt64(val); } } } else { - min_val = (UInt64)roaring_bitmap_minimum(rb); + min_val = UInt64(roaring_bitmap_minimum(rb)); } return min_val; } @@ -524,15 +524,15 @@ public: for (const auto & x : small) { T val = x.getValue(); - if ((UInt64)val > max_val) + if (UInt64(val) > max_val) { - max_val = (UInt64)val; + max_val = UInt64(val); } } } else { - max_val = (UInt64)roaring_bitmap_maximum(rb); + max_val = UInt64(roaring_bitmap_maximum(rb)); } return max_val; } From 24d1b9b8c2e1353cb16960399b281882e3e345c4 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Thu, 19 Sep 2019 02:02:25 +0300 Subject: [PATCH 092/102] Update settings.md --- docs/en/operations/settings/settings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 46b3e41d7d9..d4e433803ae 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -473,7 +473,7 @@ The maximum number of query processing threads, excluding threads for retrieving This parameter applies to threads that perform the same stages of the query processing pipeline in parallel. For example, when reading from a table, if it is possible to evaluate expressions with functions, filter with WHERE and pre-aggregate for GROUP BY in parallel using at least 'max_threads' number of threads, then 'max_threads' are used. -Default value: half the number of physical CPU cores. +Default value: the number of physical CPU cores. If less than one SELECT query is normally run on a server at a time, set this parameter to a value slightly less than the actual number of processor cores. 
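
The documentation hunk above describes how `max_threads` only caps the parallel stages of a single query's pipeline. A minimal sketch of how the effective value is typically inspected and overridden per query is below; it assumes a locally running ClickHouse server reachable by `clickhouse-client` with default credentials, and the query text is only an illustrative placeholder, not part of the patch.

```bash
# Minimal sketch, assuming a local ClickHouse server and clickhouse-client on PATH.
# Show the effective value of max_threads for the current session.
clickhouse-client --query "SELECT name, value, changed FROM system.settings WHERE name = 'max_threads'"

# Override the setting for a single query, e.g. to keep one heavy SELECT
# from occupying every physical core on a shared server.
clickhouse-client --max_threads=4 \
    --query "SELECT sum(number) FROM (SELECT number FROM system.numbers LIMIT 100000000)"
```

Settings passed on the `clickhouse-client` command line apply only to the queries of that invocation, so this kind of override does not touch the server-wide default discussed in the hunk above.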
From ba492274e2d3556ae565d9d9b87700e236b128a2 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Thu, 19 Sep 2019 02:03:29 +0300 Subject: [PATCH 093/102] Update settings.md --- docs/ru/operations/settings/settings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index c3518eb7f74..20017e88af4 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -479,7 +479,7 @@ ClickHouse использует этот параметр при чтении д Этот параметр относится к потокам, которые выполняют параллельно одни стадии конвейера выполнения запроса. Например, при чтении из таблицы, если есть возможность вычислять выражения с функциями, фильтровать с помощью WHERE и предварительно агрегировать для GROUP BY параллельно, используя хотя бы количество потоков max_threads, то используются max_threads. -Значение по умолчанию: 2. +Значение по умолчанию: количество процессорных ядер без учёта Hyper-Threading. Если на сервере обычно исполняется менее одного запроса SELECT одновременно, то выставите этот параметр в значение чуть меньше количества реальных процессорных ядер. From 4be887a80317b243f6d54c41379bcfbb03818589 Mon Sep 17 00:00:00 2001 From: Sergei Bocharov Date: Thu, 19 Sep 2019 12:16:10 +0300 Subject: [PATCH 094/102] docs(system_tables): Added info about system.contributors --- docs/en/operations/system_tables.md | 40 +++++++++++++++++++++++++++++ docs/ru/operations/system_tables.md | 39 ++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/docs/en/operations/system_tables.md b/docs/en/operations/system_tables.md index 85c6f755f87..d96e3591447 100644 --- a/docs/en/operations/system_tables.md +++ b/docs/en/operations/system_tables.md @@ -91,6 +91,46 @@ The `system.columns` table contains the following columns (the column type is sh - `is_in_primary_key` (UInt8) — Flag that indicates whether the column is in the primary key expression. - `is_in_sampling_key` (UInt8) — Flag that indicates whether the column is in the sampling key expression. +## system.contributors {#system_contributors} + +Contains information about contributors. Arbitrary selection. + +Columns: + +- `name` (String) — Contributor name. + +**Пример** + +```sql +SELECT * FROM system.contributors LIMIT 10 +``` + +```text +┌─name─────────────┐ +│ Olga Khvostikova │ +│ Max Vetrov │ +│ LiuYangkuan │ +│ svladykin │ +│ zamulla │ +│ Šimon Podlipský │ +│ BayoNet │ +│ Ilya Khomutov │ +│ Amy Krishnevsky │ +│ Loud_Scream │ +└──────────────────┘ +``` + +To find out yourself in the table, use a query: + +```sql +SELECT * FROM system.contributors WHERE name='Olga Khvostikova' +``` +```text +┌─name─────────────┐ +│ Olga Khvostikova │ +└──────────────────┘ +``` + ## system.databases This table contains a single String column called 'name' – the name of a database. diff --git a/docs/ru/operations/system_tables.md b/docs/ru/operations/system_tables.md index 95d7846dad7..3b8512c90d4 100644 --- a/docs/ru/operations/system_tables.md +++ b/docs/ru/operations/system_tables.md @@ -81,6 +81,45 @@ user String — имя пользователя, которого использ - `is_in_primary_key` (UInt8) — флаг, показывающий включение столбца в первичный ключ. - `is_in_sampling_key` (UInt8) — флаг, показывающий включение столбца в ключ выборки. +## system.contributors {#system_contributors} + +Содержит информацию о контрибьютерах. Выборка произвольная. + +Столбцы: + +- `name` (String) — Имя контрибьютера. 
+ +**Пример** + +```sql +SELECT * FROM system.contributors LIMIT 10 +``` +```text +┌─name─────────────┐ +│ Olga Khvostikova │ +│ Max Vetrov │ +│ LiuYangkuan │ +│ svladykin │ +│ zamulla │ +│ Šimon Podlipský │ +│ BayoNet │ +│ Ilya Khomutov │ +│ Amy Krishnevsky │ +│ Loud_Scream │ +└──────────────────┘ +``` + +Чтобы найти себя в таблице, выполните запрос: + +```sql +SELECT * FROM system.contributors WHERE name='Olga Khvostikova' +``` +```text +┌─name─────────────┐ +│ Olga Khvostikova │ +└──────────────────┘ +``` + ## system.databases Таблица содержит один столбец name типа String - имя базы данных. From d508d8e8ed3feb57449c0d631d597b833ccfe764 Mon Sep 17 00:00:00 2001 From: Ivan Blinkov Date: Thu, 19 Sep 2019 13:05:25 +0300 Subject: [PATCH 095/102] Add link to SF meetup --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c9a9ed39696..45c626604fd 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ ClickHouse is an open-source column-oriented database management system that all ## Upcoming Events * [ClickHouse Meetup in Paris](https://www.eventbrite.com/e/clickhouse-paris-meetup-2019-registration-68493270215) on October 3. +* [ClickHouse Meetup in San Francisco](https://www.meetup.com/San-Francisco-Bay-Area-ClickHouse-Meetup/events/264242199/) on October 9. * [ClickHouse Meetup in Hong Kong](https://www.meetup.com/Hong-Kong-Machine-Learning-Meetup/events/263580542/) on October 17. * [ClickHouse Meetup in Shenzhen](https://www.huodongxing.com/event/3483759917300) on October 20. * [ClickHouse Meetup in Shanghai](https://www.huodongxing.com/event/4483760336000) on October 27. From c7663d92c6f2b7f6f6c445f555c32eedca3bf7af Mon Sep 17 00:00:00 2001 From: Sergei Bocharov Date: Thu, 19 Sep 2019 13:44:12 +0300 Subject: [PATCH 096/102] docs(system_tables): fixes --- docs/en/operations/system_tables.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/system_tables.md b/docs/en/operations/system_tables.md index d96e3591447..ccad77ebe9f 100644 --- a/docs/en/operations/system_tables.md +++ b/docs/en/operations/system_tables.md @@ -99,7 +99,7 @@ Columns: - `name` (String) — Contributor name. -**Пример** +**Example** ```sql SELECT * FROM system.contributors LIMIT 10 From dd1af8c2ffc0ee7a1f062525b3f889cbbc4aabfc Mon Sep 17 00:00:00 2001 From: Sergei Bocharov Date: Thu, 19 Sep 2019 13:46:14 +0300 Subject: [PATCH 097/102] docs(system_tables): fixes after review --- docs/en/operations/system_tables.md | 2 +- docs/ru/operations/system_tables.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/operations/system_tables.md b/docs/en/operations/system_tables.md index ccad77ebe9f..ca30f54813d 100644 --- a/docs/en/operations/system_tables.md +++ b/docs/en/operations/system_tables.md @@ -93,7 +93,7 @@ The `system.columns` table contains the following columns (the column type is sh ## system.contributors {#system_contributors} -Contains information about contributors. Arbitrary selection. +Contains information about contributors. All constributors in random order. Columns: diff --git a/docs/ru/operations/system_tables.md b/docs/ru/operations/system_tables.md index 3b8512c90d4..f7848085a9e 100644 --- a/docs/ru/operations/system_tables.md +++ b/docs/ru/operations/system_tables.md @@ -83,7 +83,7 @@ user String — имя пользователя, которого использ ## system.contributors {#system_contributors} -Содержит информацию о контрибьютерах. Выборка произвольная. +Содержит информацию о контрибьютерах. 
Контрибьютеры выбираются произвольно. Столбцы: From b1d581e85bc137dc843c553ab41ace754f814e4b Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Thu, 19 Sep 2019 13:58:50 +0300 Subject: [PATCH 098/102] Update system_tables.md --- docs/en/operations/system_tables.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/system_tables.md b/docs/en/operations/system_tables.md index ca30f54813d..a7da5c8970e 100644 --- a/docs/en/operations/system_tables.md +++ b/docs/en/operations/system_tables.md @@ -97,7 +97,7 @@ Contains information about contributors. All constributors in random order. Columns: -- `name` (String) — Contributor name. +- `name` (String) — Contributor (author) name from git log. **Example** From 9edad23576aadf955b499a8788e6a94953b3f6b1 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Thu, 19 Sep 2019 14:00:11 +0300 Subject: [PATCH 099/102] Update system_tables.md --- docs/ru/operations/system_tables.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/ru/operations/system_tables.md b/docs/ru/operations/system_tables.md index f7848085a9e..4c66664b921 100644 --- a/docs/ru/operations/system_tables.md +++ b/docs/ru/operations/system_tables.md @@ -83,11 +83,11 @@ user String — имя пользователя, которого использ ## system.contributors {#system_contributors} -Содержит информацию о контрибьютерах. Контрибьютеры выбираются произвольно. +Содержит информацию о контрибьютерах. Контрибьютеры расположены в таблице в случайном порядке. Порядок определяется заново при каждом запросе. Столбцы: -- `name` (String) — Имя контрибьютера. +- `name` (String) — Имя контрибьютера (автора коммита) из git log. **Пример** From 9df10d8735638f0d05db22e78fa592e2bb26603f Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Thu, 19 Sep 2019 14:00:56 +0300 Subject: [PATCH 100/102] Update system_tables.md --- docs/en/operations/system_tables.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/system_tables.md b/docs/en/operations/system_tables.md index a7da5c8970e..d5f38f51421 100644 --- a/docs/en/operations/system_tables.md +++ b/docs/en/operations/system_tables.md @@ -93,7 +93,7 @@ The `system.columns` table contains the following columns (the column type is sh ## system.contributors {#system_contributors} -Contains information about contributors. All constributors in random order. +Contains information about contributors. All constributors in random order. The order is random at query execution time. Columns: From 73dc11f24adb685be1acc215029701c36e7fb17f Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 19 Sep 2019 14:03:07 +0300 Subject: [PATCH 101/102] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4845a9951a5..5d1d83f4c9c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,7 @@ * Suppport for cascaded materialized views. [#6324](https://github.com/yandex/ClickHouse/pull/6324) ([Amos Bird](https://github.com/amosbird)) * Turn on query profiler by default to sample every query execution thread once a second. [#6283](https://github.com/yandex/ClickHouse/pull/6283) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Input format `ORC`. [#6454](https://github.com/yandex/ClickHouse/pull/6454) [#6703](https://github.com/yandex/ClickHouse/pull/6703) ([akonyaev90](https://github.com/akonyaev90)) -* Added two new functions: `sigmoid` and `tanh` (that are useful for machine learning applications). 
[#6254](https://github.com/yandex/ClickHouse/pull/6254) ([alexey-milovidov] +* Added two new functions: `sigmoid` and `tanh` (that are useful for machine learning applications). [#6254](https://github.com/yandex/ClickHouse/pull/6254) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Function `hasToken(haystack, token)`, `hasTokenCaseInsensitive(haystack, token)` to check if given token is in haystack. Token is a maximal length substring between two non alphanumeric ASCII characters (or boundaries of haystack). Token must be a constant string. Supported by tokenbf_v1 index specialization. [#6596](https://github.com/yandex/ClickHouse/pull/6596), [#6662](https://github.com/yandex/ClickHouse/pull/6662) ([Vasily Nemkov](https://github.com/Enmk)) * New function `neighbor(value, offset[, default_value])`. Allows to reach prev/next value within column in a block of data. [#5925](https://github.com/yandex/ClickHouse/pull/5925) ([Alex Krash](https://github.com/alex-krash)) [6685365ab8c5b74f9650492c88a012596eb1b0c6](https://github.com/yandex/ClickHouse/commit/6685365ab8c5b74f9650492c88a012596eb1b0c6) [341e2e4587a18065c2da1ca888c73389f48ce36c](https://github.com/yandex/ClickHouse/commit/341e2e4587a18065c2da1ca888c73389f48ce36c) [Alexey Milovidov](https://github.com/alexey-milovidov) * Created a function `currentUser()`, returning login of authorized user. Added alias `user()` for compatibility with MySQL. [#6470](https://github.com/yandex/ClickHouse/pull/6470) ([Alex Krash](https://github.com/alex-krash)) From b289d7e6a503f3e159c5f6ec951ec4f2fb643278 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 19 Sep 2019 17:41:49 +0300 Subject: [PATCH 102/102] Fix system contributors generating script --- dbms/src/Storages/System/StorageSystemContributors.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dbms/src/Storages/System/StorageSystemContributors.sh b/dbms/src/Storages/System/StorageSystemContributors.sh index aea122df0dc..c4c4eb5ad30 100755 --- a/dbms/src/Storages/System/StorageSystemContributors.sh +++ b/dbms/src/Storages/System/StorageSystemContributors.sh @@ -2,11 +2,15 @@ set -x +# doesn't actually cd to directory, but return absolute path CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# cd to directory +cd $CUR_DIR CONTRIBUTORS_FILE=${CONTRIBUTORS_FILE=$CUR_DIR/StorageSystemContributors.generated.cpp} -git shortlog --summary | perl -lnE 's/^\s+\d+\s+(.+)/ "$1",/; next unless $1; say $_' > $CONTRIBUTORS_FILE.tmp +# if you don't specify HEAD here, without terminal `git shortlog` would expect input from stdin +git shortlog HEAD --summary | perl -lnE 's/^\s+\d+\s+(.+)/ "$1",/; next unless $1; say $_' > $CONTRIBUTORS_FILE.tmp # If git history not available - dont make target file if [ ! -s $CONTRIBUTORS_FILE.tmp ]; then
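
The comments added in the last patch carry the reasoning behind the fix: `$(cd ... && pwd)` runs in a subshell, so it only computes an absolute path without changing the caller's working directory, and `git shortlog` without an explicit revision reads a log from stdin when stdin is not a terminal (as in CI), so it can block or produce an empty summary. A rough sketch of both pitfalls, assuming it is run inside any git checkout:

```bash
#!/usr/bin/env bash
# Rough sketch of the two pitfalls addressed above (assumes any git checkout).

# 1. Command substitution runs in a subshell: CUR_DIR receives an absolute path,
#    but the working directory is unchanged until the explicit `cd`.
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
pwd            # still the original directory
cd "$CUR_DIR"  # only this line actually changes directory

# 2. Without a revision, `git shortlog` reads a log from stdin when stdin is not a TTY,
#    so in a pipeline or CI job it may hang or summarize nothing.
echo "" | git shortlog --summary          # summarizes the (empty) piped-in log
git shortlog HEAD --summary | head -n 5   # walks the repository history explicitly
```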