From 3711430d9f97401d80dff14eaab68008bb03132c Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Thu, 15 Jun 2023 16:14:56 +0200 Subject: [PATCH 01/24] Rename member fields of CityHash_v1_0_2::uint128: "first" -> "low64", "second" -> "high64". --- base/base/wide_integer_impl.h | 16 ++++++++++++++++ contrib/cityhash102/include/city.h | 19 ++++++++++++++++--- src/Compression/CompressedReadBufferBase.cpp | 16 ++++++++-------- src/Compression/CompressedWriteBuffer.cpp | 8 ++++---- .../DistributedAsyncInsertHeader.cpp | 4 ++-- .../MergeTree/DataPartStorageOnDiskBase.cpp | 2 +- .../MergeTree/MergeTreeDataPartChecksum.cpp | 8 ++++---- .../PartMetadataManagerWithCache.cpp | 8 ++++---- src/Storages/System/StorageSystemParts.cpp | 6 +++--- .../System/StorageSystemProjectionParts.cpp | 6 +++--- 10 files changed, 61 insertions(+), 32 deletions(-) diff --git a/base/base/wide_integer_impl.h b/base/base/wide_integer_impl.h index 4a80c176829..dc6a49694ae 100644 --- a/base/base/wide_integer_impl.h +++ b/base/base/wide_integer_impl.h @@ -15,6 +15,8 @@ #include #include +#include + // NOLINTBEGIN(*) /// Use same extended double for all platforms @@ -281,6 +283,14 @@ struct integer::_impl } } + constexpr static void wide_integer_from_cityhash_uint128(integer & self, const CityHash_v1_0_2::uint128 & value) noexcept + { + if constexpr (std::endian::native == std::endian::little) + wide_integer_from_tuple_like(self, std::make_pair(value.low64, value.high64)); + else + wide_integer_from_tuple_like(self, std::make_pair(value.high64, value.low64)); + } + /** * N.B. t is constructed from double, so max(t) = max(double) ~ 2^310 * the recursive call happens when t / 2^64 > 2^64, so there won't be more than 5 of them. @@ -1036,6 +1046,8 @@ constexpr integer::integer(T rhs) noexcept _impl::wide_integer_from_wide_integer(*this, rhs); else if constexpr (IsTupleLike::value) _impl::wide_integer_from_tuple_like(*this, rhs); + else if constexpr (std::is_same_v, CityHash_v1_0_2::uint128>) + _impl::wide_integer_from_cityhash_uint128(*this, rhs); else _impl::wide_integer_from_builtin(*this, rhs); } @@ -1051,6 +1063,8 @@ constexpr integer::integer(std::initializer_list il) noexcept _impl::wide_integer_from_wide_integer(*this, *il.begin()); else if constexpr (IsTupleLike::value) _impl::wide_integer_from_tuple_like(*this, *il.begin()); + else if constexpr (std::is_same_v, CityHash_v1_0_2::uint128>) + _impl::wide_integer_from_cityhash_uint128(*this, *il.begin()); else _impl::wide_integer_from_builtin(*this, *il.begin()); } @@ -1088,6 +1102,8 @@ constexpr integer & integer::operator=(T rhs) noexce { if constexpr (IsTupleLike::value) _impl::wide_integer_from_tuple_like(*this, rhs); + else if constexpr (std::is_same_v, CityHash_v1_0_2::uint128>) + _impl::wide_integer_from_cityhash_uint128(*this, rhs); else _impl::wide_integer_from_builtin(*this, rhs); return *this; diff --git a/contrib/cityhash102/include/city.h b/contrib/cityhash102/include/city.h index 77d4c988cdd..87363d16444 100644 --- a/contrib/cityhash102/include/city.h +++ b/contrib/cityhash102/include/city.h @@ -61,11 +61,24 @@ namespace CityHash_v1_0_2 typedef uint8_t uint8; typedef uint32_t uint32; typedef uint64_t uint64; -typedef std::pair uint128; +/// Represent an unsigned integer of 128 bits as it's used in CityHash. +/// Originally CityHash used `std::pair` instead of this struct, +/// however the members `first` and `second` could be easily confused so they were renamed to `low64` and `high64`: +/// `first` -> `low64`, `second` -> `high64`. 
+struct uint128 +{ + uint64 low64 = 0; + uint64 high64 = 0; -inline uint64 Uint128Low64(const uint128& x) { return x.first; } -inline uint64 Uint128High64(const uint128& x) { return x.second; } + uint128() = default; + uint128(uint64 low64_, uint64 high64_) : low64(low64_), high64(high64_) {} + friend bool operator ==(const uint128 & x, const uint128 & y) { return (x.low64 == y.low64) && (x.high64 == y.high64); } + friend bool operator !=(const uint128 & x, const uint128 & y) { return !(x == y); } +}; + +inline uint64 Uint128Low64(const uint128 & x) { return x.low64; } +inline uint64 Uint128High64(const uint128 & x) { return x.high64; } // Hash function for a byte array. uint64 CityHash64(const char *buf, size_t len); diff --git a/src/Compression/CompressedReadBufferBase.cpp b/src/Compression/CompressedReadBufferBase.cpp index 278210d770a..662cd6bf337 100644 --- a/src/Compression/CompressedReadBufferBase.cpp +++ b/src/Compression/CompressedReadBufferBase.cpp @@ -49,8 +49,8 @@ static void validateChecksum(char * data, size_t size, const Checksum expected_c /// TODO mess up of endianness in error message. message << "Checksum doesn't match: corrupted data." - " Reference: " + getHexUIntLowercase(expected_checksum.first) + getHexUIntLowercase(expected_checksum.second) - + ". Actual: " + getHexUIntLowercase(calculated_checksum.first) + getHexUIntLowercase(calculated_checksum.second) + " Reference: " + getHexUIntLowercase(expected_checksum.low64) + getHexUIntLowercase(expected_checksum.high64) + + ". Actual: " + getHexUIntLowercase(calculated_checksum.low64) + getHexUIntLowercase(calculated_checksum.high64) + ". Size of compressed block: " + toString(size); const char * message_hardware_failure = "This is most likely due to hardware failure. " @@ -95,8 +95,8 @@ static void validateChecksum(char * data, size_t size, const Checksum expected_c } /// Check if the difference caused by single bit flip in stored checksum. 
- size_t difference = std::popcount(expected_checksum.first ^ calculated_checksum.first) - + std::popcount(expected_checksum.second ^ calculated_checksum.second); + size_t difference = std::popcount(expected_checksum.low64 ^ calculated_checksum.low64) + + std::popcount(expected_checksum.high64 ^ calculated_checksum.high64); if (difference == 1) { @@ -194,8 +194,8 @@ size_t CompressedReadBufferBase::readCompressedData(size_t & size_decompressed, { Checksum checksum; ReadBufferFromMemory checksum_in(own_compressed_buffer.data(), sizeof(checksum)); - readBinaryLittleEndian(checksum.first, checksum_in); - readBinaryLittleEndian(checksum.second, checksum_in); + readBinaryLittleEndian(checksum.low64, checksum_in); + readBinaryLittleEndian(checksum.high64, checksum_in); validateChecksum(compressed_buffer, size_compressed_without_checksum, checksum); } @@ -238,8 +238,8 @@ size_t CompressedReadBufferBase::readCompressedDataBlockForAsynchronous(size_t & { Checksum checksum; ReadBufferFromMemory checksum_in(own_compressed_buffer.data(), sizeof(checksum)); - readBinaryLittleEndian(checksum.first, checksum_in); - readBinaryLittleEndian(checksum.second, checksum_in); + readBinaryLittleEndian(checksum.low64, checksum_in); + readBinaryLittleEndian(checksum.high64, checksum_in); validateChecksum(compressed_buffer, size_compressed_without_checksum, checksum); } diff --git a/src/Compression/CompressedWriteBuffer.cpp b/src/Compression/CompressedWriteBuffer.cpp index cb2ee1140d0..f16330332ab 100644 --- a/src/Compression/CompressedWriteBuffer.cpp +++ b/src/Compression/CompressedWriteBuffer.cpp @@ -38,8 +38,8 @@ void CompressedWriteBuffer::nextImpl() CityHash_v1_0_2::uint128 checksum = CityHash_v1_0_2::CityHash128(out_compressed_ptr, compressed_size); - writeBinaryLittleEndian(checksum.first, out); - writeBinaryLittleEndian(checksum.second, out); + writeBinaryLittleEndian(checksum.low64, out); + writeBinaryLittleEndian(checksum.high64, out); out.position() += compressed_size; } @@ -50,8 +50,8 @@ void CompressedWriteBuffer::nextImpl() CityHash_v1_0_2::uint128 checksum = CityHash_v1_0_2::CityHash128(compressed_buffer.data(), compressed_size); - writeBinaryLittleEndian(checksum.first, out); - writeBinaryLittleEndian(checksum.second, out); + writeBinaryLittleEndian(checksum.low64, out); + writeBinaryLittleEndian(checksum.high64, out); out.write(compressed_buffer.data(), compressed_size); } diff --git a/src/Storages/Distributed/DistributedAsyncInsertHeader.cpp b/src/Storages/Distributed/DistributedAsyncInsertHeader.cpp index 018c1d863bb..d815f671652 100644 --- a/src/Storages/Distributed/DistributedAsyncInsertHeader.cpp +++ b/src/Storages/Distributed/DistributedAsyncInsertHeader.cpp @@ -40,8 +40,8 @@ DistributedAsyncInsertHeader DistributedAsyncInsertHeader::read(ReadBufferFromFi { throw Exception(ErrorCodes::CHECKSUM_DOESNT_MATCH, "Checksum of extra info doesn't match: corrupted data. Reference: {}{}. Actual: {}{}.", - getHexUIntLowercase(expected_checksum.first), getHexUIntLowercase(expected_checksum.second), - getHexUIntLowercase(calculated_checksum.first), getHexUIntLowercase(calculated_checksum.second)); + getHexUIntLowercase(expected_checksum.low64), getHexUIntLowercase(expected_checksum.high64), + getHexUIntLowercase(calculated_checksum.low64), getHexUIntLowercase(calculated_checksum.high64)); } /// Read the parts of the header. 
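(Editorial aside, not part of the patch: the hunks above, and the follow-up patch 02, both hinge on how the two 64-bit halves of a CityHash 128-bit checksum are concatenated into one hex string. A minimal self-contained sketch of that ordering is below; `hexLowercase` is a hypothetical stand-in for ClickHouse's `getHexUIntLowercase`, and the sample values are arbitrary.)

```cpp
#include <cstdint>
#include <cstdio>
#include <string>

// Stand-in for CityHash_v1_0_2::uint128 after the rename in this patch.
struct UInt128Checksum { uint64_t low64 = 0; uint64_t high64 = 0; };

// Hypothetical helper: format one 64-bit half as 16 lowercase hex digits.
static std::string hexLowercase(uint64_t x)
{
    char buf[17];
    std::snprintf(buf, sizeof(buf), "%016llx", static_cast<unsigned long long>(x));
    return buf;
}

int main()
{
    UInt128Checksum checksum{/*low64=*/0x0123456789abcdefULL, /*high64=*/0xfedcba9876543210ULL};

    // Printing high64 before low64 yields the digits of the full 128-bit value
    // in the usual big-endian hex order; patch 02 switches the call sites to this order.
    std::string hex = hexLowercase(checksum.high64) + hexLowercase(checksum.low64);
    std::printf("%s\n", hex.c_str());
    return 0;
}
```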
diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp index 30776a8bc50..92e9005751e 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp @@ -400,7 +400,7 @@ void DataPartStorageOnDiskBase::backup( if (it != checksums.files.end()) { file_size = it->second.file_size; - file_hash = {it->second.file_hash.first, it->second.file_hash.second}; + file_hash = it->second.file_hash; } BackupEntryPtr backup_entry = std::make_unique(disk, filepath_on_disk, copy_encrypted, file_size, file_hash); diff --git a/src/Storages/MergeTree/MergeTreeDataPartChecksum.cpp b/src/Storages/MergeTree/MergeTreeDataPartChecksum.cpp index 78f68ea72fe..58ba7acb9ba 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartChecksum.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartChecksum.cpp @@ -154,9 +154,9 @@ bool MergeTreeDataPartChecksums::readV2(ReadBuffer & in) assertString("\n\tsize: ", in); readText(sum.file_size, in); assertString("\n\thash: ", in); - readText(sum.file_hash.first, in); + readText(sum.file_hash.low64, in); assertString(" ", in); - readText(sum.file_hash.second, in); + readText(sum.file_hash.high64, in); assertString("\n\tcompressed: ", in); readText(sum.is_compressed, in); if (sum.is_compressed) @@ -164,9 +164,9 @@ bool MergeTreeDataPartChecksums::readV2(ReadBuffer & in) assertString("\n\tuncompressed size: ", in); readText(sum.uncompressed_size, in); assertString("\n\tuncompressed hash: ", in); - readText(sum.uncompressed_hash.first, in); + readText(sum.uncompressed_hash.low64, in); assertString(" ", in); - readText(sum.uncompressed_hash.second, in); + readText(sum.uncompressed_hash.high64, in); } assertChar('\n', in); diff --git a/src/Storages/MergeTree/PartMetadataManagerWithCache.cpp b/src/Storages/MergeTree/PartMetadataManagerWithCache.cpp index 7c0aedf699b..7deae69750f 100644 --- a/src/Storages/MergeTree/PartMetadataManagerWithCache.cpp +++ b/src/Storages/MergeTree/PartMetadataManagerWithCache.cpp @@ -250,8 +250,8 @@ std::unordered_map PartMetadataManagerWit ErrorCodes::CORRUPTED_DATA, "Checksums doesn't match in part {} for {}. Expected: {}. Found {}.", part->name, file_path, - getHexUIntUppercase(disk_checksum.first) + getHexUIntUppercase(disk_checksum.second), - getHexUIntUppercase(cache_checksums[i].first) + getHexUIntUppercase(cache_checksums[i].second)); + getHexUIntUppercase(disk_checksum.low64) + getHexUIntUppercase(disk_checksum.high64), + getHexUIntUppercase(cache_checksums[i].low64) + getHexUIntUppercase(cache_checksums[i].high64)); disk_checksums.push_back(disk_checksum); continue; @@ -287,8 +287,8 @@ std::unordered_map PartMetadataManagerWit ErrorCodes::CORRUPTED_DATA, "Checksums doesn't match in projection part {} {}. Expected: {}. 
Found {}.", part->name, proj_name, - getHexUIntUppercase(disk_checksum.first) + getHexUIntUppercase(disk_checksum.second), - getHexUIntUppercase(cache_checksums[i].first) + getHexUIntUppercase(cache_checksums[i].second)); + getHexUIntUppercase(disk_checksum.low64) + getHexUIntUppercase(disk_checksum.high64), + getHexUIntUppercase(cache_checksums[i].low64) + getHexUIntUppercase(cache_checksums[i].high64)); disk_checksums.push_back(disk_checksum); } return results; diff --git a/src/Storages/System/StorageSystemParts.cpp b/src/Storages/System/StorageSystemParts.cpp index 86ecb336b51..95bad0a20fe 100644 --- a/src/Storages/System/StorageSystemParts.cpp +++ b/src/Storages/System/StorageSystemParts.cpp @@ -252,17 +252,17 @@ void StorageSystemParts::processNextStorage( if (columns_mask[src_index++]) { auto checksum = helper.hash_of_all_files; - columns[res_index++]->insert(getHexUIntLowercase(checksum.first) + getHexUIntLowercase(checksum.second)); + columns[res_index++]->insert(getHexUIntLowercase(checksum.low64) + getHexUIntLowercase(checksum.high64)); } if (columns_mask[src_index++]) { auto checksum = helper.hash_of_uncompressed_files; - columns[res_index++]->insert(getHexUIntLowercase(checksum.first) + getHexUIntLowercase(checksum.second)); + columns[res_index++]->insert(getHexUIntLowercase(checksum.low64) + getHexUIntLowercase(checksum.high64)); } if (columns_mask[src_index++]) { auto checksum = helper.uncompressed_hash_of_compressed_files; - columns[res_index++]->insert(getHexUIntLowercase(checksum.first) + getHexUIntLowercase(checksum.second)); + columns[res_index++]->insert(getHexUIntLowercase(checksum.low64) + getHexUIntLowercase(checksum.high64)); } } diff --git a/src/Storages/System/StorageSystemProjectionParts.cpp b/src/Storages/System/StorageSystemProjectionParts.cpp index d2c6c3ef287..6508d062d37 100644 --- a/src/Storages/System/StorageSystemProjectionParts.cpp +++ b/src/Storages/System/StorageSystemProjectionParts.cpp @@ -221,17 +221,17 @@ void StorageSystemProjectionParts::processNextStorage( if (columns_mask[src_index++]) { auto checksum = helper.hash_of_all_files; - columns[res_index++]->insert(getHexUIntLowercase(checksum.first) + getHexUIntLowercase(checksum.second)); + columns[res_index++]->insert(getHexUIntLowercase(checksum.low64) + getHexUIntLowercase(checksum.high64)); } if (columns_mask[src_index++]) { auto checksum = helper.hash_of_uncompressed_files; - columns[res_index++]->insert(getHexUIntLowercase(checksum.first) + getHexUIntLowercase(checksum.second)); + columns[res_index++]->insert(getHexUIntLowercase(checksum.low64) + getHexUIntLowercase(checksum.high64)); } if (columns_mask[src_index++]) { auto checksum = helper.uncompressed_hash_of_compressed_files; - columns[res_index++]->insert(getHexUIntLowercase(checksum.first) + getHexUIntLowercase(checksum.second)); + columns[res_index++]->insert(getHexUIntLowercase(checksum.low64) + getHexUIntLowercase(checksum.high64)); } } From f1f0daa654755b2d12ec9548262adfe4e87fe9b6 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Thu, 15 Jun 2023 17:59:37 +0200 Subject: [PATCH 02/24] Show halves of checksums in "system.parts", "system.projection_parts" and error messages in the correct order. 
--- src/Compression/CompressedReadBufferBase.cpp | 4 ++-- src/Storages/Distributed/DistributedAsyncInsertHeader.cpp | 4 ++-- src/Storages/MergeTree/PartMetadataManagerWithCache.cpp | 8 ++++---- src/Storages/System/StorageSystemParts.cpp | 6 +++--- src/Storages/System/StorageSystemProjectionParts.cpp | 6 +++--- utils/checksum-for-compressed-block/main.cpp | 2 +- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/Compression/CompressedReadBufferBase.cpp b/src/Compression/CompressedReadBufferBase.cpp index 662cd6bf337..bae52c8bece 100644 --- a/src/Compression/CompressedReadBufferBase.cpp +++ b/src/Compression/CompressedReadBufferBase.cpp @@ -49,8 +49,8 @@ static void validateChecksum(char * data, size_t size, const Checksum expected_c /// TODO mess up of endianness in error message. message << "Checksum doesn't match: corrupted data." - " Reference: " + getHexUIntLowercase(expected_checksum.low64) + getHexUIntLowercase(expected_checksum.high64) - + ". Actual: " + getHexUIntLowercase(calculated_checksum.low64) + getHexUIntLowercase(calculated_checksum.high64) + " Reference: " + getHexUIntLowercase(expected_checksum.high64) + getHexUIntLowercase(expected_checksum.low64) + + ". Actual: " + getHexUIntLowercase(calculated_checksum.high64) + getHexUIntLowercase(calculated_checksum.low64) + ". Size of compressed block: " + toString(size); const char * message_hardware_failure = "This is most likely due to hardware failure. " diff --git a/src/Storages/Distributed/DistributedAsyncInsertHeader.cpp b/src/Storages/Distributed/DistributedAsyncInsertHeader.cpp index d815f671652..e1b54304f23 100644 --- a/src/Storages/Distributed/DistributedAsyncInsertHeader.cpp +++ b/src/Storages/Distributed/DistributedAsyncInsertHeader.cpp @@ -40,8 +40,8 @@ DistributedAsyncInsertHeader DistributedAsyncInsertHeader::read(ReadBufferFromFi { throw Exception(ErrorCodes::CHECKSUM_DOESNT_MATCH, "Checksum of extra info doesn't match: corrupted data. Reference: {}{}. Actual: {}{}.", - getHexUIntLowercase(expected_checksum.low64), getHexUIntLowercase(expected_checksum.high64), - getHexUIntLowercase(calculated_checksum.low64), getHexUIntLowercase(calculated_checksum.high64)); + getHexUIntLowercase(expected_checksum.high64), getHexUIntLowercase(expected_checksum.low64), + getHexUIntLowercase(calculated_checksum.high64), getHexUIntLowercase(calculated_checksum.low64)); } /// Read the parts of the header. diff --git a/src/Storages/MergeTree/PartMetadataManagerWithCache.cpp b/src/Storages/MergeTree/PartMetadataManagerWithCache.cpp index 7deae69750f..324bd4bbaee 100644 --- a/src/Storages/MergeTree/PartMetadataManagerWithCache.cpp +++ b/src/Storages/MergeTree/PartMetadataManagerWithCache.cpp @@ -250,8 +250,8 @@ std::unordered_map PartMetadataManagerWit ErrorCodes::CORRUPTED_DATA, "Checksums doesn't match in part {} for {}. Expected: {}. Found {}.", part->name, file_path, - getHexUIntUppercase(disk_checksum.low64) + getHexUIntUppercase(disk_checksum.high64), - getHexUIntUppercase(cache_checksums[i].low64) + getHexUIntUppercase(cache_checksums[i].high64)); + getHexUIntUppercase(disk_checksum.high64) + getHexUIntUppercase(disk_checksum.low64), + getHexUIntUppercase(cache_checksums[i].high64) + getHexUIntUppercase(cache_checksums[i].low64)); disk_checksums.push_back(disk_checksum); continue; @@ -287,8 +287,8 @@ std::unordered_map PartMetadataManagerWit ErrorCodes::CORRUPTED_DATA, "Checksums doesn't match in projection part {} {}. Expected: {}. 
Found {}.", part->name, proj_name, - getHexUIntUppercase(disk_checksum.low64) + getHexUIntUppercase(disk_checksum.high64), - getHexUIntUppercase(cache_checksums[i].low64) + getHexUIntUppercase(cache_checksums[i].high64)); + getHexUIntUppercase(disk_checksum.high64) + getHexUIntUppercase(disk_checksum.low64), + getHexUIntUppercase(cache_checksums[i].high64) + getHexUIntUppercase(cache_checksums[i].low64)); disk_checksums.push_back(disk_checksum); } return results; diff --git a/src/Storages/System/StorageSystemParts.cpp b/src/Storages/System/StorageSystemParts.cpp index 95bad0a20fe..b642f4b5088 100644 --- a/src/Storages/System/StorageSystemParts.cpp +++ b/src/Storages/System/StorageSystemParts.cpp @@ -252,17 +252,17 @@ void StorageSystemParts::processNextStorage( if (columns_mask[src_index++]) { auto checksum = helper.hash_of_all_files; - columns[res_index++]->insert(getHexUIntLowercase(checksum.low64) + getHexUIntLowercase(checksum.high64)); + columns[res_index++]->insert(getHexUIntLowercase(checksum.high64) + getHexUIntLowercase(checksum.low64)); } if (columns_mask[src_index++]) { auto checksum = helper.hash_of_uncompressed_files; - columns[res_index++]->insert(getHexUIntLowercase(checksum.low64) + getHexUIntLowercase(checksum.high64)); + columns[res_index++]->insert(getHexUIntLowercase(checksum.high64) + getHexUIntLowercase(checksum.low64)); } if (columns_mask[src_index++]) { auto checksum = helper.uncompressed_hash_of_compressed_files; - columns[res_index++]->insert(getHexUIntLowercase(checksum.low64) + getHexUIntLowercase(checksum.high64)); + columns[res_index++]->insert(getHexUIntLowercase(checksum.high64) + getHexUIntLowercase(checksum.low64)); } } diff --git a/src/Storages/System/StorageSystemProjectionParts.cpp b/src/Storages/System/StorageSystemProjectionParts.cpp index 6508d062d37..05c83747c4d 100644 --- a/src/Storages/System/StorageSystemProjectionParts.cpp +++ b/src/Storages/System/StorageSystemProjectionParts.cpp @@ -221,17 +221,17 @@ void StorageSystemProjectionParts::processNextStorage( if (columns_mask[src_index++]) { auto checksum = helper.hash_of_all_files; - columns[res_index++]->insert(getHexUIntLowercase(checksum.low64) + getHexUIntLowercase(checksum.high64)); + columns[res_index++]->insert(getHexUIntLowercase(checksum.high64) + getHexUIntLowercase(checksum.low64)); } if (columns_mask[src_index++]) { auto checksum = helper.hash_of_uncompressed_files; - columns[res_index++]->insert(getHexUIntLowercase(checksum.low64) + getHexUIntLowercase(checksum.high64)); + columns[res_index++]->insert(getHexUIntLowercase(checksum.high64) + getHexUIntLowercase(checksum.low64)); } if (columns_mask[src_index++]) { auto checksum = helper.uncompressed_hash_of_compressed_files; - columns[res_index++]->insert(getHexUIntLowercase(checksum.low64) + getHexUIntLowercase(checksum.high64)); + columns[res_index++]->insert(getHexUIntLowercase(checksum.high64) + getHexUIntLowercase(checksum.low64)); } } diff --git a/utils/checksum-for-compressed-block/main.cpp b/utils/checksum-for-compressed-block/main.cpp index 4f9923e7638..d30a3798820 100644 --- a/utils/checksum-for-compressed-block/main.cpp +++ b/utils/checksum-for-compressed-block/main.cpp @@ -45,7 +45,7 @@ int main(int, char **) { auto flipped = flipBit(str, pos); auto checksum = CityHash_v1_0_2::CityHash128(flipped.data(), flipped.size()); - std::cout << getHexUIntLowercase(checksum.first) << getHexUIntLowercase(checksum.second) << "\t" << pos / 8 << ", " << pos % 8 << "\n"; + std::cout << getHexUIntLowercase(checksum.high64) << 
getHexUIntLowercase(checksum.low64) << "\t" << pos / 8 << ", " << pos % 8 << "\n"; } return 0; From 5eeda0a0d24ae14a78da79273870aec9fa6bd8a0 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Sat, 24 Jun 2023 22:17:53 +0200 Subject: [PATCH 03/24] Fix test 00961_checksums_in_system_parts_columns_table --- .../00961_checksums_in_system_parts_columns_table.reference | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/00961_checksums_in_system_parts_columns_table.reference b/tests/queries/0_stateless/00961_checksums_in_system_parts_columns_table.reference index 186f2feab79..4bf3cfe65a2 100644 --- a/tests/queries/0_stateless/00961_checksums_in_system_parts_columns_table.reference +++ b/tests/queries/0_stateless/00961_checksums_in_system_parts_columns_table.reference @@ -1 +1 @@ -20000101_1_1_0 test_00961 b5fce9c4ef1ca42ce4ed027389c208d2 fc3b062b646cd23d4c23d7f5920f89ae da96ff1e527a8a1f908ddf2b1d0af239 +20000101_1_1_0 test_00961 e4ed027389c208d2b5fce9c4ef1ca42c 4c23d7f5920f89aefc3b062b646cd23d 908ddf2b1d0af239da96ff1e527a8a1f From 71cded08ff2813f4c4757e71a773ca8cc0a293bf Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Sun, 25 Jun 2023 14:51:29 +0200 Subject: [PATCH 04/24] Remove unnecessary include from wide_integer_impl.h --- base/base/wide_integer_impl.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/base/base/wide_integer_impl.h b/base/base/wide_integer_impl.h index dc6a49694ae..411841e6d9f 100644 --- a/base/base/wide_integer_impl.h +++ b/base/base/wide_integer_impl.h @@ -15,8 +15,6 @@ #include #include -#include - // NOLINTBEGIN(*) /// Use same extended double for all platforms @@ -29,6 +27,8 @@ using FromDoubleIntermediateType = long double; using FromDoubleIntermediateType = boost::multiprecision::cpp_bin_float_double_extended; #endif +namespace CityHash_v1_0_2 { struct uint128; } + namespace wide { @@ -283,8 +283,11 @@ struct integer::_impl } } - constexpr static void wide_integer_from_cityhash_uint128(integer & self, const CityHash_v1_0_2::uint128 & value) noexcept + template + constexpr static void wide_integer_from_cityhash_uint128(integer & self, const CityHashUInt128 & value) noexcept { + static_assert(sizeof(item_count) >= 2); + if constexpr (std::endian::native == std::endian::little) wide_integer_from_tuple_like(self, std::make_pair(value.low64, value.high64)); else From 1f60a6ed4e1040623f7482a64a2ae493996be3e7 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 26 Jun 2023 12:34:11 +0200 Subject: [PATCH 05/24] Fix --- src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp | 5 +---- src/Interpreters/Cache/FileSegment.cpp | 2 +- src/Interpreters/Cache/FileSegment.h | 2 -- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp index 76d54f9d27c..960d2a72410 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp @@ -510,9 +510,6 @@ bool CachedOnDiskReadBufferFromFile::completeFileSegmentAndGetNext() current_file_segment->use(); implementation_buffer = getImplementationBuffer(*current_file_segment); - if (read_type == ReadType::CACHED) - current_file_segment->incrementHitsCount(); - LOG_TEST( log, "New segment range: {}, old range: {}", current_file_segment->range().toString(), completed_range.toString()); @@ -857,7 +854,7 @@ bool CachedOnDiskReadBufferFromFile::nextImplStep() implementation_buffer = getImplementationBuffer(file_segments->front()); if 
(read_type == ReadType::CACHED) - file_segments->front().incrementHitsCount(); + file_segments->front().use(); } chassert(!internal_buffer.empty()); diff --git a/src/Interpreters/Cache/FileSegment.cpp b/src/Interpreters/Cache/FileSegment.cpp index 95592fc7c12..a77f0726d74 100644 --- a/src/Interpreters/Cache/FileSegment.cpp +++ b/src/Interpreters/Cache/FileSegment.cpp @@ -898,7 +898,7 @@ void FileSegment::use() if (it) { auto cache_lock = cache->lockCache(); - it->use(cache_lock); + hits_count = it->use(cache_lock); } } diff --git a/src/Interpreters/Cache/FileSegment.h b/src/Interpreters/Cache/FileSegment.h index 681c0d719e4..2e6bbe5657e 100644 --- a/src/Interpreters/Cache/FileSegment.h +++ b/src/Interpreters/Cache/FileSegment.h @@ -180,8 +180,6 @@ public: size_t getRefCount() const { return ref_count; } - void incrementHitsCount() { ++hits_count; } - size_t getCurrentWriteOffset(bool sync) const; size_t getFirstNonDownloadedOffset(bool sync) const; From 7d8d19d8003ebaaac910a6af802ee4874e1821f8 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 26 Jun 2023 14:27:13 +0200 Subject: [PATCH 06/24] Add test --- .../tests/gtest_lru_file_cache.cpp | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/src/Interpreters/tests/gtest_lru_file_cache.cpp b/src/Interpreters/tests/gtest_lru_file_cache.cpp index 3cba1e48e1e..9ff9f92afe4 100644 --- a/src/Interpreters/tests/gtest_lru_file_cache.cpp +++ b/src/Interpreters/tests/gtest_lru_file_cache.cpp @@ -22,6 +22,8 @@ #include #include +#include +#include #include namespace fs = std::filesystem; @@ -862,3 +864,78 @@ TEST_F(FileCacheTest, temporaryData) ASSERT_LE(file_cache.getUsedCacheSize(), size_used_before_temporary_data); ASSERT_LE(file_cache.getFileSegmentsNum(), segments_used_before_temporary_data); } + +TEST_F(FileCacheTest, CachedReadBuffer) +{ + DB::ThreadStatus thread_status; + + /// To work with cache need query_id and query context. 
+ std::string query_id = "query_id"; + + Poco::XML::DOMParser dom_parser; + std::string xml(R"CONFIG( +)CONFIG"); + Poco::AutoPtr document = dom_parser.parseString(xml); + Poco::AutoPtr config = new Poco::Util::XMLConfiguration(document); + getMutableContext().context->setConfig(config); + + auto query_context = DB::Context::createCopy(getContext().context); + query_context->makeQueryContext(); + query_context->setCurrentQueryId(query_id); + chassert(&DB::CurrentThread::get() == &thread_status); + DB::CurrentThread::QueryScope query_scope_holder(query_context); + + DB::FileCacheSettings settings; + settings.base_path = cache_base_path; + settings.max_file_segment_size = 5; + settings.max_size = 30; + settings.max_elements = 10; + settings.boundary_alignment = 1; + + ReadSettings read_settings; + read_settings.enable_filesystem_cache = 1; + read_settings.local_fs_method = LocalFSReadMethod::pread; + + std::string file_path = fs::current_path() / "test"; + auto read_buffer_creator = [&]() + { + return createReadBufferFromFileBase(file_path, read_settings, std::nullopt, std::nullopt); + }; + + auto wb = std::make_unique(file_path, DBMS_DEFAULT_BUFFER_SIZE); + std::string s(30, '*'); + wb->write(s.data(), s.size()); + wb->next(); + wb->finalize(); + + auto cache = std::make_shared(settings); + cache->initialize(); + auto key = cache->createKeyForPath(file_path); + + { + auto cached_buffer = std::make_shared( + file_path, key, cache, read_buffer_creator, read_settings, "test", s.size(), false, false, std::nullopt, nullptr); + + WriteBufferFromOwnString result; + copyData(*cached_buffer, result); + ASSERT_EQ(result.str(), s); + + assertEqual(cache->dumpQueue(), { Range(0, 4), Range(5, 9), Range(10, 14), Range(15, 19), Range(20, 24), Range(25, 29) }); + } + + { + ReadSettings modified_settings{read_settings}; + modified_settings.local_fs_buffer_size = 10; + modified_settings.remote_fs_buffer_size = 10; + + auto cached_buffer = std::make_shared( + file_path, key, cache, read_buffer_creator, modified_settings, "test", s.size(), false, false, std::nullopt, nullptr); + + cached_buffer->next(); + assertEqual(cache->dumpQueue(), { Range(5, 9), Range(10, 14), Range(15, 19), Range(20, 24), Range(25, 29), Range(0, 4) }); + + cached_buffer->position() = cached_buffer->buffer().end(); + cached_buffer->next(); + assertEqual(cache->dumpQueue(), {Range(10, 14), Range(15, 19), Range(20, 24), Range(25, 29), Range(0, 4), Range(5, 9) }); + } +} From 4da82d10d0a7eba0a10fc8f02889d1bd533f7b82 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Mon, 26 Jun 2023 17:57:46 +0200 Subject: [PATCH 07/24] Update gtest_lru_file_cache.cpp --- src/Interpreters/tests/gtest_lru_file_cache.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/tests/gtest_lru_file_cache.cpp b/src/Interpreters/tests/gtest_lru_file_cache.cpp index 9ff9f92afe4..58b1302a72c 100644 --- a/src/Interpreters/tests/gtest_lru_file_cache.cpp +++ b/src/Interpreters/tests/gtest_lru_file_cache.cpp @@ -893,7 +893,7 @@ TEST_F(FileCacheTest, CachedReadBuffer) settings.boundary_alignment = 1; ReadSettings read_settings; - read_settings.enable_filesystem_cache = 1; + read_settings.enable_filesystem_cache = true; read_settings.local_fs_method = LocalFSReadMethod::pread; std::string file_path = fs::current_path() / "test"; From d3b8b454f853c63da4b94ec97afdcb1528ffdc22 Mon Sep 17 00:00:00 2001 From: Ilya Yatsishin <2159081+qoega@users.noreply.github.com> Date: Tue, 27 Jun 2023 19:19:58 +0200 
Subject: [PATCH 08/24] Fix segfault in MathUnary --- src/Functions/FunctionMathUnary.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Functions/FunctionMathUnary.h b/src/Functions/FunctionMathUnary.h index 6e4bff7122d..9f400932356 100644 --- a/src/Functions/FunctionMathUnary.h +++ b/src/Functions/FunctionMathUnary.h @@ -154,6 +154,8 @@ private: using ColVecType = ColumnVectorOrDecimal; const auto col_vec = checkAndGetColumn(col.column.get()); + if (col_vec == nullptr) + return false; return (res = execute(col_vec)) != nullptr; }; From 575f3513977a21a8fea5ff30116636f2fc9ac2f1 Mon Sep 17 00:00:00 2001 From: Yatsishin Ilya <2159081+qoega@users.noreply.github.com> Date: Tue, 27 Jun 2023 18:34:12 +0000 Subject: [PATCH 09/24] add test --- tests/queries/0_stateless/02807_math_unary_crash.reference | 2 ++ tests/queries/0_stateless/02807_math_unary_crash.sql | 4 ++++ 2 files changed, 6 insertions(+) create mode 100644 tests/queries/0_stateless/02807_math_unary_crash.reference create mode 100644 tests/queries/0_stateless/02807_math_unary_crash.sql diff --git a/tests/queries/0_stateless/02807_math_unary_crash.reference b/tests/queries/0_stateless/02807_math_unary_crash.reference new file mode 100644 index 00000000000..6ed281c757a --- /dev/null +++ b/tests/queries/0_stateless/02807_math_unary_crash.reference @@ -0,0 +1,2 @@ +1 +1 diff --git a/tests/queries/0_stateless/02807_math_unary_crash.sql b/tests/queries/0_stateless/02807_math_unary_crash.sql new file mode 100644 index 00000000000..16c3ba1e0ae --- /dev/null +++ b/tests/queries/0_stateless/02807_math_unary_crash.sql @@ -0,0 +1,4 @@ +CREATE TABLE t10 (`c0` Int32) ENGINE = MergeTree ORDER BY tuple(); +INSERT INTO t10 (c0) FORMAT Values (-1); +SELECT 1 FROM t10 GROUP BY erf(-sign(t10.c0)); +SELECT 1 FROM t10 GROUP BY -sign(t10.c0); From 3e5abbbf48953288d5bcea4fab9f2431bd05873d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 27 Jun 2023 22:47:19 +0300 Subject: [PATCH 10/24] Update 02807_math_unary_crash.sql --- tests/queries/0_stateless/02807_math_unary_crash.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/02807_math_unary_crash.sql b/tests/queries/0_stateless/02807_math_unary_crash.sql index 16c3ba1e0ae..fb693ac70f7 100644 --- a/tests/queries/0_stateless/02807_math_unary_crash.sql +++ b/tests/queries/0_stateless/02807_math_unary_crash.sql @@ -1,4 +1,6 @@ +DROP TABLE IF EXISTS t10; CREATE TABLE t10 (`c0` Int32) ENGINE = MergeTree ORDER BY tuple(); INSERT INTO t10 (c0) FORMAT Values (-1); SELECT 1 FROM t10 GROUP BY erf(-sign(t10.c0)); SELECT 1 FROM t10 GROUP BY -sign(t10.c0); +DROP TABLE t10; From 5a1bbe5a8d2f07960161100e0a17527e4bda6de2 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Wed, 28 Jun 2023 14:05:14 +0200 Subject: [PATCH 11/24] Update CachedOnDiskReadBufferFromFile.cpp --- src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp index 960d2a72410..5b42f41fbf3 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp @@ -852,9 +852,7 @@ bool CachedOnDiskReadBufferFromFile::nextImplStep() else { implementation_buffer = getImplementationBuffer(file_segments->front()); - - if (read_type == ReadType::CACHED) - file_segments->front().use(); + file_segments->front().use(); } chassert(!internal_buffer.empty()); From 
c42bf37a141decb206e405470c2af6d85145bf3f Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Wed, 28 Jun 2023 09:59:32 -0400 Subject: [PATCH 12/24] list the disk types --- .../engines/table-engines/mergetree-family/mergetree.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 42454af6feb..c67ac8fa4ef 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -756,6 +756,14 @@ If you perform the `SELECT` query between merges, you may get expired data. To a - [ttl_only_drop_parts](/docs/en/operations/settings/settings.md/#ttl_only_drop_parts) setting +## Disk types + +In addition to local block devices, ClickHouse supports other device types through table engines. These are the types: +- [S3](#table_engine-mergetree-s3) +- GCS (also supported using the [S3 table engine](#table_engine-mergetree-s3)) +- [Azure Blob Storage](#table_engine-mergetree-azure-blob-storage) +- [HDFS](/docs/en/sql-reference/table-functions/hdfs.md) + ## Using Multiple Block Devices for Data Storage {#table_engine-mergetree-multiple-volumes} ### Introduction {#introduction} From bfcadabb927e5ea547c29df488d3fe6ea396a178 Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Wed, 28 Jun 2023 10:30:49 -0400 Subject: [PATCH 13/24] add web disk type --- docs/en/engines/table-engines/mergetree-family/mergetree.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index c67ac8fa4ef..1f084fe075b 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -763,6 +763,7 @@ In addition to local block devices, ClickHouse supports other device types throu - GCS (also supported using the [S3 table engine](#table_engine-mergetree-s3)) - [Azure Blob Storage](#table_engine-mergetree-azure-blob-storage) - [HDFS](/docs/en/sql-reference/table-functions/hdfs.md) +- [Web (read-only)](#web-storage) ## Using Multiple Block Devices for Data Storage {#table_engine-mergetree-multiple-volumes} @@ -944,6 +945,8 @@ configuration files; all the settings are in the CREATE/ATTACH query. The example uses `type=web`, but any disk type can be configured as dynamic, even Local disk. Local disks require a path argument to be inside the server config parameter `custom_local_disks_base_directory`, which has no default, so set that also when using local disk. ::: +#### Example dynamic web storage {#web-storage} + ```sql ATTACH TABLE uk_price_paid UUID 'cf712b4f-2ca8-435c-ac23-c4393efe52f7' ( From b392127304d8c14ce34bd86d0b8ca561e1559919 Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Wed, 28 Jun 2023 11:00:07 -0400 Subject: [PATCH 14/24] add example web config --- .../mergetree-family/mergetree.md | 53 ++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 1f084fe075b..b87c4d216cf 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -945,7 +945,7 @@ configuration files; all the settings are in the CREATE/ATTACH query. The example uses `type=web`, but any disk type can be configured as dynamic, even Local disk. 
Local disks require a path argument to be inside the server config parameter `custom_local_disks_base_directory`, which has no default, so set that also when using local disk.
 :::
 
-#### Example dynamic web storage {#web-storage}
+#### Example dynamic web storage
 
 ```sql
 ATTACH TABLE uk_price_paid UUID 'cf712b4f-2ca8-435c-ac23-c4393efe52f7'
 (
@@ -1249,6 +1249,57 @@ Examples of working configurations can be found in integration tests directory (
 Zero-copy replication is disabled by default in ClickHouse version 22.8 and higher. This feature is not recommended for production use.
 :::
 
+## Web storage (read-only) {#web-storage}
+
+Web storage can be used for read-only purposes. An example use is for hosting sample
+data, or for migrating data.
+
+:::tip
+Storage can also be configured temporarily within a query, if a web dataset is not expected
+to be used routinely, see [dynamic storage](#dynamic-storage) and skip editing the
+configuration file.
+:::
+
+In this sample configuration:
+- the disk is of type `web`
+- the data is hosted at `http://nginx:80/test1/`
+- a cache on local storage is used
+
+```xml
+<clickhouse>
+    <storage_configuration>
+        <disks>
+            <web>
+                <type>web</type>
+                <endpoint>http://nginx:80/test1/</endpoint>
+            </web>
+            <cached_web>
+                <type>cache</type>
+                <disk>web</disk>
+                <path>cached_web_cache/</path>
+                <max_size>100000000</max_size>
+            </cached_web>
+        </disks>
+        <policies>
+            <web>
+                <volumes>
+                    <main>
+                        <disk>web</disk>
+                    </main>
+                </volumes>
+            </web>
+            <cached_web>
+                <volumes>
+                    <main>
+                        <disk>cached_web</disk>
+                    </main>
+                </volumes>
+            </cached_web>
+        </policies>
+    </storage_configuration>
+</clickhouse>
+```
+
 ## Virtual Columns {#virtual-columns}
 
 - `_part` — Name of a part.

From dd3a744cef6e736bc68782fd79853a1535bdebb8 Mon Sep 17 00:00:00 2001
From: DanRoscigno
Date: Wed, 28 Jun 2023 11:17:16 -0400
Subject: [PATCH 15/24] add HDFS example

---
 .../mergetree-family/mergetree.md | 38 ++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md
index b87c4d216cf..1b7f3263ab9 100644
--- a/docs/en/engines/table-engines/mergetree-family/mergetree.md
+++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md
@@ -762,7 +762,7 @@ In addition to local block devices, ClickHouse supports other device types throu
 - [S3](#table_engine-mergetree-s3)
 - GCS (also supported using the [S3 table engine](#table_engine-mergetree-s3))
 - [Azure Blob Storage](#table_engine-mergetree-azure-blob-storage)
-- [HDFS](/docs/en/sql-reference/table-functions/hdfs.md)
+- [HDFS](#hdfs-storage)
 - [Web (read-only)](#web-storage)
 
 ## Using Multiple Block Devices for Data Storage {#table_engine-mergetree-multiple-volumes}
 
@@ -1249,6 +1249,42 @@ Examples of working configurations can be found in integration tests directory (
 Zero-copy replication is disabled by default in ClickHouse version 22.8 and higher. This feature is not recommended for production use.
 :::
 
+## HDFS storage {#hdfs-storage}
+
+In this sample configuration:
+- the disk is of type `hdfs`
+- the data is hosted at `hdfs://hdfs1:9000/clickhouse/`
+
+```xml
+<clickhouse>
+    <storage_configuration>
+        <disks>
+            <hdfs>
+                <type>hdfs</type>
+                <endpoint>hdfs://hdfs1:9000/clickhouse/</endpoint>
+                <skip_access_check>true</skip_access_check>
+            </hdfs>
+            <hdd>
+                <type>local</type>
+                <path>/</path>
+            </hdd>
+        </disks>
+        <policies>
+            <hdfs>
+                <volumes>
+                    <main>
+                        <disk>hdfs</disk>
+                    </main>
+                    <external>
+                        <disk>hdd</disk>
+                    </external>
+                </volumes>
+            </hdfs>
+        </policies>
+    </storage_configuration>
+</clickhouse>
+``` + ## Web storage (read-only) {#web-storage} Web storage can be used for read-only purposes. An example use is for hosting sample From 45cd7f35cdfb3c1ed17cd2451468761aa25a6bfb Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 22 Jun 2023 02:16:56 +0200 Subject: [PATCH 16/24] Use clickhouse/integration-helper from changed images --- docker/test/integration/runner/dockerd-entrypoint.sh | 2 ++ tests/integration/helpers/network.py | 11 +++++++---- tests/integration/runner | 2 ++ 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/docker/test/integration/runner/dockerd-entrypoint.sh b/docker/test/integration/runner/dockerd-entrypoint.sh index fe47fc90951..347d904d5c0 100755 --- a/docker/test/integration/runner/dockerd-entrypoint.sh +++ b/docker/test/integration/runner/dockerd-entrypoint.sh @@ -52,6 +52,8 @@ export CLICKHOUSE_TESTS_BASE_CONFIG_DIR=/clickhouse-config export CLICKHOUSE_ODBC_BRIDGE_BINARY_PATH=/clickhouse-odbc-bridge export CLICKHOUSE_LIBRARY_BRIDGE_BINARY_PATH=/clickhouse-library-bridge +export DOCKER_BASE_TAG=${DOCKER_BASE_TAG:=latest} +export DOCKER_HELPER_TAG=${DOCKER_HELPER_TAG:=latest} export DOCKER_MYSQL_GOLANG_CLIENT_TAG=${DOCKER_MYSQL_GOLANG_CLIENT_TAG:=latest} export DOCKER_DOTNET_CLIENT_TAG=${DOCKER_DOTNET_CLIENT_TAG:=latest} export DOCKER_MYSQL_JAVA_CLIENT_TAG=${DOCKER_MYSQL_JAVA_CLIENT_TAG:=latest} diff --git a/tests/integration/helpers/network.py b/tests/integration/helpers/network.py index 471aa2bdc2e..4859a8c5946 100644 --- a/tests/integration/helpers/network.py +++ b/tests/integration/helpers/network.py @@ -231,6 +231,9 @@ class _NetworkManager: def _ensure_container(self): if self._container is None or self._container_expire_time <= time.time(): + image_name = "clickhouse/integration-helper:" + os.getenv( + "DOCKER_HELPER_TAG", "latest" + ) for i in range(5): if self._container is not None: try: @@ -247,7 +250,7 @@ class _NetworkManager: time.sleep(i) image = subprocess.check_output( - "docker images -q clickhouse/integration-helper 2>/dev/null", shell=True + f"docker images -q {image_name} 2>/dev/null", shell=True ) if not image.strip(): print("No network image helper, will try download") @@ -256,16 +259,16 @@ class _NetworkManager: for i in range(5): try: subprocess.check_call( # STYLE_CHECK_ALLOW_SUBPROCESS_CHECK_CALL - "docker pull clickhouse/integration-helper", shell=True + f"docker pull {image_name}", shell=True ) break except: time.sleep(i) else: - raise Exception("Cannot pull clickhouse/integration-helper image") + raise Exception(f"Cannot pull {image_name} image") self._container = self._docker_client.containers.run( - "clickhouse/integration-helper", + image_name, auto_remove=True, command=("sleep %s" % self.container_exit_timeout), # /run/xtables.lock passed inside for correct iptables --wait diff --git a/tests/integration/runner b/tests/integration/runner index f658bac412b..301a707a78d 100755 --- a/tests/integration/runner +++ b/tests/integration/runner @@ -336,6 +336,8 @@ if __name__ == "__main__": env_tags += "-e {}={} ".format("DOCKER_MYSQL_PHP_CLIENT_TAG", tag) elif image == "clickhouse/postgresql-java-client": env_tags += "-e {}={} ".format("DOCKER_POSTGRESQL_JAVA_CLIENT_TAG", tag) + elif image == "clickhouse/integration-helper": + env_tags += "-e {}={} ".format("DOCKER_HELPER_TAG", tag) elif image == "clickhouse/integration-test": env_tags += "-e {}={} ".format("DOCKER_BASE_TAG", tag) elif image == "clickhouse/kerberized-hadoop": From eeb8cdbc19e69aab64b0da2de0b569d6e31f438f Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Thu, 22 Jun 2023 02:36:30 +0200 Subject: [PATCH 17/24] Add way to define additional urls in test reports --- tests/ci/upload_result_helper.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/ci/upload_result_helper.py b/tests/ci/upload_result_helper.py index 150af7aff4a..fbb89ef8078 100644 --- a/tests/ci/upload_result_helper.py +++ b/tests/ci/upload_result_helper.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Dict, List +from typing import Dict, List, Optional import os import logging @@ -58,14 +58,19 @@ def upload_results( test_results: TestResults, additional_files: List[str], check_name: str, + additional_urls: Optional[List[str]] = None, ) -> str: normalized_check_name = check_name.lower() for r in ((" ", "_"), ("(", "_"), (")", "_"), (",", "_"), ("/", "_")): normalized_check_name = normalized_check_name.replace(*r) + + # Preserve additional_urls to not modify the original one + original_additional_urls = additional_urls or [] s3_path_prefix = f"{pr_number}/{commit_sha}/{normalized_check_name}" additional_urls = process_logs( s3_client, additional_files, s3_path_prefix, test_results ) + additional_urls.extend(original_additional_urls) branch_url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/commits/master" branch_name = "master" From f23bf9c5acdd885d61ec7f68bdf8be1b1fa79dee Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Thu, 29 Jun 2023 10:04:36 +0800 Subject: [PATCH 18/24] Update redis.md --- .../engines/table-engines/integrations/redis.md | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/docs/en/engines/table-engines/integrations/redis.md b/docs/en/engines/table-engines/integrations/redis.md index 568179eb690..7ef87927bfd 100644 --- a/docs/en/engines/table-engines/integrations/redis.md +++ b/docs/en/engines/table-engines/integrations/redis.md @@ -44,9 +44,10 @@ Create a table in ClickHouse which allows to read data from Redis: ``` sql CREATE TABLE redis_table ( - `k` String, - `m` String, - `n` UInt32 + `key` String, + `v1` UInt32, + `v2` String, + `v3` Float32 ) ENGINE = Redis('redis1:6379') PRIMARY KEY(k); ``` @@ -111,9 +112,16 @@ Flush Redis db asynchronously. Also `Truncate` support SYNC mode. TRUNCATE TABLE redis_table SYNC; ``` +Join: + +Join with other tables. + +``` +SELECT * FROM redis_table JOIN merge_tree_table ON redis_table.key=merge_tree_table.key; +``` ## Limitations {#limitations} Redis engine also supports scanning queries, such as `where k > xx`, but it has some limitations: -1. Scanning query may produce some duplicated keys in a very rare case when it is rehashing. See details in [Redis Scan](https://github.com/redis/redis/blob/e4d183afd33e0b2e6e8d1c79a832f678a04a7886/src/dict.c#L1186-L1269) +1. Scanning query may produce some duplicated keys in a very rare case when it is rehashing. See details in [Redis Scan](https://github.com/redis/redis/blob/e4d183afd33e0b2e6e8d1c79a832f678a04a7886/src/dict.c#L1186-L1269). 2. During the scanning, keys could be created and deleted, so the resulting dataset can not represent a valid point in time. 
From f8f0b7d086d23f70a49be88233b19d152e99e3f4 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Thu, 29 Jun 2023 10:09:27 +0800 Subject: [PATCH 19/24] fix typo --- docs/en/engines/table-engines/integrations/redis.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/engines/table-engines/integrations/redis.md b/docs/en/engines/table-engines/integrations/redis.md index 7ef87927bfd..2697abcf30e 100644 --- a/docs/en/engines/table-engines/integrations/redis.md +++ b/docs/en/engines/table-engines/integrations/redis.md @@ -49,7 +49,7 @@ CREATE TABLE redis_table `v2` String, `v3` Float32 ) -ENGINE = Redis('redis1:6379') PRIMARY KEY(k); +ENGINE = Redis('redis1:6379') PRIMARY KEY(key); ``` Insert: From 635ab9f9af3894c42b69ef093cc34e64cefce219 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Thu, 29 Jun 2023 12:53:23 +0800 Subject: [PATCH 20/24] move redis_table to right --- docs/en/engines/table-engines/integrations/redis.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/engines/table-engines/integrations/redis.md b/docs/en/engines/table-engines/integrations/redis.md index 2697abcf30e..8086a6503b8 100644 --- a/docs/en/engines/table-engines/integrations/redis.md +++ b/docs/en/engines/table-engines/integrations/redis.md @@ -117,7 +117,7 @@ Join: Join with other tables. ``` -SELECT * FROM redis_table JOIN merge_tree_table ON redis_table.key=merge_tree_table.key; +SELECT * FROM redis_table JOIN merge_tree_table ON merge_tree_table.key=redis_table.key; ``` ## Limitations {#limitations} From 5656d18690bd00b0db48c1e12e48e0446e263465 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Thu, 29 Jun 2023 13:36:55 +0000 Subject: [PATCH 21/24] Update version_date.tsv and changelogs after v23.5.4.25-stable --- docker/keeper/Dockerfile | 2 +- docker/server/Dockerfile.alpine | 2 +- docker/server/Dockerfile.ubuntu | 2 +- docs/changelogs/v23.5.4.25-stable.md | 31 ++++++++++++++++++++++++++++ utils/list-versions/version_date.tsv | 1 + 5 files changed, 35 insertions(+), 3 deletions(-) create mode 100644 docs/changelogs/v23.5.4.25-stable.md diff --git a/docker/keeper/Dockerfile b/docker/keeper/Dockerfile index 367f6043b90..6d53a6f4c51 100644 --- a/docker/keeper/Dockerfile +++ b/docker/keeper/Dockerfile @@ -32,7 +32,7 @@ RUN arch=${TARGETARCH:-amd64} \ esac ARG REPOSITORY="https://s3.amazonaws.com/clickhouse-builds/22.4/31c367d3cd3aefd316778601ff6565119fe36682/package_release" -ARG VERSION="23.5.3.24" +ARG VERSION="23.5.4.25" ARG PACKAGES="clickhouse-keeper" # user/group precreated explicitly with fixed uid/gid on purpose. diff --git a/docker/server/Dockerfile.alpine b/docker/server/Dockerfile.alpine index e7e879fa95f..91b22346f13 100644 --- a/docker/server/Dockerfile.alpine +++ b/docker/server/Dockerfile.alpine @@ -33,7 +33,7 @@ RUN arch=${TARGETARCH:-amd64} \ # lts / testing / prestable / etc ARG REPO_CHANNEL="stable" ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}" -ARG VERSION="23.5.3.24" +ARG VERSION="23.5.4.25" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" # user/group precreated explicitly with fixed uid/gid on purpose. 
diff --git a/docker/server/Dockerfile.ubuntu b/docker/server/Dockerfile.ubuntu index 42ae81655d2..0ed0e4e1168 100644 --- a/docker/server/Dockerfile.ubuntu +++ b/docker/server/Dockerfile.ubuntu @@ -23,7 +23,7 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list ARG REPO_CHANNEL="stable" ARG REPOSITORY="deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb ${REPO_CHANNEL} main" -ARG VERSION="23.5.3.24" +ARG VERSION="23.5.4.25" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" # set non-empty deb_location_url url to create a docker image diff --git a/docs/changelogs/v23.5.4.25-stable.md b/docs/changelogs/v23.5.4.25-stable.md new file mode 100644 index 00000000000..53d3a7c9c0a --- /dev/null +++ b/docs/changelogs/v23.5.4.25-stable.md @@ -0,0 +1,31 @@ +--- +sidebar_position: 1 +sidebar_label: 2023 +--- + +# 2023 Changelog + +### ClickHouse release v23.5.4.25-stable (190f962abcf) FIXME as compared to v23.5.3.24-stable (76f54616d3b) + +#### Improvement +* Backported in [#51235](https://github.com/ClickHouse/ClickHouse/issues/51235): Improve the progress bar for file/s3/hdfs/url table functions by using chunk size from source data and using incremental total size counting in each thread. Fix the progress bar for *Cluster functions. This closes [#47250](https://github.com/ClickHouse/ClickHouse/issues/47250). [#51088](https://github.com/ClickHouse/ClickHouse/pull/51088) ([Kruglov Pavel](https://github.com/Avogar)). +* Backported in [#51255](https://github.com/ClickHouse/ClickHouse/issues/51255): Disable cache setting `do_not_evict_index_and_mark_files` (Was enabled in `23.5`). [#51222](https://github.com/ClickHouse/ClickHouse/pull/51222) ([Kseniia Sumarokova](https://github.com/kssenii)). + +#### Build/Testing/Packaging Improvement +* Backported in [#51531](https://github.com/ClickHouse/ClickHouse/issues/51531): Split huge `RUN` in Dockerfile into smaller conditional. Install the necessary tools on demand in the same `RUN` layer, and remove them after that. Upgrade the OS only once at the beginning. Use a modern way to check the signed repository. Downgrade the base repo to ubuntu:20.04 to address the issues on older docker versions. Upgrade golang version to address golang vulnerabilities. [#51504](https://github.com/ClickHouse/ClickHouse/pull/51504) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#51572](https://github.com/ClickHouse/ClickHouse/issues/51572): This a follow-up for [#51504](https://github.com/ClickHouse/ClickHouse/issues/51504), the cleanup was lost during refactoring. [#51564](https://github.com/ClickHouse/ClickHouse/pull/51564) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Query Cache: Try to fix bad cast from ColumnConst to ColumnVector [#50704](https://github.com/ClickHouse/ClickHouse/pull/50704) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix type of LDAP server params hash in cache entry [#50865](https://github.com/ClickHouse/ClickHouse/pull/50865) ([Julian Maicher](https://github.com/jmaicher)). +* Fallback to parsing big integer from String instead of exception in Parquet format [#50873](https://github.com/ClickHouse/ClickHouse/pull/50873) ([Kruglov Pavel](https://github.com/Avogar)). +* Do not apply projection if read-in-order was enabled. [#50923](https://github.com/ClickHouse/ClickHouse/pull/50923) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). 
+* Fix race azure blob storage iterator [#50936](https://github.com/ClickHouse/ClickHouse/pull/50936) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Fix ineffective query cache for SELECTs with subqueries [#51132](https://github.com/ClickHouse/ClickHouse/pull/51132) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix fuzzer failure in ActionsDAG [#51301](https://github.com/ClickHouse/ClickHouse/pull/51301) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Fix ParallelReadBuffer seek [#50820](https://github.com/ClickHouse/ClickHouse/pull/50820) ([Michael Kolupaev](https://github.com/al13n321)). + diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index 307ed97068f..5c8dd0d2481 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,3 +1,4 @@ +v23.5.4.25-stable 2023-06-29 v23.5.3.24-stable 2023-06-17 v23.5.2.7-stable 2023-06-10 v23.5.1.3174-stable 2023-06-09 From 6ec85f9faa548e50ec1a6b4a2d7868c9f7e0079a Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Thu, 29 Jun 2023 11:06:28 -0300 Subject: [PATCH 22/24] Update settings.md --- docs/en/operations/settings/settings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index cff65e049f3..cff13302cdc 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -1322,7 +1322,7 @@ Connection pool size for PostgreSQL table engine and database engine. Default value: 16 -## postgresql_connection_pool_size {#postgresql-connection-pool-size} +## postgresql_connection_pool_wait_timeout {#postgresql-connection-pool-wait-timeout} Connection pool push/pop timeout on empty pool for PostgreSQL table engine and database engine. By default it will block on empty pool. From 4ee094cab1a4aed4091679e54e6ac146b2423693 Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Thu, 29 Jun 2023 12:13:26 -0400 Subject: [PATCH 23/24] review comments --- .../table-engines/mergetree-family/mergetree.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 1b7f3263ab9..a1c2fbdbe50 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -758,12 +758,14 @@ If you perform the `SELECT` query between merges, you may get expired data. To a ## Disk types -In addition to local block devices, ClickHouse supports other device types through table engines. 
These are the types: -- [S3](#table_engine-mergetree-s3) -- GCS (also supported using the [S3 table engine](#table_engine-mergetree-s3)) -- [Azure Blob Storage](#table_engine-mergetree-azure-blob-storage) -- [HDFS](#hdfs-storage) -- [Web (read-only)](#web-storage) +In addition to local block devices, ClickHouse supports these storage types: +- [`s3` for S3 and MinIO](#table_engine-mergetree-s3) +- [`gcs` for GCS](/docs/en/integrations/data-ingestion/gcs/index.md/#creating-a-disk) +- [`blob_storage_disk` for Azure Blob Storage](#table_engine-mergetree-azure-blob-storage) +- [`hdfs` for HDFS](#hdfs-storage) +- [`web` for read-only from web](#web-storage) +- [`cache` for local caching](/docs/en/operations/storing-data.md/#using-local-cache) +- [`s3_plain` for backups to S3](/docs/en/operations/backup#backuprestore-using-an-s3-disk) ## Using Multiple Block Devices for Data Storage {#table_engine-mergetree-multiple-volumes} From 90129b92c1dd47c0aa86cd5ed7c8107758a74b51 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 29 Jun 2023 21:19:41 +0200 Subject: [PATCH 24/24] Update 23.6 changelog --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a2e7b021081..c1e0dba4465 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,7 +21,7 @@ * Added `Overlay` database engine to combine multiple databases into one. Added `Filesystem` database engine to represent a directory in the filesystem as a set of implicitly available tables with auto-detected formats and structures. A new `S3` database engine allows to read-only interact with s3 storage by representing a prefix as a set of tables. A new `HDFS` database engine allows to interact with HDFS storage in the same way. [#48821](https://github.com/ClickHouse/ClickHouse/pull/48821) ([alekseygolub](https://github.com/alekseygolub)). * The function `transform` as well as `CASE` with value matching started to support all data types. This closes [#29730](https://github.com/ClickHouse/ClickHouse/issues/29730). This closes [#32387](https://github.com/ClickHouse/ClickHouse/issues/32387). This closes [#50827](https://github.com/ClickHouse/ClickHouse/issues/50827). This closes [#31336](https://github.com/ClickHouse/ClickHouse/issues/31336). This closes [#40493](https://github.com/ClickHouse/ClickHouse/issues/40493). [#51351](https://github.com/ClickHouse/ClickHouse/pull/51351) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Added option `--rename_files_after_processing `. This closes [#34207](https://github.com/ClickHouse/ClickHouse/issues/34207). [#49626](https://github.com/ClickHouse/ClickHouse/pull/49626) ([alekseygolub](https://github.com/alekseygolub)). -* Add support for `APPEND` modifier in `INTO OUTFILE` clause. Suggest using `APPEND` or `TRUNCATE` for `INTO OUTFILE` when file exists. [#50950](https://github.com/ClickHouse/ClickHouse/pull/50950) ([alekar](https://github.com/alekar)). +* Add support for `TRUNCATE` modifier in `INTO OUTFILE` clause. Suggest using `APPEND` or `TRUNCATE` for `INTO OUTFILE` when file exists. [#50950](https://github.com/ClickHouse/ClickHouse/pull/50950) ([alekar](https://github.com/alekar)). * Add table engine `Redis` and table function `redis`. It allows querying external Redis servers. [#50150](https://github.com/ClickHouse/ClickHouse/pull/50150) ([JackyWoo](https://github.com/JackyWoo)). 
* Allow to skip empty files in file/s3/url/hdfs table functions using settings `s3_skip_empty_files`, `hdfs_skip_empty_files`, `engine_file_skip_empty_files`, `engine_url_skip_empty_files`. [#50364](https://github.com/ClickHouse/ClickHouse/pull/50364) ([Kruglov Pavel](https://github.com/Avogar)). * Add a new setting named `use_mysql_types_in_show_columns` to alter the `SHOW COLUMNS` SQL statement to display MySQL equivalent types when a client is connected via the MySQL compatibility port. [#49577](https://github.com/ClickHouse/ClickHouse/pull/49577) ([Thomas Panetti](https://github.com/tpanetti)). @@ -40,12 +40,12 @@ * Make multiple list requests to ZooKeeper in parallel to speed up reading from system.zookeeper table. [#51042](https://github.com/ClickHouse/ClickHouse/pull/51042) ([Alexander Gololobov](https://github.com/davenger)). * Speedup initialization of DateTime lookup tables for time zones. This should reduce startup/connect time of clickhouse-client especially in debug build as it is rather heavy. [#51347](https://github.com/ClickHouse/ClickHouse/pull/51347) ([Alexander Gololobov](https://github.com/davenger)). * Fix data lakes slowness because of synchronous head requests. (Related to Iceberg/Deltalake/Hudi being slow with a lot of files). [#50976](https://github.com/ClickHouse/ClickHouse/pull/50976) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Do not replicate `ALTER PARTITION` queries and mutations through `Replicated` database if it has only one shard and the underlying table is `ReplicatedMergeTree`. [#51049](https://github.com/ClickHouse/ClickHouse/pull/51049) ([Alexander Tokmakov](https://github.com/tavplubix)). * Do not read all the columns from right GLOBAL JOIN table. [#50721](https://github.com/ClickHouse/ClickHouse/pull/50721) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). #### Experimental Feature * Support parallel replicas with the analyzer. [#50441](https://github.com/ClickHouse/ClickHouse/pull/50441) ([Raúl Marín](https://github.com/Algunenano)). * Add random sleep before large merges/mutations execution to split load more evenly between replicas in case of zero-copy replication. [#51282](https://github.com/ClickHouse/ClickHouse/pull/51282) ([alesapin](https://github.com/alesapin)). +* Do not replicate `ALTER PARTITION` queries and mutations through `Replicated` database if it has only one shard and the underlying table is `ReplicatedMergeTree`. [#51049](https://github.com/ClickHouse/ClickHouse/pull/51049) ([Alexander Tokmakov](https://github.com/tavplubix)). #### Improvement * Relax the thresholds for "too many parts" to be more modern. Return the backpressure during long-running insert queries. [#50856](https://github.com/ClickHouse/ClickHouse/pull/50856) ([Alexey Milovidov](https://github.com/alexey-milovidov)).