From 765a5e4685daf1983d083bc71cf8872e730cad7f Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Thu, 31 Jul 2014 17:58:49 +0400 Subject: [PATCH 001/127] clickhouse-test: small fix. [#METR-2807] --- dbms/tests/clickhouse-test | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dbms/tests/clickhouse-test b/dbms/tests/clickhouse-test index 225c83abf72..92375f645c4 100755 --- a/dbms/tests/clickhouse-test +++ b/dbms/tests/clickhouse-test @@ -34,6 +34,10 @@ ERRORS=0 if [ "$1" == "--zookeeper" ]; then ZOOKEEPER=1 + shift +elif [ "$1" == "--no-zookeeper" ]; then + ZOOKEEPER=0 + shift elif grep -q ' Date: Fri, 1 Aug 2014 13:30:56 +0400 Subject: [PATCH 002/127] active_parts.py: small improvement. [#METR-2807] --- dbms/src/Storages/tests/active_parts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Storages/tests/active_parts.py b/dbms/src/Storages/tests/active_parts.py index 24fb4a5c7e4..2199a6e5adf 100644 --- a/dbms/src/Storages/tests/active_parts.py +++ b/dbms/src/Storages/tests/active_parts.py @@ -30,7 +30,7 @@ for m in parts: if x1 >= x2 and y1 <= y2 and l1 < l2 and (x1, y1) != (x2, y2): # 2 contains 1 pass elif x1 > y2: # 1 is to the right of 2 - if x1 != y2 + 1: + if x1 != y2 + 1 and y1 != -1: print # чтобы на глаз было видно пропущенные номера (x2, y2, l2, s2) = (x1, y1, l1, s1) print s1 From 0ff766765448d9ce4a8966177292b81096ab21dd Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Fri, 1 Aug 2014 13:32:31 +0400 Subject: [PATCH 003/127] Merge --- .../Storages/MergeTree/MergeTreeDataMerger.cpp | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataMerger.cpp b/dbms/src/Storages/MergeTree/MergeTreeDataMerger.cpp index a5122ecb4d5..a528abf1c7a 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataMerger.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeDataMerger.cpp @@ -37,7 +37,7 @@ static const double DISK_USAGE_COEFFICIENT_TO_RESERVE = 1.4; /// 5) С ростом 
логарифма суммарного размера кусочков в мердже увеличиваем требование сбалансированности bool MergeTreeDataMerger::selectPartsToMerge(MergeTreeData::DataPartsVector & parts, String & merged_name, size_t available_disk_space, - bool merge_anything_for_old_months, bool aggressive, bool only_small, const AllowedMergingPredicate & can_merge) + bool merge_anything_for_old_months, bool aggressive, bool only_small, const AllowedMergingPredicate & can_merge_callback) { MergeTreeData::DataParts data_parts = data.getDataParts(); @@ -66,6 +66,19 @@ bool MergeTreeDataMerger::selectPartsToMerge(MergeTreeData::DataPartsVector & pa if (only_small) cur_max_bytes_to_merge_parts = data.settings.max_bytes_to_merge_parts_small; + /// Мемоизация для функции can_merge_callback. Результат вызова can_merge_callback для этого куска и предыдущего в data_parts. + std::map can_merge_with_previous; + auto can_merge = [&can_merge_with_previous, &can_merge_callback] + (const MergeTreeData::DataPartPtr & first, const MergeTreeData::DataPartPtr & second) -> bool + { + auto it = can_merge_with_previous.find(second); + if (it != can_merge_with_previous.end()) + return it->second; + bool res = can_merge_callback(first, second); + can_merge_with_previous[second] = res; + return res; + }; + /// Найдем суммарный размер еще не пройденных кусков (то есть всех). size_t size_in_bytes_of_remaining_parts = 0; for (const auto & part : data_parts) From 02f0ecf88662e06c296bfd9a68911cf6f9e89169 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Fri, 1 Aug 2014 13:36:28 +0400 Subject: [PATCH 004/127] Added watchparts.sh for the unlikely case anyone else needs it. 
[#METR-2807] --- dbms/src/Storages/tests/watchparts.sh | 5 +++++ 1 file changed, 5 insertions(+) create mode 100755 dbms/src/Storages/tests/watchparts.sh diff --git a/dbms/src/Storages/tests/watchparts.sh b/dbms/src/Storages/tests/watchparts.sh new file mode 100755 index 00000000000..ad1fc68d1ed --- /dev/null +++ b/dbms/src/Storages/tests/watchparts.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +# Принимает аргументом директорию с кусками. Постоянно показывает список активных кусков и количество всех кусков. + +watch "ls $1 | grep -Pc '^[0-9]{8}_[0-9]{8}_'; ls $1 | active_parts.py | grep -Pc '^[0-9]{8}_[0-9]{8}_'; ls $1 | active_parts.py" From 757aa8891df9f57f6c86f65b4981d21accab0a7a Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Fri, 1 Aug 2014 13:45:59 +0400 Subject: [PATCH 005/127] Merge --- dbms/src/Storages/StorageReplicatedMergeTree.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index 8243124c007..eba898d9508 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -1521,7 +1521,9 @@ void StorageReplicatedMergeTree::partCheckThread() } } /// Если куска нет в ZooKeeper, удалим его локально. - else + /// Возможно, кусок кто-то только что записал, и еще не успел добавить в ZK. + /// Поэтому удаляем только если кусок старый (не очень надежно). 
+ else if (part->modification_time + 5 * 60 < time(0)) { ProfileEvents::increment(ProfileEvents::ReplicatedPartChecksFailed); From 0b37543bd8ccdf49ed21316c248bd0e2c6aabf83 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Fri, 1 Aug 2014 13:52:55 +0400 Subject: [PATCH 006/127] Merge --- .../Storages/StorageReplicatedMergeTree.cpp | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index eba898d9508..5297297d198 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -1118,6 +1118,20 @@ void StorageReplicatedMergeTree::mergeSelectingThread() true, false, has_big_merge, can_merge)) break; + bool all_in_zk = true; + for (const auto & part : parts) + { + /// Если о каком-то из кусков нет информации в ZK, не будем сливать. + if (!zookeeper->exists(replica_path + "/parts/" + part->name)) + { + LOG_WARNING(log, "Part " << part->name << " exists locally but not in ZooKeeper."); + enqueuePartForCheck(part->name); + all_in_zk = false; + } + } + if (!all_in_zk) + break; + LogEntry entry; entry.type = LogEntry::MERGE_PARTS; entry.source_replica = replica_name; @@ -1567,11 +1581,6 @@ bool StorageReplicatedMergeTree::canMergeParts(const MergeTreeData::DataPartPtr virtual_parts.getContainingPart(right->name) != right->name) return false; - /// Если о каком-то из кусков нет информации в ZK, не будем сливать. - if (!zookeeper->exists(replica_path + "/parts/" + left->name) || - !zookeeper->exists(replica_path + "/parts/" + right->name)) - return false; - String month_name = left->name.substr(0, 6); /// Можно слить куски, если все номера между ними заброшены - не соответствуют никаким блокам. From 5533a047c52d006597e8346abe42509f21cefd2a Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Fri, 1 Aug 2014 14:04:36 +0400 Subject: [PATCH 007/127] active_parts.py: tiny improvement. 
[#METR-2807] --- dbms/src/Storages/tests/active_parts.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dbms/src/Storages/tests/active_parts.py b/dbms/src/Storages/tests/active_parts.py index 2199a6e5adf..6519bd75c1f 100644 --- a/dbms/src/Storages/tests/active_parts.py +++ b/dbms/src/Storages/tests/active_parts.py @@ -36,3 +36,4 @@ for m in parts: print s1 else: raise Exception('invalid parts intersection: ' + s1 + ' and ' + s2) + print From c112605fea3dffe305e1eaf0b45a52180ea5d22e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 1 Aug 2014 23:14:21 +0400 Subject: [PATCH 008/127] dbms: fixed aggregate function uniqUpTo [#METR-12017]. --- .../AggregateFunctionUniqUpTo.h | 38 +++++++++++++------ 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/dbms/include/DB/AggregateFunctions/AggregateFunctionUniqUpTo.h b/dbms/include/DB/AggregateFunctions/AggregateFunctionUniqUpTo.h index 9bff8d8207a..8a7a0b26933 100644 --- a/dbms/include/DB/AggregateFunctions/AggregateFunctionUniqUpTo.h +++ b/dbms/include/DB/AggregateFunctions/AggregateFunctionUniqUpTo.h @@ -19,8 +19,15 @@ namespace DB template struct __attribute__((__packed__)) AggregateFunctionUniqUpToData { + /** Если count == threshold + 1 - это значит, что "переполнилось" (значений больше threshold). + * В этом случае (например, после вызова функции merge), массив data не обязательно содержит инициализированные значения + * - пример: объединяем состояние, в котором мало значений, с другим состоянием, которое переполнилось; + * тогда выставляем count в threshold + 1, а значения из другого состояния не копируем. + */ UInt8 count = 0; - T data[0]; /// Данные идут после конца структуры. При вставке, делается линейный поиск. + + /// Данные идут после конца структуры. При вставке, делается линейный поиск. + T data[0]; size_t size() const @@ -31,17 +38,20 @@ struct __attribute__((__packed__)) AggregateFunctionUniqUpToData /// threshold - для скольки элементов есть место в data. 
void insert(T x, UInt8 threshold) { + /// Состояние уже переполнено - ничего делать не нужно. if (count > threshold) return; - size_t limit = std::min(count, threshold); - for (size_t i = 0; i < limit; ++i) + /// Линейный поиск совпадающего элемента. + for (size_t i = 0; i < count; ++i) if (data[i] == x) return; + /// Не нашли совпадающий элемент. Если есть место ещё для одного элемента - вставляем его. if (count < threshold) data[count] = x; + /// После увеличения count, состояние может оказаться переполненным. ++count; } @@ -52,19 +62,22 @@ struct __attribute__((__packed__)) AggregateFunctionUniqUpToData if (rhs.count > threshold) { + /// Если rhs переполнено, то выставляем у текущего состояния count тоже переполненным. count = rhs.count; return; } - size_t limit = std::min(rhs.count, threshold); - for (size_t i = 0; i < limit; ++i) + for (size_t i = 0; i < rhs.count; ++i) insert(rhs.data[i], threshold); } void write(WriteBuffer & wb, UInt8 threshold) const { - size_t limit = std::min(count, threshold); - wb.write(reinterpret_cast(this), sizeof(*this) + limit * sizeof(data[0])); + writeBinary(count, wb); + + /// Пишем значения, только если состояние не переполнено. Иначе они не нужны, а важен только факт того, что состояние переполнено. + if (count <= threshold) + wb.write(reinterpret_cast(this), count * sizeof(data[0])); } void readAndMerge(ReadBuffer & rb, UInt8 threshold) @@ -72,11 +85,14 @@ struct __attribute__((__packed__)) AggregateFunctionUniqUpToData UInt8 rhs_count; readBinary(rhs_count, rb); - if (rhs_count > threshold + 1) - throw Poco::Exception("Cannot read AggregateFunctionUniqUpToData: too large count."); + if (rhs_count > threshold) + { + /// Если rhs переполнено, то выставляем у текущего состояния count тоже переполненным. 
+ count = rhs_count; + return; + } - size_t limit = std::min(rhs_count, threshold); - for (size_t i = 0; i < limit; ++i) + for (size_t i = 0; i < rhs_count; ++i) { T x; readBinary(x, rb); From 5d4ea3d827b4228e70a0f6c9c1b4934e13668166 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 1 Aug 2014 23:31:38 +0400 Subject: [PATCH 009/127] dbms: uniqUpTo: small fix [#METR-12017]. --- .../DB/AggregateFunctions/AggregateFunctionUniqUpTo.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dbms/include/DB/AggregateFunctions/AggregateFunctionUniqUpTo.h b/dbms/include/DB/AggregateFunctions/AggregateFunctionUniqUpTo.h index 8a7a0b26933..79d122255da 100644 --- a/dbms/include/DB/AggregateFunctions/AggregateFunctionUniqUpTo.h +++ b/dbms/include/DB/AggregateFunctions/AggregateFunctionUniqUpTo.h @@ -151,11 +151,13 @@ public: if (params.size() != 1) throw Exception("Aggregate function " + getName() + " requires exactly one parameter.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - threshold = apply_visitor(FieldVisitorConvertToNumber(), params[0]); + UInt64 threshold_param = apply_visitor(FieldVisitorConvertToNumber(), params[0]); - if (threshold > uniq_upto_max_threshold) + if (threshold_param > uniq_upto_max_threshold) throw Exception("Too large parameter for aggregate function " + getName() + ". Maximum: " + toString(uniq_upto_max_threshold), ErrorCodes::ARGUMENT_OUT_OF_BOUND); + + threshold = threshold_param; } void addOne(AggregateDataPtr place, const IColumn & column, size_t row_num) const From 304f2f274da379943cf205123e26cac640069eeb Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Mon, 4 Aug 2014 12:12:08 +0400 Subject: [PATCH 010/127] active_parts.py: small improvements. 
[#METR-2807] --- dbms/src/Storages/tests/active_parts.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dbms/src/Storages/tests/active_parts.py b/dbms/src/Storages/tests/active_parts.py index 6519bd75c1f..978ea86eb25 100644 --- a/dbms/src/Storages/tests/active_parts.py +++ b/dbms/src/Storages/tests/active_parts.py @@ -23,14 +23,14 @@ for s in sys.stdin.read().split(): parts[m1] = [] parts[m1].append((i1, i2, l, s)) -for m in parts: - parts[m].sort(key=lambda (i1, i2, l, s): (i1, -i2, -l)) +for m, ps in sorted(parts.items()): + ps.sort(key=lambda (i1, i2, l, s): (i1, -i2, -l)) (x2, y2, l2, s2) = (-1, -1, -1, -1) - for x1, y1, l1, s1 in parts[m]: + for x1, y1, l1, s1 in ps: if x1 >= x2 and y1 <= y2 and l1 < l2 and (x1, y1) != (x2, y2): # 2 contains 1 pass elif x1 > y2: # 1 is to the right of 2 - if x1 != y2 + 1 and y1 != -1: + if x1 != y2 + 1 and y2 != -1: print # чтобы на глаз было видно пропущенные номера (x2, y2, l2, s2) = (x1, y1, l1, s1) print s1 From e9ef0705f34db3540c7ce24d534ca62d7c2aecf6 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Mon, 4 Aug 2014 15:41:59 +0400 Subject: [PATCH 011/127] Merge --- .../DB/Storages/MergeTree/DiskSpaceMonitor.h | 47 +++++++++++++++++-- .../Storages/MergeTree/MergeTreeDataMerger.h | 9 +++- .../Storages/MergeTree/DiskSpaceMonitor.cpp | 3 +- .../MergeTree/MergeTreeDataMerger.cpp | 20 +++++++- dbms/src/Storages/StorageMergeTree.cpp | 2 +- 5 files changed, 70 insertions(+), 11 deletions(-) diff --git a/dbms/include/DB/Storages/MergeTree/DiskSpaceMonitor.h b/dbms/include/DB/Storages/MergeTree/DiskSpaceMonitor.h index 5c81a0d4dfd..04a28e996c1 100644 --- a/dbms/include/DB/Storages/MergeTree/DiskSpaceMonitor.h +++ b/dbms/include/DB/Storages/MergeTree/DiskSpaceMonitor.h @@ -26,27 +26,51 @@ public: { try { - Poco::ScopedLock lock(DiskSpaceMonitor::reserved_bytes_mutex); + Poco::ScopedLock lock(DiskSpaceMonitor::mutex); if (DiskSpaceMonitor::reserved_bytes < size) { DiskSpaceMonitor::reserved_bytes = 0; - 
LOG_ERROR(&Logger::get("DiskSpaceMonitor"), "Unbalanced reservations; it's a bug"); + LOG_ERROR(&Logger::get("DiskSpaceMonitor"), "Unbalanced reservations size; it's a bug"); } else { DiskSpaceMonitor::reserved_bytes -= size; } + + if (DiskSpaceMonitor::reservation_count == 0) + { + LOG_ERROR(&Logger::get("DiskSpaceMonitor"), "Unbalanced reservation count; it's a bug"); + } + else + { + --DiskSpaceMonitor::reservation_count; + } } catch (...) { tryLogCurrentException("~DiskSpaceMonitor"); } } + + /// Изменить количество зарезервированного места. При увеличении не делается проверка, что места достаточно. + void update(size_t new_size) + { + Poco::ScopedLock lock(DiskSpaceMonitor::mutex); + DiskSpaceMonitor::reserved_bytes -= size; + size = new_size; + DiskSpaceMonitor::reserved_bytes += size; + } + + size_t getSize() const + { + return size; + } private: Reservation(size_t size_) : size(size_) { - Poco::ScopedLock lock(DiskSpaceMonitor::reserved_bytes_mutex); + Poco::ScopedLock lock(DiskSpaceMonitor::mutex); DiskSpaceMonitor::reserved_bytes += size; + ++DiskSpaceMonitor::reservation_count; } size_t size; }; @@ -65,7 +89,7 @@ public: /// Зарезервируем дополнительно 30 МБ. Когда я тестировал, statvfs показывал на несколько мегабайт больше свободного места, чем df. res -= std::min(res, 30 * (1ul << 20)); - Poco::ScopedLock lock(reserved_bytes_mutex); + Poco::ScopedLock lock(mutex); if (reserved_bytes > res) res = 0; @@ -75,6 +99,18 @@ public: return res; } + static size_t getReservedSpace() + { + Poco::ScopedLock lock(mutex); + return reserved_bytes; + } + + static size_t getReservationCount() + { + Poco::ScopedLock lock(mutex); + return reservation_count; + } + /// Если места (приблизительно) недостаточно, бросает исключение. 
static ReservationPtr reserve(const std::string & path, size_t size) { @@ -87,7 +123,8 @@ public: private: static size_t reserved_bytes; - static Poco::FastMutex reserved_bytes_mutex; + static size_t reservation_count; + static Poco::FastMutex mutex; }; } diff --git a/dbms/include/DB/Storages/MergeTree/MergeTreeDataMerger.h b/dbms/include/DB/Storages/MergeTree/MergeTreeDataMerger.h index 39a117d3f06..a0c06ce5dcd 100644 --- a/dbms/include/DB/Storages/MergeTree/MergeTreeDataMerger.h +++ b/dbms/include/DB/Storages/MergeTree/MergeTreeDataMerger.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace DB { @@ -34,9 +35,13 @@ public: bool only_small, const AllowedMergingPredicate & can_merge); - /// Сливает куски. + /** Сливает куски. + * Если reservation != nullptr, то и дело уменьшает размер зарезервированного места + * приблизительно пропорционально количеству уже выписанных данных. + */ MergeTreeData::DataPartPtr mergeParts( - const MergeTreeData::DataPartsVector & parts, const String & merged_name, MergeTreeData::Transaction * out_transaction = nullptr); + const MergeTreeData::DataPartsVector & parts, const String & merged_name, + MergeTreeData::Transaction * out_transaction = nullptr, DiskSpaceMonitor::Reservation * disk_reservation = nullptr); /// Примерное количество места на диске, нужное для мерджа. С запасом. 
size_t estimateDiskSpaceForMerge(const MergeTreeData::DataPartsVector & parts); diff --git a/dbms/src/Storages/MergeTree/DiskSpaceMonitor.cpp b/dbms/src/Storages/MergeTree/DiskSpaceMonitor.cpp index 2175fa5c8fe..8aa67f741ef 100644 --- a/dbms/src/Storages/MergeTree/DiskSpaceMonitor.cpp +++ b/dbms/src/Storages/MergeTree/DiskSpaceMonitor.cpp @@ -4,6 +4,7 @@ namespace DB { size_t DiskSpaceMonitor::reserved_bytes; -Poco::FastMutex DiskSpaceMonitor::reserved_bytes_mutex; +size_t DiskSpaceMonitor::reservation_count; +Poco::FastMutex DiskSpaceMonitor::mutex; } diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataMerger.cpp b/dbms/src/Storages/MergeTree/MergeTreeDataMerger.cpp index a528abf1c7a..497152dcf70 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataMerger.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeDataMerger.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -200,7 +201,9 @@ bool MergeTreeDataMerger::selectPartsToMerge(MergeTreeData::DataPartsVector & pa { disk_space_warning_time = now; LOG_WARNING(log, "Won't merge parts from " << first_part->name << " to " << last_part->name - << " because not enough free space: " << available_disk_space << " free and unreserved, " + << " because not enough free space: " << available_disk_space << " free and unreserved " + << "(" << DiskSpaceMonitor::getReservedSpace() << " reserved in " + << DiskSpaceMonitor::getReservationCount() << " chunks), " << cur_sum << " required now (+" << static_cast((DISK_USAGE_COEFFICIENT_TO_SELECT - 1.0) * 100) << "% on overhead); suppressing similar warnings for the next hour"); } @@ -260,7 +263,8 @@ bool MergeTreeDataMerger::selectPartsToMerge(MergeTreeData::DataPartsVector & pa /// parts должны быть отсортированы. 
MergeTreeData::DataPartPtr MergeTreeDataMerger::mergeParts( - const MergeTreeData::DataPartsVector & parts, const String & merged_name, MergeTreeData::Transaction * out_transaction) + const MergeTreeData::DataPartsVector & parts, const String & merged_name, + MergeTreeData::Transaction * out_transaction, DiskSpaceMonitor::Reservation * disk_reservation) { LOG_DEBUG(log, "Merging " << parts.size() << " parts: from " << parts.front()->name << " to " << parts.back()->name << " into " << merged_name); @@ -290,12 +294,15 @@ MergeTreeData::DataPartPtr MergeTreeDataMerger::mergeParts( */ BlockInputStreams src_streams; + size_t sum_rows_approx = 0; + for (size_t i = 0; i < parts.size(); ++i) { MarkRanges ranges(1, MarkRange(0, parts[i]->size)); src_streams.push_back(new ExpressionBlockInputStream(new MergeTreeBlockInputStream( data.getFullPath() + parts[i]->name + '/', DEFAULT_MERGE_BLOCK_SIZE, union_column_names, data, parts[i], ranges, false, nullptr, ""), data.getPrimaryExpression())); + sum_rows_approx += parts[i]->size * data.index_granularity; } /// Порядок потоков важен: при совпадении ключа элементы идут в порядке номера потока-источника. @@ -332,10 +339,19 @@ MergeTreeData::DataPartPtr MergeTreeDataMerger::mergeParts( merged_stream->readPrefix(); to->writePrefix(); + size_t rows_written = 0; + size_t initial_reservation = disk_reservation->getSize(); + Block block; while (!canceled && (block = merged_stream->read())) + { + rows_written += block.rows(); to->write(block); + if (disk_reservation) + disk_reservation->update(static_cast((1 - std::min(1., 1. 
* rows_written / sum_rows_approx)) * initial_reservation)); + } + if (canceled) throw Exception("Canceled merging parts", ErrorCodes::ABORTED); diff --git a/dbms/src/Storages/StorageMergeTree.cpp b/dbms/src/Storages/StorageMergeTree.cpp index 162688ceae3..dc02a613588 100644 --- a/dbms/src/Storages/StorageMergeTree.cpp +++ b/dbms/src/Storages/StorageMergeTree.cpp @@ -180,7 +180,7 @@ bool StorageMergeTree::merge(bool aggressive, BackgroundProcessingPool::Context } } - merger.mergeParts(merging_tagger->parts, merged_name); + merger.mergeParts(merging_tagger->parts, merged_name, nullptr, &*merging_tagger->reserved_space); return true; } From 9a50694f22f4cc2d7533d7dc5c909773c0bf02b5 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Mon, 4 Aug 2014 18:23:47 +0400 Subject: [PATCH 012/127] Merge --- dbms/include/DB/Storages/StorageReplicatedMergeTree.h | 1 + dbms/src/Storages/StorageReplicatedMergeTree.cpp | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/dbms/include/DB/Storages/StorageReplicatedMergeTree.h b/dbms/include/DB/Storages/StorageReplicatedMergeTree.h index 9acc7d1cf35..51dd441ff93 100644 --- a/dbms/include/DB/Storages/StorageReplicatedMergeTree.h +++ b/dbms/include/DB/Storages/StorageReplicatedMergeTree.h @@ -248,6 +248,7 @@ private: std::unique_ptr unreplicated_data; std::unique_ptr unreplicated_reader; std::unique_ptr unreplicated_merger; + Poco::FastMutex unreplicated_mutex; /// Для мерджей и удаления нереплицируемых кусков. /// Потоки: diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index 5297297d198..d92fb4c2c28 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -1892,11 +1892,12 @@ bool StorageReplicatedMergeTree::optimize() { /// Померджим какие-нибудь куски из директории unreplicated. /// TODO: Мерджить реплицируемые куски тоже. 
- /// TODO: Не давать вызывать это из нескольких потоков сразу: один кусок может принять участие в нескольких несовместимых слияниях. if (!unreplicated_data) return false; + Poco::ScopedLock lock(unreplicated_mutex); + unreplicated_data->clearOldParts(); MergeTreeData::DataPartsVector parts; From 6fd0f417b423e7521768d0d9d082b786fce12844 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Mon, 4 Aug 2014 19:48:03 +0400 Subject: [PATCH 013/127] Merge --- .../DB/Storages/StorageReplicatedMergeTree.h | 1 + .../MergeTree/MergeTreeDataMerger.cpp | 2 +- .../Storages/StorageReplicatedMergeTree.cpp | 39 +++++++++++-------- 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/dbms/include/DB/Storages/StorageReplicatedMergeTree.h b/dbms/include/DB/Storages/StorageReplicatedMergeTree.h index 51dd441ff93..5f82e157a82 100644 --- a/dbms/include/DB/Storages/StorageReplicatedMergeTree.h +++ b/dbms/include/DB/Storages/StorageReplicatedMergeTree.h @@ -140,6 +140,7 @@ private: Strings parts_to_merge; FuturePartTaggerPtr future_part_tagger; + bool currently_executing = false; void addResultToVirtualParts(StorageReplicatedMergeTree & storage) { diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataMerger.cpp b/dbms/src/Storages/MergeTree/MergeTreeDataMerger.cpp index 497152dcf70..4d453dd1dbd 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataMerger.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeDataMerger.cpp @@ -340,7 +340,7 @@ MergeTreeData::DataPartPtr MergeTreeDataMerger::mergeParts( to->writePrefix(); size_t rows_written = 0; - size_t initial_reservation = disk_reservation->getSize(); + size_t initial_reservation = disk_reservation ? 
disk_reservation->getSize() : 0; Block block; while (!canceled && (block = merged_stream->read())) diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index d92fb4c2c28..13045302c4e 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -990,11 +990,12 @@ bool StorageReplicatedMergeTree::queueTask(BackgroundProcessingPool::Context & p { for (LogEntries::iterator it = queue.begin(); it != queue.end(); ++it) { - if (shouldExecuteLogEntry(*it)) + if (!it->currently_executing && shouldExecuteLogEntry(*it)) { entry = *it; entry.tagPartAsFuture(*this); - queue.erase(it); + queue.splice(queue.end(), queue, it); + it->currently_executing = true; have_work = true; break; } @@ -1014,14 +1015,15 @@ bool StorageReplicatedMergeTree::queueTask(BackgroundProcessingPool::Context & p try { - success = executeLogEntry(entry, pool_context); - - if (success) + if (executeLogEntry(entry, pool_context)) { auto code = zookeeper->tryRemove(replica_path + "/queue/" + entry.znode_name); + if (code != ZOK) LOG_ERROR(log, "Couldn't remove " << replica_path + "/queue/" + entry.znode_name << ": " - << zkutil::ZooKeeper::error2string(code) + ". There must be a bug somewhere. Ignoring it."); + << zkutil::ZooKeeper::error2string(code) + ". This shouldn't happen often."); + + success = true; } exception = false; @@ -1039,12 +1041,21 @@ bool StorageReplicatedMergeTree::queueTask(BackgroundProcessingPool::Context & p tryLogCurrentException(__PRETTY_FUNCTION__); } - if (!success) + /// Удалим задание из очереди или отметим, что мы его больше не выполняем. + /// Нельзя просто обратиться по заранее сохраненному итератору, потому что задание мог успеть удалить кто-то другой. 
+ entry.future_part_tagger = nullptr; + Poco::ScopedLock lock(queue_mutex); + for (LogEntries::iterator it = queue.end(); it != queue.begin();) { - /// Добавим действие, которое не получилось выполнить, в конец очереди. - entry.future_part_tagger = nullptr; - Poco::ScopedLock lock(queue_mutex); - queue.push_back(entry); + --it; + if (it->znode_name == entry.znode_name) + { + if (success) + queue.erase(it); + else + it->currently_executing = false; + break; + } } /// Если не было исключения, не нужно спать. @@ -1453,15 +1464,11 @@ void StorageReplicatedMergeTree::partCheckThread() { Poco::ScopedLock lock(queue_mutex); - /** NOTE: Не удалятся записи в очереди, которые сейчас выполняются. - * Они пофейлятся и положат кусок снова в очередь на проверку. - * Расчитываем, что это редкая ситуация. - */ for (LogEntries::iterator it = queue.begin(); it != queue.end(); ) { if (it->new_part_name == part_name) { - zookeeper->remove(replica_path + "/queue/" + it->znode_name); + zookeeper->tryRemove(replica_path + "/queue/" + it->znode_name); queue.erase(it++); was_in_queue = true; } From 3ab7260c162b976a83053e9a9ad309202479317b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 4 Aug 2014 23:48:50 +0400 Subject: [PATCH 014/127] dbms: sending exception message even if part of data has been already sent [#METR-12034]. 
--- dbms/src/Server/HTTPHandler.cpp | 49 +++++++++++++++++++++++---------- dbms/src/Server/HTTPHandler.h | 16 +++++++++-- 2 files changed, 48 insertions(+), 17 deletions(-) diff --git a/dbms/src/Server/HTTPHandler.cpp b/dbms/src/Server/HTTPHandler.cpp index 9f4f3690bb4..d9c65783e70 100644 --- a/dbms/src/Server/HTTPHandler.cpp +++ b/dbms/src/Server/HTTPHandler.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -28,7 +27,7 @@ namespace DB { -void HTTPHandler::processQuery(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response) +void HTTPHandler::processQuery(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response, Output & used_output) { LOG_TRACE(log, "Request URI: " << request.getURI()); @@ -47,13 +46,12 @@ void HTTPHandler::processQuery(Poco::Net::HTTPServerRequest & request, Poco::Net query_param += '\n'; /// Если указано compress, то будем сжимать результат. - SharedPtr out = new WriteBufferFromHTTPServerResponse(response); - SharedPtr out_maybe_compressed; + used_output.out = new WriteBufferFromHTTPServerResponse(response); if (parse(params.get("compress", "0"))) - out_maybe_compressed = new CompressedWriteBuffer(*out); + used_output.out_maybe_compressed = new CompressedWriteBuffer(*used_output.out); else - out_maybe_compressed = out; + used_output.out_maybe_compressed = used_output.out; /// Имя пользователя и пароль могут быть заданы как в параметрах URL, так и с помощью HTTP Basic authentification (и то, и другое не секъюрно). 
std::string user = params.get("user", "default"); @@ -138,7 +136,7 @@ void HTTPHandler::processQuery(Poco::Net::HTTPServerRequest & request, Poco::Net context.getSettingsRef().limits.readonly = true; Stopwatch watch; - executeQuery(*in, *out_maybe_compressed, context, query_plan); + executeQuery(*in, *used_output.out_maybe_compressed, context, query_plan); watch.stop(); if (query_plan) @@ -167,11 +165,13 @@ void HTTPHandler::processQuery(Poco::Net::HTTPServerRequest & request, Poco::Net LOG_INFO(log, "Quota:\n" << quota.toString()); /// Если не было эксепшена и данные ещё не отправлены - отправляются HTTP заголовки с кодом 200. - out->finalize(); + used_output.out->finalize(); } -void HTTPHandler::trySendExceptionToClient(std::stringstream & s, Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response) +void HTTPHandler::trySendExceptionToClient(std::stringstream & s, + Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response, + Output & used_output) { try { @@ -186,8 +186,25 @@ void HTTPHandler::trySendExceptionToClient(std::stringstream & s, Poco::Net::HTT } response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR); - if (!response.sent()) + + if (!response.sent() && !used_output.out_maybe_compressed) + { + /// Ещё ничего не отправляли, и даже не знаем, нужно ли сжимать ответ. response.send() << s.str() << std::endl; + } + else if (used_output.out_maybe_compressed) + { + /** Отправим в использованный (возможно сжатый) поток сообщение об ошибке. + * Сообщение об ошибке может идти невпопад - после каких-то данных. + * Также стоит иметь ввиду, что мы могли уже отправить код 200. + */ + + std::string exception_message = s.str(); + writeString(exception_message, *used_output.out_maybe_compressed); + writeChar('\n', *used_output.out_maybe_compressed); + used_output.out_maybe_compressed->next(); + used_output.out->finalize(); + } } catch (...) 
{ @@ -198,6 +215,8 @@ void HTTPHandler::trySendExceptionToClient(std::stringstream & s, Poco::Net::HTT void HTTPHandler::handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response) { + Output used_output; + try { bool is_browser = false; @@ -215,7 +234,7 @@ void HTTPHandler::handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Ne if (request.getVersion() == Poco::Net::HTTPServerRequest::HTTP_1_1) response.setChunkedTransferEncoding(true); - processQuery(request, response); + processQuery(request, response, used_output); LOG_INFO(log, "Done processing query"); } catch (Exception & e) @@ -224,26 +243,26 @@ void HTTPHandler::handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Ne s << "Code: " << e.code() << ", e.displayText() = " << e.displayText() << ", e.what() = " << e.what(); LOG_ERROR(log, s.str()); - trySendExceptionToClient(s, request, response); + trySendExceptionToClient(s, request, response, used_output); } catch (Poco::Exception & e) { std::stringstream s; s << "Code: " << ErrorCodes::POCO_EXCEPTION << ", e.code() = " << e.code() << ", e.displayText() = " << e.displayText() << ", e.what() = " << e.what(); - trySendExceptionToClient(s, request, response); + trySendExceptionToClient(s, request, response, used_output); } catch (std::exception & e) { std::stringstream s; s << "Code: " << ErrorCodes::STD_EXCEPTION << ". " << e.what(); - trySendExceptionToClient(s, request, response); + trySendExceptionToClient(s, request, response, used_output); } catch (...) { std::stringstream s; s << "Code: " << ErrorCodes::UNKNOWN_EXCEPTION << ". 
Unknown exception."; - trySendExceptionToClient(s, request, response); + trySendExceptionToClient(s, request, response, used_output); } } diff --git a/dbms/src/Server/HTTPHandler.h b/dbms/src/Server/HTTPHandler.h index 03c917e57c3..7357c7f5831 100644 --- a/dbms/src/Server/HTTPHandler.h +++ b/dbms/src/Server/HTTPHandler.h @@ -1,5 +1,6 @@ #pragma once +#include #include "Server.h" @@ -16,15 +17,26 @@ public: { } + struct Output + { + SharedPtr out; + /// Используется для выдачи ответа. Равен либо out, либо CompressedWriteBuffer(*out), в зависимости от настроек. + SharedPtr out_maybe_compressed; + }; + void handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response); - void trySendExceptionToClient(std::stringstream & s, Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response); + + void trySendExceptionToClient(std::stringstream & s, + Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response, + Output & used_output); private: Server & server; Logger * log; - void processQuery(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response); + /// Функция также инициализирует used_output. + void processQuery(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response, Output & used_output); }; } From 7bc99645e2f0136e765816cd616cda9ace9f54f5 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 5 Aug 2014 00:16:49 +0400 Subject: [PATCH 015/127] dbms: in case of exception, don't send data in buffer, if it hasn't already sent [#METR-12034]. 
--- dbms/src/DataStreams/JSONRowOutputStream.cpp | 2 -- dbms/src/Server/HTTPHandler.cpp | 7 +++++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/dbms/src/DataStreams/JSONRowOutputStream.cpp b/dbms/src/DataStreams/JSONRowOutputStream.cpp index 6794ebb52d6..9c43a8fa546 100644 --- a/dbms/src/DataStreams/JSONRowOutputStream.cpp +++ b/dbms/src/DataStreams/JSONRowOutputStream.cpp @@ -44,8 +44,6 @@ void JSONRowOutputStream::writePrefix() writeChar('\n', ostr); writeCString("\t\"data\":\n", ostr); writeCString("\t[\n", ostr); - - ostr.next(); } diff --git a/dbms/src/Server/HTTPHandler.cpp b/dbms/src/Server/HTTPHandler.cpp index d9c65783e70..96434d0777e 100644 --- a/dbms/src/Server/HTTPHandler.cpp +++ b/dbms/src/Server/HTTPHandler.cpp @@ -199,6 +199,13 @@ void HTTPHandler::trySendExceptionToClient(std::stringstream & s, * Также стоит иметь ввиду, что мы могли уже отправить код 200. */ + /** Если данные есть в буфере, но их ещё не отправили, то и не будем отправлять */ + if (used_output.out->count() - used_output.out->offset() == 0) + { + used_output.out_maybe_compressed->position() = used_output.out_maybe_compressed->buffer().begin(); + used_output.out->position() = used_output.out->buffer().begin(); + } + std::string exception_message = s.str(); writeString(exception_message, *used_output.out_maybe_compressed); writeChar('\n', *used_output.out_maybe_compressed); From cf5c4c07f38e490ca24d1aa89961bedec55288e8 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 5 Aug 2014 02:10:56 +0400 Subject: [PATCH 016/127] dbms: updated comment [#METR-2944]. 
--- dbms/src/Storages/StorageFactory.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Storages/StorageFactory.cpp b/dbms/src/Storages/StorageFactory.cpp index 7d8a1507e7b..c9d0fb6f809 100644 --- a/dbms/src/Storages/StorageFactory.cpp +++ b/dbms/src/Storages/StorageFactory.cpp @@ -189,7 +189,7 @@ StoragePtr StorageFactory::get( } else if (endsWith(name, "MergeTree")) { - /** Движки [Replicated][Summing|Collapsing]MergeTree (6 комбинаций) + /** Движки [Replicated][Summing|Collapsing|Aggregating|]MergeTree (8 комбинаций) * В качестве аргумента для движка должно быть указано: * - (для Replicated) Путь к таблице в ZooKeeper * - (для Replicated) Имя реплики в ZooKeeper From 3adfe8d53674da4846f8cdf13244de8f86574c87 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Tue, 5 Aug 2014 12:57:17 +0400 Subject: [PATCH 017/127] Merge --- dbms/src/Storages/StorageReplicatedMergeTree.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index 13045302c4e..8ded9c49f96 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -1889,8 +1889,9 @@ BlockOutputStreamPtr StorageReplicatedMergeTree::write(ASTPtr query) throw Exception("Table is in read only mode", ErrorCodes::TABLE_IS_READ_ONLY); String insert_id; - if (ASTInsertQuery * insert = typeid_cast(&*query)) - insert_id = insert->insert_id; + if (query) + if (ASTInsertQuery * insert = typeid_cast(&*query)) + insert_id = insert->insert_id; return new ReplicatedMergeTreeBlockOutputStream(*this, insert_id); } From 1a39b3bf5ce6548def85b34d1eb0d73d9b98df8c Mon Sep 17 00:00:00 2001 From: Pavel Kartavyy Date: Thu, 31 Jul 2014 17:39:23 +0400 Subject: [PATCH 018/127] dbms: added checData method in TinyLog storage [#METR-11709] --- dbms/include/DB/Storages/IStorage.h | 3 +++ dbms/include/DB/Storages/StorageTinyLog.h | 10 ++++++++ 
dbms/src/Storages/StorageTinyLog.cpp | 31 ++++++++++++++++++++++- 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/dbms/include/DB/Storages/IStorage.h b/dbms/include/DB/Storages/IStorage.h index 97b07831a96..71ae27cf918 100644 --- a/dbms/include/DB/Storages/IStorage.h +++ b/dbms/include/DB/Storages/IStorage.h @@ -248,6 +248,9 @@ public: /// Поддерживается ли индекс в секции IN virtual bool supportsIndexForIn() const { return false; }; + /// проверяет валидность данных + virtual bool checkData() const { throw DB::Exception("Check query is not supported for " + getName() + " storage"); } + protected: IStorage() : is_dropped(false) {} diff --git a/dbms/include/DB/Storages/StorageTinyLog.h b/dbms/include/DB/Storages/StorageTinyLog.h index 0f00aac036d..20332d3fa45 100644 --- a/dbms/include/DB/Storages/StorageTinyLog.h +++ b/dbms/include/DB/Storages/StorageTinyLog.h @@ -12,6 +12,7 @@ #include #include #include +#include namespace DB @@ -129,6 +130,7 @@ public: void rename(const String & new_path_to_db, const String & new_database_name, const String & new_table_name); + bool checkData() const override; private: String path; String name; @@ -144,9 +146,17 @@ private: typedef std::map Files_t; Files_t files; + /// хранит размеры всех столбцов, чтобы проверять не побились ли они + using SizeFile = Poco::AutoPtr; + SizeFile size_file; + + Logger * log; + StorageTinyLog(const std::string & path_, const std::string & name_, NamesAndTypesListPtr columns_, bool attach, size_t max_compress_block_size_); void addFile(const String & column_name, const IDataType & type, size_t level = 0); + + void updateSize(const std::string & column_name); }; } diff --git a/dbms/src/Storages/StorageTinyLog.cpp b/dbms/src/Storages/StorageTinyLog.cpp index 6cb7fface3f..dbae12b0391 100644 --- a/dbms/src/Storages/StorageTinyLog.cpp +++ b/dbms/src/Storages/StorageTinyLog.cpp @@ -264,7 +264,10 @@ void TinyLogBlockOutputStream::writeSuffix() { /// Заканчиваем запись. 
for (FileStreams::iterator it = streams.begin(); it != streams.end(); ++it) + { it->second->finalize(); + storage.updateSize(it->first); + } streams.clear(); } @@ -286,7 +289,9 @@ void TinyLogBlockOutputStream::write(const Block & block) StorageTinyLog::StorageTinyLog(const std::string & path_, const std::string & name_, NamesAndTypesListPtr columns_, bool attach, size_t max_compress_block_size_) - : path(path_), name(name_), columns(columns_), max_compress_block_size(max_compress_block_size_) + : path(path_), name(name_), columns(columns_), + max_compress_block_size(max_compress_block_size_), size_file(new Poco::Util::XMLConfiguration(path + "sizes.txt")), + log(&Logger::get("StorageTinyLog")) { if (columns->empty()) throw Exception("Empty list of columns passed to StorageTinyLog constructor", ErrorCodes::EMPTY_LIST_OF_COLUMNS_PASSED); @@ -394,4 +399,28 @@ void StorageTinyLog::drop() it->second.data_file.remove(); } +bool StorageTinyLog::checkData() const +{ + bool size_is_wrong = false; + for (auto & pair : files) + { + auto & file = pair.second.data_file; + size_t expected_size = std::stoull(size_file->getString(pair.first + ".size")); + size_t real_size = file.getSize(); + if (real_size != expected_size) + { + LOG_ERROR(log, "Size of " << file.path() << "is wrong. 
Size is " << real_size << " but should be " << expected_size); + size_is_wrong = true; + } + } + return size_is_wrong; +} + +void StorageTinyLog::updateSize(const std::string & column_name) +{ + auto & file = files[column_name].data_file; + size_file->setString(column_name + ".size", std::to_string(file.getSize())); +} + + } From e1c016cbac87e4281d3bde2f198c9c4aa325810e Mon Sep 17 00:00:00 2001 From: Pavel Kartavyy Date: Thu, 31 Jul 2014 23:19:56 +0400 Subject: [PATCH 019/127] dbms.TinyLog: call updateSize after writing on disk [#METR-10969] --- dbms/include/DB/Storages/StorageTinyLog.h | 11 ++++- dbms/src/Storages/StorageTinyLog.cpp | 57 ++++++++++++++++------- 2 files changed, 50 insertions(+), 18 deletions(-) diff --git a/dbms/include/DB/Storages/StorageTinyLog.h b/dbms/include/DB/Storages/StorageTinyLog.h index 20332d3fa45..9be09115b71 100644 --- a/dbms/include/DB/Storages/StorageTinyLog.h +++ b/dbms/include/DB/Storages/StorageTinyLog.h @@ -61,6 +61,9 @@ class TinyLogBlockOutputStream : public IBlockOutputStream { public: TinyLogBlockOutputStream(StorageTinyLog & storage_); + + ~TinyLogBlockOutputStream(); + void write(const Block & block); void writeSuffix(); private: @@ -91,6 +94,8 @@ private: void addStream(const String & name, const IDataType & type, size_t level = 0); void writeData(const String & name, const IDataType & type, const IColumn & column, OffsetColumns & offset_columns, size_t level = 0); + + void updateFileSizes(const FileStreams::const_iterator & begin, const FileStreams::const_iterator & end); }; @@ -131,6 +136,7 @@ public: void rename(const String & new_path_to_db, const String & new_database_name, const String & new_table_name); bool checkData() const override; + private: String path; String name; @@ -146,17 +152,18 @@ private: typedef std::map Files_t; Files_t files; + std::string size_file_path; + /// хранит размеры всех столбцов, чтобы проверять не побились ли они using SizeFile = Poco::AutoPtr; SizeFile size_file; + SizeFile & 
sizeFile() { return size_file; } Logger * log; StorageTinyLog(const std::string & path_, const std::string & name_, NamesAndTypesListPtr columns_, bool attach, size_t max_compress_block_size_); void addFile(const String & column_name, const IDataType & type, size_t level = 0); - - void updateSize(const std::string & column_name); }; } diff --git a/dbms/src/Storages/StorageTinyLog.cpp b/dbms/src/Storages/StorageTinyLog.cpp index dbae12b0391..367fd2b1ae0 100644 --- a/dbms/src/Storages/StorageTinyLog.cpp +++ b/dbms/src/Storages/StorageTinyLog.cpp @@ -264,10 +264,9 @@ void TinyLogBlockOutputStream::writeSuffix() { /// Заканчиваем запись. for (FileStreams::iterator it = streams.begin(); it != streams.end(); ++it) - { it->second->finalize(); - storage.updateSize(it->first); - } + + updateFileSizes(streams.begin(), streams.end()); streams.clear(); } @@ -290,22 +289,33 @@ void TinyLogBlockOutputStream::write(const Block & block) StorageTinyLog::StorageTinyLog(const std::string & path_, const std::string & name_, NamesAndTypesListPtr columns_, bool attach, size_t max_compress_block_size_) : path(path_), name(name_), columns(columns_), - max_compress_block_size(max_compress_block_size_), size_file(new Poco::Util::XMLConfiguration(path + "sizes.txt")), + max_compress_block_size(max_compress_block_size_), size_file(new Poco::Util::XMLConfiguration()), log(&Logger::get("StorageTinyLog")) { if (columns->empty()) throw Exception("Empty list of columns passed to StorageTinyLog constructor", ErrorCodes::EMPTY_LIST_OF_COLUMNS_PASSED); + String full_path = path + escapeForFileName(name) + '/'; if (!attach) { /// создаём файлы, если их нет - String full_path = path + escapeForFileName(name) + '/'; if (0 != mkdir(full_path.c_str(), S_IRWXU | S_IRWXG | S_IRWXO) && errno != EEXIST) throwFromErrno("Cannot create directory " + full_path, ErrorCodes::CANNOT_CREATE_DIRECTORY); } for (NamesAndTypesList::const_iterator it = columns->begin(); it != columns->end(); ++it) addFile(it->name, 
*it->type); + + try + { + size_file_path = full_path + "sizes.txt"; + size_file->load(size_file_path); + } + catch (const Poco::FileNotFoundException & e) + { + /// нормальная ситуация, для старых таблиц файла не существует + size_file->loadEmpty("yandex"); + } } StoragePtr StorageTinyLog::create(const std::string & path_, const std::string & name_, NamesAndTypesListPtr columns_, bool attach, size_t max_compress_block_size_) @@ -365,6 +375,7 @@ void StorageTinyLog::rename(const String & new_path_to_db, const String & new_da path = new_path_to_db; name = new_table_name; + size_file_path = path + escapeForFileName(name) + "/" + "sizes.txt"; for (Files_t::iterator it = files.begin(); it != files.end(); ++it) it->second.data_file = Poco::File(path + escapeForFileName(name) + '/' + Poco::Path(it->second.data_file.path()).getFileName()); @@ -401,26 +412,40 @@ void StorageTinyLog::drop() bool StorageTinyLog::checkData() const { - bool size_is_wrong = false; + bool sizes_are_correct = true; for (auto & pair : files) { - auto & file = pair.second.data_file; - size_t expected_size = std::stoull(size_file->getString(pair.first + ".size")); - size_t real_size = file.getSize(); - if (real_size != expected_size) + if (size_file->has(pair.first)) { - LOG_ERROR(log, "Size of " << file.path() << "is wrong. Size is " << real_size << " but should be " << expected_size); - size_is_wrong = true; + auto & file = pair.second.data_file; + size_t expected_size = std::stoull(size_file->getString(pair.first + ".size")); + size_t real_size = file.getSize(); + if (real_size != expected_size) + { + LOG_ERROR(log, "Size of " << file.path() << "is wrong. 
Size is " << real_size << " but should be " << expected_size); + sizes_are_correct = false; + } } } - return size_is_wrong; + return sizes_are_correct; } -void StorageTinyLog::updateSize(const std::string & column_name) +void TinyLogBlockOutputStream::updateFileSizes(const FileStreams::const_iterator & begin, + const FileStreams::const_iterator & end) { - auto & file = files[column_name].data_file; - size_file->setString(column_name + ".size", std::to_string(file.getSize())); + auto & size_file = storage.sizeFile(); + for (auto it = begin; it != end; ++it) + { + auto & column_name = it->first; + auto & file = storage.files[column_name].data_file; + size_file->setString(column_name + ".size", std::to_string(file.getSize())); + } + size_file->save(storage.size_file_path); } +TinyLogBlockOutputStream::~TinyLogBlockOutputStream() +{ + writeSuffix(); +} } From cf1afb1bd137443274dff37c722f5a22cadac2da Mon Sep 17 00:00:00 2001 From: Pavel Kartavyy Date: Fri, 1 Aug 2014 17:19:27 +0400 Subject: [PATCH 020/127] dbms: FileChecker moved to separate class [#METR-11709] --- dbms/include/DB/Common/FileChecker.h | 71 +++++++++++++++++++ dbms/include/DB/Storages/StorageTinyLog.h | 26 +++---- dbms/src/Storages/StorageTinyLog.cpp | 52 ++++---------- .../queries/0_stateless/00063_check_query.sql | 5 ++ 4 files changed, 99 insertions(+), 55 deletions(-) create mode 100644 dbms/include/DB/Common/FileChecker.h create mode 100644 dbms/tests/queries/0_stateless/00063_check_query.sql diff --git a/dbms/include/DB/Common/FileChecker.h b/dbms/include/DB/Common/FileChecker.h new file mode 100644 index 00000000000..a3d9f9c675b --- /dev/null +++ b/dbms/include/DB/Common/FileChecker.h @@ -0,0 +1,71 @@ +#pragma once + +#include +#include +#include +#include + +/// хранит размеры всех столбцов, и может проверять не побились ли столбцы +template +class FileChecker +{ +public: + FileChecker(const std::string &file_info_path_, Storage & storage_) : + files_info_path(file_info_path_), files_info(new 
Poco::Util::XMLConfiguration), storage(storage_), log(&Logger::get("FileChecker")) + { + try + { + files_info->load(files_info_path); + } + catch (Poco::FileNotFoundException & e) + { + files_info->loadEmpty("yandex"); + } + } + + void setPath(const std::string & file_info_path_) + { + files_info_path = file_info_path_; + } + + template + void update(const Iterator & begin, const Iterator & end) + { + for (auto it = begin; it != end; ++it) + { + auto & column_name = *it; + auto & file = storage.getFiles()[column_name].data_file; + files_info->setString(column_name + ".size", std::to_string(file.getSize())); + } + files_info->save(files_info_path); + } + + bool check() const + { + bool sizes_are_correct = true; + for (auto & pair : storage.getFiles()) + { + if (files_info->has(pair.first)) + { + auto & file = pair.second.data_file; + size_t expected_size = std::stoull(files_info->getString(pair.first + ".size")); + size_t real_size = file.getSize(); + if (real_size != expected_size) + { + LOG_ERROR(log, "Size of " << file.path() << "is wrong. 
Size is " << real_size << " but should be " << expected_size); + sizes_are_correct = false; + } + } + } + return sizes_are_correct; + } + +private: + std::string files_info_path; + + using FileInfo = Poco::AutoPtr; + FileInfo files_info; + + Storage & storage; + Logger * log; +}; diff --git a/dbms/include/DB/Storages/StorageTinyLog.h b/dbms/include/DB/Storages/StorageTinyLog.h index 9be09115b71..d91cc88d9a3 100644 --- a/dbms/include/DB/Storages/StorageTinyLog.h +++ b/dbms/include/DB/Storages/StorageTinyLog.h @@ -12,6 +12,7 @@ #include #include #include +#include #include @@ -56,7 +57,6 @@ private: void readData(const String & name, const IDataType & type, IColumn & column, size_t limit, size_t level = 0, bool read_offsets = true); }; - class TinyLogBlockOutputStream : public IBlockOutputStream { public: @@ -94,8 +94,6 @@ private: void addStream(const String & name, const IDataType & type, size_t level = 0); void writeData(const String & name, const IDataType & type, const IColumn & column, OffsetColumns & offset_columns, size_t level = 0); - - void updateFileSizes(const FileStreams::const_iterator & begin, const FileStreams::const_iterator & end); }; @@ -137,6 +135,15 @@ public: bool checkData() const override; + /// Данные столбца + struct ColumnData + { + Poco::File data_file; + }; + typedef std::map Files_t; + + Files_t & getFiles(); + private: String path; String name; @@ -144,20 +151,9 @@ private: size_t max_compress_block_size; - /// Данные столбца - struct ColumnData - { - Poco::File data_file; - }; - typedef std::map Files_t; Files_t files; - std::string size_file_path; - - /// хранит размеры всех столбцов, чтобы проверять не побились ли они - using SizeFile = Poco::AutoPtr; - SizeFile size_file; - SizeFile & sizeFile() { return size_file; } + FileChecker file_checker; Logger * log; diff --git a/dbms/src/Storages/StorageTinyLog.cpp b/dbms/src/Storages/StorageTinyLog.cpp index 367fd2b1ae0..c1dcdfef474 100644 --- a/dbms/src/Storages/StorageTinyLog.cpp +++ 
b/dbms/src/Storages/StorageTinyLog.cpp @@ -266,7 +266,12 @@ void TinyLogBlockOutputStream::writeSuffix() for (FileStreams::iterator it = streams.begin(); it != streams.end(); ++it) it->second->finalize(); - updateFileSizes(streams.begin(), streams.end()); + /// @TODO лишнее копирование. Можно б было использовать boost::transform_iterator, если б он работал с C++11 lambda + std::vector column_names; + for (auto & pair : streams) + column_names.push_back(pair.first); + + storage.file_checker.update(column_names.begin(), column_names.end()); streams.clear(); } @@ -289,7 +294,8 @@ void TinyLogBlockOutputStream::write(const Block & block) StorageTinyLog::StorageTinyLog(const std::string & path_, const std::string & name_, NamesAndTypesListPtr columns_, bool attach, size_t max_compress_block_size_) : path(path_), name(name_), columns(columns_), - max_compress_block_size(max_compress_block_size_), size_file(new Poco::Util::XMLConfiguration()), + max_compress_block_size(max_compress_block_size_), + file_checker(path + escapeForFileName(name) + '/' + "sizes.txt", *this), log(&Logger::get("StorageTinyLog")) { if (columns->empty()) @@ -305,17 +311,6 @@ StorageTinyLog::StorageTinyLog(const std::string & path_, const std::string & na for (NamesAndTypesList::const_iterator it = columns->begin(); it != columns->end(); ++it) addFile(it->name, *it->type); - - try - { - size_file_path = full_path + "sizes.txt"; - size_file->load(size_file_path); - } - catch (const Poco::FileNotFoundException & e) - { - /// нормальная ситуация, для старых таблиц файла не существует - size_file->loadEmpty("yandex"); - } } StoragePtr StorageTinyLog::create(const std::string & path_, const std::string & name_, NamesAndTypesListPtr columns_, bool attach, size_t max_compress_block_size_) @@ -375,7 +370,7 @@ void StorageTinyLog::rename(const String & new_path_to_db, const String & new_da path = new_path_to_db; name = new_table_name; - size_file_path = path + escapeForFileName(name) + "/" + "sizes.txt"; + 
file_checker.setPath(path + escapeForFileName(name) + "/" + "sizes.txt"); for (Files_t::iterator it = files.begin(); it != files.end(); ++it) it->second.data_file = Poco::File(path + escapeForFileName(name) + '/' + Poco::Path(it->second.data_file.path()).getFileName()); @@ -412,35 +407,12 @@ void StorageTinyLog::drop() bool StorageTinyLog::checkData() const { - bool sizes_are_correct = true; - for (auto & pair : files) - { - if (size_file->has(pair.first)) - { - auto & file = pair.second.data_file; - size_t expected_size = std::stoull(size_file->getString(pair.first + ".size")); - size_t real_size = file.getSize(); - if (real_size != expected_size) - { - LOG_ERROR(log, "Size of " << file.path() << "is wrong. Size is " << real_size << " but should be " << expected_size); - sizes_are_correct = false; - } - } - } - return sizes_are_correct; + return file_checker.check(); } -void TinyLogBlockOutputStream::updateFileSizes(const FileStreams::const_iterator & begin, - const FileStreams::const_iterator & end) +StorageTinyLog::Files_t & StorageTinyLog::getFiles() { - auto & size_file = storage.sizeFile(); - for (auto it = begin; it != end; ++it) - { - auto & column_name = it->first; - auto & file = storage.files[column_name].data_file; - size_file->setString(column_name + ".size", std::to_string(file.getSize())); - } - size_file->save(storage.size_file_path); + return files; } TinyLogBlockOutputStream::~TinyLogBlockOutputStream() diff --git a/dbms/tests/queries/0_stateless/00063_check_query.sql b/dbms/tests/queries/0_stateless/00063_check_query.sql new file mode 100644 index 00000000000..d2fa620fe39 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00063_check_query.sql @@ -0,0 +1,5 @@ +CREATE TABLE check_query_tiny_log (UInt32 N, String S) Engine = TinyLog; + +INSERT INTO check_query_tiny_log VALUES (1, "A"), (2, "B"), (3, "C") + +DROP TABLE check_query_tiny_log; From b34c8e47abf796fb740f326ae8dc65841d7956a6 Mon Sep 17 00:00:00 2001 From: Pavel Kartavyy Date: Mon, 4 Aug 
2014 10:36:24 +0400 Subject: [PATCH 021/127] dbms.StorageLog: added support for checkData query [#METR-11709] --- dbms/include/DB/Common/FileChecker.h | 31 ++++++++++------- dbms/include/DB/Storages/StorageLog.h | 33 ++++++++++++------- dbms/src/Storages/StorageLog.cpp | 19 ++++++++++- dbms/src/Storages/StorageTinyLog.cpp | 13 +++++--- .../queries/0_stateless/00063_check_query.sql | 6 ++++ 5 files changed, 73 insertions(+), 29 deletions(-) diff --git a/dbms/include/DB/Common/FileChecker.h b/dbms/include/DB/Common/FileChecker.h index a3d9f9c675b..200104db8a4 100644 --- a/dbms/include/DB/Common/FileChecker.h +++ b/dbms/include/DB/Common/FileChecker.h @@ -1,9 +1,15 @@ #pragma once #include +#include #include #include #include +#include +#include + +namespace DB +{ /// хранит размеры всех столбцов, и может проверять не побились ли столбцы template @@ -28,27 +34,26 @@ public: files_info_path = file_info_path_; } - template - void update(const Iterator & begin, const Iterator & end) + using Files = std::vector; + + void update(const Files::iterator & begin, const Files::iterator & end) { for (auto it = begin; it != end; ++it) - { - auto & column_name = *it; - auto & file = storage.getFiles()[column_name].data_file; - files_info->setString(column_name + ".size", std::to_string(file.getSize())); - } + files_info->setString(escapeForFileName(Poco::Path(it->path()).getFileName()) + ".size", std::to_string(it->getSize())); + files_info->save(files_info_path); } - bool check() const + bool check(const Files::iterator & begin, const Files::iterator & end) const { bool sizes_are_correct = true; - for (auto & pair : storage.getFiles()) + for (auto it = begin; it != end; ++it) { - if (files_info->has(pair.first)) + auto & file = *it; + std::string filename = escapeForFileName(Poco::Path(it->path()).getFileName()); + if (files_info->has(filename)) { - auto & file = pair.second.data_file; - size_t expected_size = std::stoull(files_info->getString(pair.first + ".size")); + size_t 
expected_size = std::stoull(files_info->getString(filename + ".size")); size_t real_size = file.getSize(); if (real_size != expected_size) { @@ -69,3 +74,5 @@ private: Storage & storage; Logger * log; }; + +} diff --git a/dbms/include/DB/Storages/StorageLog.h b/dbms/include/DB/Storages/StorageLog.h index 25bc55155ac..d4fe5d74cd7 100644 --- a/dbms/include/DB/Storages/StorageLog.h +++ b/dbms/include/DB/Storages/StorageLog.h @@ -13,6 +13,7 @@ #include #include #include +#include namespace DB @@ -80,6 +81,8 @@ class LogBlockOutputStream : public IBlockOutputStream { public: LogBlockOutputStream(StorageLog & storage_); + ~LogBlockOutputStream() { writeSuffix(); } + void write(const Block & block); void writeSuffix(); private: @@ -156,6 +159,22 @@ public: void rename(const String & new_path_to_db, const String & new_database_name, const String & new_table_name); + /// Данные столбца + struct ColumnData + { + /// Задает номер столбца в файле с засечками. + /// Не обязательно совпадает с номером столбца среди столбцов таблицы: здесь нумеруются также столбцы с длинами массивов. + size_t column_index; + + Poco::File data_file; + Marks marks; + }; + typedef std::map Files_t; + + Files_t & getFiles() { return files; } + + bool checkData() const override; + protected: String path; String name; @@ -195,18 +214,8 @@ protected: unsigned threads = 1); private: - /// Данные столбца - struct ColumnData - { - /// Задает номер столбца в файле с засечками. - /// Не обязательно совпадает с номером столбца среди столбцов таблицы: здесь нумеруются также столбцы с длинами массивов. - size_t column_index; - - Poco::File data_file; - Marks marks; - }; - typedef std::map Files_t; Files_t files; /// name -> data + Names column_names; /// column_index -> name Poco::File marks_file; @@ -218,6 +227,8 @@ private: size_t max_compress_block_size; + FileChecker file_checker; + /** Для обычных столбцов, в засечках указано количество строчек в блоке. 
* Для столбцов-массивов и вложенных структур, есть более одной группы засечек, соответствующих разным файлам: * - для внутренностей (файла name.bin) - указано суммарное количество элементов массивов в блоке, diff --git a/dbms/src/Storages/StorageLog.cpp b/dbms/src/Storages/StorageLog.cpp index 79386b4b3e1..29f25a4a2cb 100644 --- a/dbms/src/Storages/StorageLog.cpp +++ b/dbms/src/Storages/StorageLog.cpp @@ -281,6 +281,13 @@ void LogBlockOutputStream::writeSuffix() for (FileStreams::iterator it = streams.begin(); it != streams.end(); ++it) it->second->finalize(); + std::vector column_files; + for (auto & pair : streams) + column_files.push_back(storage.files[pair.first].data_file); + column_files.push_back(storage.marks_file); + + storage.file_checker.update(column_files.begin(), column_files.end()); + streams.clear(); } @@ -403,7 +410,8 @@ void LogBlockOutputStream::writeMarks(MarksForColumns marks) StorageLog::StorageLog(const std::string & path_, const std::string & name_, NamesAndTypesListPtr columns_, size_t max_compress_block_size_) - : path(path_), name(name_), columns(columns_), loaded_marks(false), max_compress_block_size(max_compress_block_size_) + : path(path_), name(name_), columns(columns_), loaded_marks(false), max_compress_block_size(max_compress_block_size_), + file_checker(path + escapeForFileName(name) + '/' + "sizes.txt", *this) { if (columns->empty()) throw Exception("Empty list of columns passed to StorageLog constructor", ErrorCodes::EMPTY_LIST_OF_COLUMNS_PASSED); @@ -532,6 +540,7 @@ void StorageLog::rename(const String & new_path_to_db, const String & new_databa path = new_path_to_db; name = new_table_name; + file_checker.setPath(path + escapeForFileName(name) + '/' + "sizes.txt"); for (Files_t::iterator it = files.begin(); it != files.end(); ++it) { @@ -667,5 +676,13 @@ BlockOutputStreamPtr StorageLog::write( return new LogBlockOutputStream(*this); } +bool StorageLog::checkData() const +{ + std::vector column_files; + for (auto & pair : files) 
+ column_files.push_back(pair.second.data_file); + column_files.push_back(marks_file); + return file_checker.check(column_files.begin(), column_files.end()); +} } diff --git a/dbms/src/Storages/StorageTinyLog.cpp b/dbms/src/Storages/StorageTinyLog.cpp index c1dcdfef474..d20a46d9078 100644 --- a/dbms/src/Storages/StorageTinyLog.cpp +++ b/dbms/src/Storages/StorageTinyLog.cpp @@ -266,12 +266,11 @@ void TinyLogBlockOutputStream::writeSuffix() for (FileStreams::iterator it = streams.begin(); it != streams.end(); ++it) it->second->finalize(); - /// @TODO лишнее копирование. Можно б было использовать boost::transform_iterator, если б он работал с C++11 lambda - std::vector column_names; + std::vector column_files; for (auto & pair : streams) - column_names.push_back(pair.first); + column_files.push_back(storage.files[pair.first].data_file); - storage.file_checker.update(column_names.begin(), column_names.end()); + storage.file_checker.update(column_files.begin(), column_files.end()); streams.clear(); } @@ -407,7 +406,11 @@ void StorageTinyLog::drop() bool StorageTinyLog::checkData() const { - return file_checker.check(); + std::vector column_files; + for (auto & pair : files) + column_files.push_back(pair.second.data_file); + + return file_checker.check(column_files.begin(), column_files.end()); } StorageTinyLog::Files_t & StorageTinyLog::getFiles() diff --git a/dbms/tests/queries/0_stateless/00063_check_query.sql b/dbms/tests/queries/0_stateless/00063_check_query.sql index d2fa620fe39..13d5519bf3f 100644 --- a/dbms/tests/queries/0_stateless/00063_check_query.sql +++ b/dbms/tests/queries/0_stateless/00063_check_query.sql @@ -3,3 +3,9 @@ CREATE TABLE check_query_tiny_log (UInt32 N, String S) Engine = TinyLog; INSERT INTO check_query_tiny_log VALUES (1, "A"), (2, "B"), (3, "C") DROP TABLE check_query_tiny_log; + +CREATE TABLE check_query_log (UInt32 N, String S) Engine = Log; + +INSERT INTO check_query_log VALUES (1, "A"), (2, "B"), (3, "C") + +DROP TABLE check_query_log; 
From 08b4b7ff02e70c699827841215dcfc7fc0d9e411 Mon Sep 17 00:00:00 2001 From: Pavel Kartavyy Date: Mon, 4 Aug 2014 12:37:46 +0400 Subject: [PATCH 022/127] FileChecker: changed xml parser from Poco to boost [#METR-11709] Poco::XMLConfiguration couldn't parse nodes containing '%2E' sequences. --- dbms/include/DB/Common/FileChecker.h | 30 +++++++++++++--------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/dbms/include/DB/Common/FileChecker.h b/dbms/include/DB/Common/FileChecker.h index 200104db8a4..a0cbcf8d752 100644 --- a/dbms/include/DB/Common/FileChecker.h +++ b/dbms/include/DB/Common/FileChecker.h @@ -8,6 +8,9 @@ #include #include +#include +#include + namespace DB { @@ -17,16 +20,10 @@ class FileChecker { public: FileChecker(const std::string &file_info_path_, Storage & storage_) : - files_info_path(file_info_path_), files_info(new Poco::Util::XMLConfiguration), storage(storage_), log(&Logger::get("FileChecker")) + files_info_path(file_info_path_), files_info(), storage(storage_), log(&Logger::get("FileChecker")) { - try - { - files_info->load(files_info_path); - } - catch (Poco::FileNotFoundException & e) - { - files_info->loadEmpty("yandex"); - } + if (Poco::File(files_info_path).exists()) + boost::property_tree::read_xml(files_info_path, files_info); } void setPath(const std::string & file_info_path_) @@ -39,9 +36,10 @@ public: void update(const Files::iterator & begin, const Files::iterator & end) { for (auto it = begin; it != end; ++it) - files_info->setString(escapeForFileName(Poco::Path(it->path()).getFileName()) + ".size", std::to_string(it->getSize())); + files_info.put(std::string("yandex.") + escapeForFileName(Poco::Path(it->path()).getFileName()) + ".size", std::to_string(it->getSize())); - files_info->save(files_info_path); + boost::property_tree::write_xml(files_info_path, files_info, std::locale(), + boost::property_tree::xml_parser::xml_writer_settings('\t', 1)); } bool check(const Files::iterator & begin, const 
Files::iterator & end) const @@ -51,9 +49,10 @@ public: { auto & file = *it; std::string filename = escapeForFileName(Poco::Path(it->path()).getFileName()); - if (files_info->has(filename)) + auto file_size = files_info.get_optional(std::string("yandex.") + filename + ".size"); + if (file_size) { - size_t expected_size = std::stoull(files_info->getString(filename + ".size")); + size_t expected_size = std::stoull(*file_size); size_t real_size = file.getSize(); if (real_size != expected_size) { @@ -68,11 +67,10 @@ public: private: std::string files_info_path; - using FileInfo = Poco::AutoPtr; - FileInfo files_info; + using PropertyTree = boost::property_tree::ptree; + PropertyTree files_info; Storage & storage; Logger * log; }; - } From 2d9f08cb4163ddacd9c69eec971d897c1baa9cce Mon Sep 17 00:00:00 2001 From: Pavel Kartavyy Date: Mon, 4 Aug 2014 12:38:30 +0400 Subject: [PATCH 023/127] dbms: updated test for check query [#METR-11709] --- .../queries/0_stateless/00063_check_query.reference | 0 dbms/tests/queries/0_stateless/00063_check_query.sql | 12 ++++++------ 2 files changed, 6 insertions(+), 6 deletions(-) create mode 100644 dbms/tests/queries/0_stateless/00063_check_query.reference diff --git a/dbms/tests/queries/0_stateless/00063_check_query.reference b/dbms/tests/queries/0_stateless/00063_check_query.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/dbms/tests/queries/0_stateless/00063_check_query.sql b/dbms/tests/queries/0_stateless/00063_check_query.sql index 13d5519bf3f..5e1d01ef913 100644 --- a/dbms/tests/queries/0_stateless/00063_check_query.sql +++ b/dbms/tests/queries/0_stateless/00063_check_query.sql @@ -1,11 +1,11 @@ -CREATE TABLE check_query_tiny_log (UInt32 N, String S) Engine = TinyLog; +DROP TABLE IF EXISTS check_query_tiny_log; -INSERT INTO check_query_tiny_log VALUES (1, "A"), (2, "B"), (3, "C") +CREATE TABLE check_query_tiny_log (N UInt32, S String) Engine = TinyLog; -DROP TABLE check_query_tiny_log; +INSERT INTO 
check_query_tiny_log VALUES (1, 'A'), (2, 'B'), (3, 'C') -CREATE TABLE check_query_log (UInt32 N, String S) Engine = Log; +DROP TABLE IF EXISTS check_query_log; -INSERT INTO check_query_log VALUES (1, "A"), (2, "B"), (3, "C") +CREATE TABLE check_query_log (N UInt32,S String) Engine = Log; -DROP TABLE check_query_log; +INSERT INTO check_query_log VALUES (1, 'A'), (2, 'B'), (3, 'C') From a37e0d42671fe35cbdb7dc5fb59682d8d2674522 Mon Sep 17 00:00:00 2001 From: Pavel Kartavyy Date: Mon, 4 Aug 2014 15:16:20 +0400 Subject: [PATCH 024/127] dbms: FileChecker - refactoring [#METR-11709] --- dbms/include/DB/Common/FileChecker.h | 55 ++++++++++++++++-------- dbms/include/DB/Storages/StorageChunks.h | 2 + 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/dbms/include/DB/Common/FileChecker.h b/dbms/include/DB/Common/FileChecker.h index a0cbcf8d752..2eb88a27846 100644 --- a/dbms/include/DB/Common/FileChecker.h +++ b/dbms/include/DB/Common/FileChecker.h @@ -33,13 +33,17 @@ public: using Files = std::vector; + void update(const Poco::File & file) + { + updateTree(file); + saveTree(); + } + void update(const Files::iterator & begin, const Files::iterator & end) { for (auto it = begin; it != end; ++it) - files_info.put(std::string("yandex.") + escapeForFileName(Poco::Path(it->path()).getFileName()) + ".size", std::to_string(it->getSize())); - - boost::property_tree::write_xml(files_info_path, files_info, std::locale(), - boost::property_tree::xml_parser::xml_writer_settings('\t', 1)); + updateTree(*it); + saveTree(); } bool check(const Files::iterator & begin, const Files::iterator & end) const @@ -47,24 +51,41 @@ public: bool sizes_are_correct = true; for (auto it = begin; it != end; ++it) { - auto & file = *it; - std::string filename = escapeForFileName(Poco::Path(it->path()).getFileName()); - auto file_size = files_info.get_optional(std::string("yandex.") + filename + ".size"); - if (file_size) - { - size_t expected_size = std::stoull(*file_size); - size_t real_size 
= file.getSize(); - if (real_size != expected_size) - { - LOG_ERROR(log, "Size of " << file.path() << "is wrong. Size is " << real_size << " but should be " << expected_size); - sizes_are_correct = false; - } - } + sizes_are_correct &= check(*it); } return sizes_are_correct; } + bool check(const Poco::File & file) const + { + std::string filename = escapeForFileName(Poco::Path(file.path()).getFileName()); + auto file_size = files_info.get_optional(std::string("yandex.") + filename + ".size"); + bool correct = true; + if (file_size) + { + size_t expected_size = std::stoull(*file_size); + size_t real_size = file.getSize(); + if (real_size != expected_size) + { + LOG_ERROR(log, "Size of " << file.path() << "is wrong. Size is " << real_size << " but should be " << expected_size); + correct = false; + } + } + return correct; + } + private: + void updateTree(const Poco::File & file) + { + files_info.put(std::string("yandex.") + escapeForFileName(Poco::Path(file.path()).getFileName()) + ".size", std::to_string(file.getSize())); + } + + void saveTree() + { + boost::property_tree::write_xml(files_info_path, files_info, std::locale(), + boost::property_tree::xml_parser::xml_writer_settings('\t', 1)); + } + std::string files_info_path; using PropertyTree = boost::property_tree::ptree; diff --git a/dbms/include/DB/Storages/StorageChunks.h b/dbms/include/DB/Storages/StorageChunks.h index b3382fe9779..04a186be289 100644 --- a/dbms/include/DB/Storages/StorageChunks.h +++ b/dbms/include/DB/Storages/StorageChunks.h @@ -69,6 +69,8 @@ public: Block getBlockWithVirtualColumns() const; + bool checkData() const override; + protected: /// Виртуальная функция из StorageLog /// По номеру засечки получить имя таблицы, из которой идет чтение и номер последней засечки из этой таблицы. 
From 8e0a8a4d74d98ebc9079e671bf8982edf1c7c96d Mon Sep 17 00:00:00 2001 From: Pavel Kartavyy Date: Mon, 4 Aug 2014 15:17:05 +0400 Subject: [PATCH 025/127] dbms: StorageChunks - added support for check query [#METR-11709] --- dbms/include/DB/Storages/StorageLog.h | 2 ++ dbms/src/Storages/StorageChunks.cpp | 15 +++++++++++++++ dbms/src/Storages/StorageLog.cpp | 2 ++ 3 files changed, 19 insertions(+) diff --git a/dbms/include/DB/Storages/StorageLog.h b/dbms/include/DB/Storages/StorageLog.h index d4fe5d74cd7..2cae126ca02 100644 --- a/dbms/include/DB/Storages/StorageLog.h +++ b/dbms/include/DB/Storages/StorageLog.h @@ -227,8 +227,10 @@ private: size_t max_compress_block_size; +protected: FileChecker file_checker; +private: /** Для обычных столбцов, в засечках указано количество строчек в блоке. * Для столбцов-массивов и вложенных структур, есть более одной группы засечек, соответствующих разным файлам: * - для внутренностей (файла name.bin) - указано суммарное количество элементов массивов в блоке, diff --git a/dbms/src/Storages/StorageChunks.cpp b/dbms/src/Storages/StorageChunks.cpp index 5da1dcb9e72..7f8c1b668cb 100644 --- a/dbms/src/Storages/StorageChunks.cpp +++ b/dbms/src/Storages/StorageChunks.cpp @@ -218,6 +218,7 @@ void StorageChunks::appendChunkToIndex(const std::string & chunk_name, size_t ma writeStringBinary(chunk_name, index); writeIntBinary(mark, index); index.next(); + file_checker.update(Poco::File(index_path)); } void StorageChunks::dropThis() @@ -235,4 +236,18 @@ void StorageChunks::dropThis() interpreter.execute(); } +bool StorageChunks::checkData() const +{ + /// Не будем проверять refcount.txt + /// Так как он не влияет на валидность данных + + bool index_file_is_ok; + { + Poco::ScopedReadRWLock lock(const_cast(rwlock)); + String index_path = path + escapeForFileName(name) + "/chunks.chn"; + index_file_is_ok = file_checker.check(Poco::File(index_path)); + } + return DB::StorageLog::checkData() && index_file_is_ok; +} + } diff --git 
a/dbms/src/Storages/StorageLog.cpp b/dbms/src/Storages/StorageLog.cpp index 29f25a4a2cb..40418e95609 100644 --- a/dbms/src/Storages/StorageLog.cpp +++ b/dbms/src/Storages/StorageLog.cpp @@ -678,6 +678,8 @@ BlockOutputStreamPtr StorageLog::write( bool StorageLog::checkData() const { + Poco::ScopedReadRWLock lock(const_cast(rwlock)); + std::vector column_files; for (auto & pair : files) column_files.push_back(pair.second.data_file); From 79e755938ba4b52930b1cc34e3c059a163f09781 Mon Sep 17 00:00:00 2001 From: Pavel Kartavyy Date: Mon, 4 Aug 2014 15:21:13 +0400 Subject: [PATCH 026/127] dbms: added support of check data to ChunkRef [#METR-11709] --- dbms/include/DB/Storages/StorageChunkRef.h | 2 ++ dbms/src/Storages/StorageChunkRef.cpp | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/dbms/include/DB/Storages/StorageChunkRef.h b/dbms/include/DB/Storages/StorageChunkRef.h index d6e63744828..4b54ea61ecb 100644 --- a/dbms/include/DB/Storages/StorageChunkRef.h +++ b/dbms/include/DB/Storages/StorageChunkRef.h @@ -36,6 +36,8 @@ public: String source_database_name; String source_table_name; + + bool checkData() const override; private: String name; diff --git a/dbms/src/Storages/StorageChunkRef.cpp b/dbms/src/Storages/StorageChunkRef.cpp index 61ba562376c..f84ff574773 100644 --- a/dbms/src/Storages/StorageChunkRef.cpp +++ b/dbms/src/Storages/StorageChunkRef.cpp @@ -82,4 +82,10 @@ const StorageChunks & StorageChunkRef::getSource() const return *chunks; } +bool StorageChunkRef::checkData() const +{ + return getSource().checkData(); +} + + } From 7f7d512f03dab6370be9a2929a43737803803815 Mon Sep 17 00:00:00 2001 From: Pavel Kartavyy Date: Tue, 5 Aug 2014 14:52:06 +0400 Subject: [PATCH 027/127] dbms: added check query [#METR-11709] --- .../DB/Interpreters/InterpreterCheckQuery.h | 22 ++++++++ dbms/include/DB/Parsers/ASTCheckQuery.h | 25 +++++++++ dbms/include/DB/Parsers/ParserCheckQuery.h | 17 ++++++ dbms/include/DB/Parsers/formatAST.h | 2 + 
.../Interpreters/InterpreterCheckQuery.cpp | 34 ++++++++++++ dbms/src/Interpreters/InterpreterQuery.cpp | 13 +++++ dbms/src/Parsers/ParserCheckQuery.cpp | 52 +++++++++++++++++++ dbms/src/Parsers/ParserQuery.cpp | 5 +- dbms/src/Parsers/formatAST.cpp | 22 ++++++++ .../0_stateless/00063_check_query.reference | 2 + .../queries/0_stateless/00063_check_query.sql | 5 ++ 11 files changed, 198 insertions(+), 1 deletion(-) create mode 100644 dbms/include/DB/Interpreters/InterpreterCheckQuery.h create mode 100644 dbms/include/DB/Parsers/ASTCheckQuery.h create mode 100644 dbms/include/DB/Parsers/ParserCheckQuery.h create mode 100644 dbms/src/Interpreters/InterpreterCheckQuery.cpp create mode 100644 dbms/src/Parsers/ParserCheckQuery.cpp diff --git a/dbms/include/DB/Interpreters/InterpreterCheckQuery.h b/dbms/include/DB/Interpreters/InterpreterCheckQuery.h new file mode 100644 index 00000000000..48c6092352e --- /dev/null +++ b/dbms/include/DB/Interpreters/InterpreterCheckQuery.h @@ -0,0 +1,22 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class InterpreterCheckQuery +{ +public: + InterpreterCheckQuery(ASTPtr query_ptr_, Context & context_); + BlockInputStreamPtr execute(); + DB::Block getSampleBlock(); + +private: + ASTPtr query_ptr; + Context context; + DB::Block result; +}; + +} diff --git a/dbms/include/DB/Parsers/ASTCheckQuery.h b/dbms/include/DB/Parsers/ASTCheckQuery.h new file mode 100644 index 00000000000..252f077374d --- /dev/null +++ b/dbms/include/DB/Parsers/ASTCheckQuery.h @@ -0,0 +1,25 @@ +#pragma once + +#include + +namespace DB +{ + +struct ASTCheckQuery : public IAST +{ + ASTCheckQuery(StringRange range_ = StringRange()) : IAST(range_) {}; + ASTCheckQuery(const ASTCheckQuery & ast) = default; + + /** Получить текст, который идентифицирует этот элемент. 
*/ + String getID() const { return ("CheckQuery_" + database + "_" + table); }; + + ASTPtr clone() const + { + return new ASTCheckQuery(*this); + } + + std::string database; + std::string table; +}; + +} diff --git a/dbms/include/DB/Parsers/ParserCheckQuery.h b/dbms/include/DB/Parsers/ParserCheckQuery.h new file mode 100644 index 00000000000..42832ffc5ed --- /dev/null +++ b/dbms/include/DB/Parsers/ParserCheckQuery.h @@ -0,0 +1,17 @@ +#pragma once + +#include + +namespace DB +{ +/** Запрос вида + * CHECK [TABLE] [database.]table + */ +class ParserCheckQuery : public IParserBase +{ +protected: + const char * getName() const { return "ALTER query"; } + bool parseImpl(Pos & pos, Pos end, ASTPtr & node, Expected & expected); +}; + +} diff --git a/dbms/include/DB/Parsers/formatAST.h b/dbms/include/DB/Parsers/formatAST.h index 92efb23ffb7..a9be893e06c 100644 --- a/dbms/include/DB/Parsers/formatAST.h +++ b/dbms/include/DB/Parsers/formatAST.h @@ -24,6 +24,7 @@ #include #include #include +#include //#include @@ -57,6 +58,7 @@ void formatAST(const ASTOrderByElement & ast, std::ostream & s, size_t indent = void formatAST(const ASTSubquery & ast, std::ostream & s, size_t indent = 0, bool hilite = true, bool one_line = false, bool need_parens = false); void formatAST(const ASTAlterQuery & ast, std::ostream & s, size_t indent = 0, bool hilite = true, bool one_line = false, bool need_parens = false); void formatAST(const ASTJoin & ast, std::ostream & s, size_t indent = 0, bool hilite = true, bool one_line = false, bool need_parens = false); +void formatAST(const ASTCheckQuery & ast, std::ostream & s, size_t indent = 0, bool hilite = true, bool one_line = false, bool need_parens = false); //void formatAST(const ASTMultiQuery & ast, std::ostream & s, size_t indent = 0, bool hilite = true, bool one_line = false, bool need_parens = false); void formatAST(const ASTQueryWithTableAndOutput & ast, std::string name, std::ostream & s, diff --git 
a/dbms/src/Interpreters/InterpreterCheckQuery.cpp b/dbms/src/Interpreters/InterpreterCheckQuery.cpp new file mode 100644 index 00000000000..8bbe323c835 --- /dev/null +++ b/dbms/src/Interpreters/InterpreterCheckQuery.cpp @@ -0,0 +1,34 @@ +#include +#include +#include +#include +#include + +using namespace DB; + +InterpreterCheckQuery::InterpreterCheckQuery(DB::ASTPtr query_ptr_, DB::Context& context_) : query_ptr(query_ptr_), context(context_) +{ +} + +BlockInputStreamPtr InterpreterCheckQuery::execute() +{ + /// @TODO + ASTCheckQuery & alter = typeid_cast(*query_ptr); + String & table_name = alter.table; + String database_name = alter.database.empty() ? context.getCurrentDatabase() : alter.database; + + StoragePtr table = context.getTable(database_name, table_name); + + result = getSampleBlock(); + result.getByPosition(0).column->insert(Field(UInt64(table->checkData()))); + + return BlockInputStreamPtr(new OneBlockInputStream(result)); +} + +Block InterpreterCheckQuery::getSampleBlock() +{ + DB::Block b; + ColumnPtr column(new ColumnUInt8); + b.insert(ColumnWithNameAndType(column, new DataTypeUInt8, "result")); + return b; +} diff --git a/dbms/src/Interpreters/InterpreterQuery.cpp b/dbms/src/Interpreters/InterpreterQuery.cpp index 8e746e5de99..b952bf7e7e3 100644 --- a/dbms/src/Interpreters/InterpreterQuery.cpp +++ b/dbms/src/Interpreters/InterpreterQuery.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -25,6 +26,7 @@ #include #include #include +#include namespace DB @@ -116,6 +118,11 @@ void InterpreterQuery::execute(WriteBuffer & ostr, ReadBuffer * remaining_data_i InterpreterAlterQuery interpreter(query_ptr, context); interpreter.execute(); } + else if (typeid_cast(&*query_ptr)) + { + InterpreterCheckQuery interpreter(query_ptr, context); + query_plan = interpreter.execute(); + } else throw Exception("Unknown type of query: " + query_ptr->getID(), ErrorCodes::UNKNOWN_TYPE_OF_QUERY); } @@ -203,6 +210,12 @@ BlockIO 
InterpreterQuery::execute() InterpreterAlterQuery interpreter(query_ptr, context); interpreter.execute(); } + else if (typeid_cast(&*query_ptr)) + { + InterpreterCheckQuery interpreter(query_ptr, context); + res.in = interpreter.execute(); + res.in_sample = interpreter.getSampleBlock(); + } else throw Exception("Unknown type of query: " + query_ptr->getID(), ErrorCodes::UNKNOWN_TYPE_OF_QUERY); diff --git a/dbms/src/Parsers/ParserCheckQuery.cpp b/dbms/src/Parsers/ParserCheckQuery.cpp new file mode 100644 index 00000000000..e5465c625cc --- /dev/null +++ b/dbms/src/Parsers/ParserCheckQuery.cpp @@ -0,0 +1,52 @@ +#include +#include +#include +#include +#include +#include + +using namespace DB; + +bool ParserCheckQuery::parseImpl(IParser::Pos& pos, IParser::Pos end, ASTPtr& node, Expected& expected) +{ + ParserWhiteSpaceOrComments ws; + ParserString s_check("CHECK", true, true); + ParserString s_table("TABLE", true, true); + ParserString s_dot("."); + + ParserIdentifier table_parser; + + ASTPtr table; + ASTPtr database; + + Poco::SharedPtr query = new ASTCheckQuery(StringRange(pos, end)); + + ws.ignore(pos, end); + + if (!s_check.ignore(pos, end, expected)) + return false; + + ws.ignore(pos, end); + s_table.ignore(pos, end, expected); + + ws.ignore(pos, end); + if (!table_parser.parse(pos, end, database, expected)) + return false; + + if (s_dot.ignore(pos, end)) + { + if (!table_parser.parse(pos, end, table, expected)) + return false; + + query->database = typeid_cast(*database).name; + query->table = typeid_cast(*table).name; + } + else + { + table = database; + query->table = typeid_cast(*table).name; + } + + node = query; + return true; +} diff --git a/dbms/src/Parsers/ParserQuery.cpp b/dbms/src/Parsers/ParserQuery.cpp index 31d26ac78ac..8a619efece9 100644 --- a/dbms/src/Parsers/ParserQuery.cpp +++ b/dbms/src/Parsers/ParserQuery.cpp @@ -12,6 +12,7 @@ #include #include #include +#include //#include @@ -33,6 +34,7 @@ bool ParserQuery::parseImpl(Pos & pos, Pos end, 
ASTPtr & node, Expected & expect ParserOptimizeQuery optimize_p; ParserTablePropertiesQuery table_p; ParserShowProcesslistQuery show_processlist_p; + ParserCheckQuery check_p; // ParserMultiQuery multi_p; bool res = show_tables_p.parse(pos, end, node, expected) @@ -47,10 +49,11 @@ bool ParserQuery::parseImpl(Pos & pos, Pos end, ASTPtr & node, Expected & expect || optimize_p.parse(pos, end, node, expected) || table_p.parse(pos, end, node, expected) || show_processlist_p.parse(pos, end, node, expected) + || check_p.parse(pos, end, node, expected); /* || multi_p.parse(pos, end, node, expected)*/; if (!res) - expected = "One of: SHOW TABLES, SHOW DATABASES, SHOW CREATE TABLE, SELECT, INSERT, CREATE, ATTACH, RENAME, DROP, DETACH, USE, SET, OPTIMIZE, EXISTS, DESCRIBE, DESC, ALTER, SHOW PROCESSLIST, opening curly brace"; + expected = "One of: SHOW TABLES, SHOW DATABASES, SHOW CREATE TABLE, SELECT, INSERT, CREATE, ATTACH, RENAME, DROP, DETACH, USE, SET, OPTIMIZE, EXISTS, DESCRIBE, DESC, ALTER, SHOW PROCESSLIST, CHECK, opening curly brace"; return res; } diff --git a/dbms/src/Parsers/formatAST.cpp b/dbms/src/Parsers/formatAST.cpp index 6b5bf8fba75..51edde02123 100644 --- a/dbms/src/Parsers/formatAST.cpp +++ b/dbms/src/Parsers/formatAST.cpp @@ -70,6 +70,7 @@ void formatAST(const IAST & ast, std::ostream & s, size_t indent, bool hilite, b DISPATCH(AlterQuery) DISPATCH(ShowProcesslistQuery) DISPATCH(Join) + DISPATCH(CheckQuery) // DISPATCH(MultiQuery) else throw Exception("Unknown element in AST: " + ast.getID() @@ -768,6 +769,27 @@ void formatAST(const ASTJoin & ast, std::ostream & s, size_t indent, bool hilite formatAST(*ast.using_expr_list, s, indent, hilite, one_line, need_parens); } +void formatAST(const ASTCheckQuery & ast, std::ostream & s, size_t indent, bool hilite, bool one_line, bool need_parens) +{ + std::string nl_or_nothing = one_line ? "" : "\n"; + + std::string indent_str = one_line ? "" : std::string(4 * indent, ' '); + std::string nl_or_ws = one_line ? 
" " : "\n"; + + s << (hilite ? hilite_keyword : "") << indent_str << "CHECK TABLE " << (hilite ? hilite_none : ""); + + if (!ast.table.empty()) + { + if (!ast.database.empty()) + { + s << (hilite ? hilite_keyword : "") << indent_str << ast.database << (hilite ? hilite_none : ""); + s << "."; + } + s << (hilite ? hilite_keyword : "") << indent_str << ast.table << (hilite ? hilite_none : ""); + } + s << nl_or_ws; +} + /* void formatAST(const ASTMultiQuery & ast, std::ostream & s, size_t indent, bool hilite, bool one_line, bool need_parens) { diff --git a/dbms/tests/queries/0_stateless/00063_check_query.reference b/dbms/tests/queries/0_stateless/00063_check_query.reference index e69de29bb2d..6ed281c757a 100644 --- a/dbms/tests/queries/0_stateless/00063_check_query.reference +++ b/dbms/tests/queries/0_stateless/00063_check_query.reference @@ -0,0 +1,2 @@ +1 +1 diff --git a/dbms/tests/queries/0_stateless/00063_check_query.sql b/dbms/tests/queries/0_stateless/00063_check_query.sql index 5e1d01ef913..9eb617475a4 100644 --- a/dbms/tests/queries/0_stateless/00063_check_query.sql +++ b/dbms/tests/queries/0_stateless/00063_check_query.sql @@ -4,8 +4,13 @@ CREATE TABLE check_query_tiny_log (N UInt32, S String) Engine = TinyLog; INSERT INTO check_query_tiny_log VALUES (1, 'A'), (2, 'B'), (3, 'C') +CHECK TABLE check_query_tiny_log; + + DROP TABLE IF EXISTS check_query_log; CREATE TABLE check_query_log (N UInt32,S String) Engine = Log; INSERT INTO check_query_log VALUES (1, 'A'), (2, 'B'), (3, 'C') + +CHECK TABLE check_query_log; From bdea8e78e70391bb52ffe948f704d568054f0bb9 Mon Sep 17 00:00:00 2001 From: Pavel Kartavyy Date: Tue, 5 Aug 2014 16:50:20 +0400 Subject: [PATCH 028/127] dbms: FileChecker simplified [#METR-11709] --- dbms/include/DB/Common/FileChecker.h | 31 ++++++++++++------------ dbms/include/DB/Storages/StorageChunks.h | 2 -- dbms/src/Storages/StorageChunks.cpp | 14 ----------- dbms/src/Storages/StorageLog.cpp | 6 +---- dbms/src/Storages/StorageTinyLog.cpp | 6 +---- 
5 files changed, 17 insertions(+), 42 deletions(-) diff --git a/dbms/include/DB/Common/FileChecker.h b/dbms/include/DB/Common/FileChecker.h index 2eb88a27846..fdae6f82e5f 100644 --- a/dbms/include/DB/Common/FileChecker.h +++ b/dbms/include/DB/Common/FileChecker.h @@ -46,28 +46,27 @@ public: saveTree(); } - bool check(const Files::iterator & begin, const Files::iterator & end) const + /// Проверяем файлы, параметры которых указаны в sizes.txt + bool check() const { - bool sizes_are_correct = true; - for (auto it = begin; it != end; ++it) - { - sizes_are_correct &= check(*it); - } - return sizes_are_correct; - } - - bool check(const Poco::File & file) const - { - std::string filename = escapeForFileName(Poco::Path(file.path()).getFileName()); - auto file_size = files_info.get_optional(std::string("yandex.") + filename + ".size"); bool correct = true; - if (file_size) + for (auto & node : files_info.get_child("yandex")) { - size_t expected_size = std::stoull(*file_size); + std::string filename = unescapeForFileName(node.first); + size_t expected_size = std::stoull(node.second.get("size")); + + Poco::File file(Poco::Path(files_info_path).parent().toString() + "/" + filename); + if (!file.exists()) + { + LOG_ERROR(log, "File " << file.path() << " doesn't exists"); + correct = false; + continue; + } + size_t real_size = file.getSize(); if (real_size != expected_size) { - LOG_ERROR(log, "Size of " << file.path() << "is wrong. Size is " << real_size << " but should be " << expected_size); + LOG_ERROR(log, "Size of " << file.path() << " is wrong. 
Size is " << real_size << " but should be " << expected_size); correct = false; } } diff --git a/dbms/include/DB/Storages/StorageChunks.h b/dbms/include/DB/Storages/StorageChunks.h index 04a186be289..b3382fe9779 100644 --- a/dbms/include/DB/Storages/StorageChunks.h +++ b/dbms/include/DB/Storages/StorageChunks.h @@ -69,8 +69,6 @@ public: Block getBlockWithVirtualColumns() const; - bool checkData() const override; - protected: /// Виртуальная функция из StorageLog /// По номеру засечки получить имя таблицы, из которой идет чтение и номер последней засечки из этой таблицы. diff --git a/dbms/src/Storages/StorageChunks.cpp b/dbms/src/Storages/StorageChunks.cpp index 7f8c1b668cb..f8bc1b8ad1e 100644 --- a/dbms/src/Storages/StorageChunks.cpp +++ b/dbms/src/Storages/StorageChunks.cpp @@ -236,18 +236,4 @@ void StorageChunks::dropThis() interpreter.execute(); } -bool StorageChunks::checkData() const -{ - /// Не будем проверять refcount.txt - /// Так как он не влияет на валидность данных - - bool index_file_is_ok; - { - Poco::ScopedReadRWLock lock(const_cast(rwlock)); - String index_path = path + escapeForFileName(name) + "/chunks.chn"; - index_file_is_ok = file_checker.check(Poco::File(index_path)); - } - return DB::StorageLog::checkData() && index_file_is_ok; -} - } diff --git a/dbms/src/Storages/StorageLog.cpp b/dbms/src/Storages/StorageLog.cpp index 40418e95609..e0045e3b72a 100644 --- a/dbms/src/Storages/StorageLog.cpp +++ b/dbms/src/Storages/StorageLog.cpp @@ -680,11 +680,7 @@ bool StorageLog::checkData() const { Poco::ScopedReadRWLock lock(const_cast(rwlock)); - std::vector column_files; - for (auto & pair : files) - column_files.push_back(pair.second.data_file); - column_files.push_back(marks_file); - return file_checker.check(column_files.begin(), column_files.end()); + return file_checker.check(); } } diff --git a/dbms/src/Storages/StorageTinyLog.cpp b/dbms/src/Storages/StorageTinyLog.cpp index d20a46d9078..0ca5c4fbf3a 100644 --- a/dbms/src/Storages/StorageTinyLog.cpp 
+++ b/dbms/src/Storages/StorageTinyLog.cpp @@ -406,11 +406,7 @@ void StorageTinyLog::drop() bool StorageTinyLog::checkData() const { - std::vector column_files; - for (auto & pair : files) - column_files.push_back(pair.second.data_file); - - return file_checker.check(column_files.begin(), column_files.end()); + return file_checker.check(); } StorageTinyLog::Files_t & StorageTinyLog::getFiles() From c399e1d14a6f5744d9255b73d1c12245ae844d46 Mon Sep 17 00:00:00 2001 From: Pavel Kartavyy Date: Tue, 5 Aug 2014 16:54:10 +0400 Subject: [PATCH 029/127] fix build. Stupid kdevelop! --- dbms/src/Parsers/ParserCheckQuery.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/dbms/src/Parsers/ParserCheckQuery.cpp b/dbms/src/Parsers/ParserCheckQuery.cpp index e5465c625cc..f16165be5c8 100644 --- a/dbms/src/Parsers/ParserCheckQuery.cpp +++ b/dbms/src/Parsers/ParserCheckQuery.cpp @@ -3,7 +3,6 @@ #include #include #include -#include using namespace DB; From e4dbe3c2426bb9ab3c2618d6d6570a0e5acea960 Mon Sep 17 00:00:00 2001 From: Pavel Kartavyy Date: Tue, 5 Aug 2014 17:41:07 +0400 Subject: [PATCH 030/127] FileChecker: return true if no sizes.txt file exists [#METR-11709] --- dbms/include/DB/Common/FileChecker.h | 35 ++++++++++++++-------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/dbms/include/DB/Common/FileChecker.h b/dbms/include/DB/Common/FileChecker.h index fdae6f82e5f..194ac1effad 100644 --- a/dbms/include/DB/Common/FileChecker.h +++ b/dbms/include/DB/Common/FileChecker.h @@ -50,26 +50,27 @@ public: bool check() const { bool correct = true; - for (auto & node : files_info.get_child("yandex")) - { - std::string filename = unescapeForFileName(node.first); - size_t expected_size = std::stoull(node.second.get("size")); - - Poco::File file(Poco::Path(files_info_path).parent().toString() + "/" + filename); - if (!file.exists()) + if (!files_info.empty()) + for (auto & node : files_info.get_child("yandex")) { - LOG_ERROR(log, "File " << file.path() << " doesn't 
exists"); - correct = false; - continue; - } + std::string filename = unescapeForFileName(node.first); + size_t expected_size = std::stoull(node.second.get("size")); - size_t real_size = file.getSize(); - if (real_size != expected_size) - { - LOG_ERROR(log, "Size of " << file.path() << " is wrong. Size is " << real_size << " but should be " << expected_size); - correct = false; + Poco::File file(Poco::Path(files_info_path).parent().toString() + "/" + filename); + if (!file.exists()) + { + LOG_ERROR(log, "File " << file.path() << " doesn't exists"); + correct = false; + continue; + } + + size_t real_size = file.getSize(); + if (real_size != expected_size) + { + LOG_ERROR(log, "Size of " << file.path() << " is wrong. Size is " << real_size << " but should be " << expected_size); + correct = false; + } } - } return correct; } From 917f312a95f269e68c52cac6f572dcb7828f4452 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Tue, 5 Aug 2014 14:56:58 +0400 Subject: [PATCH 031/127] dbms: materialized views pass virtual columns through. [#METR-12150] --- .../include/DB/Storages/StorageMaterializedView.h | 3 +++ dbms/src/Storages/StorageMaterializedView.cpp | 15 +++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/dbms/include/DB/Storages/StorageMaterializedView.h b/dbms/include/DB/Storages/StorageMaterializedView.h index bbe2807270f..4cea9d804e0 100644 --- a/dbms/include/DB/Storages/StorageMaterializedView.h +++ b/dbms/include/DB/Storages/StorageMaterializedView.h @@ -15,6 +15,9 @@ public: std::string getName() const { return "MaterializedView"; } std::string getInnerTableName() const { return ".inner." 
+ table_name; } + NameAndTypePair getColumn(const String &column_name) const; + bool hasColumn(const String &column_name) const; + BlockOutputStreamPtr write(ASTPtr query); void drop() override; bool optimize(); diff --git a/dbms/src/Storages/StorageMaterializedView.cpp b/dbms/src/Storages/StorageMaterializedView.cpp index e288db8d048..74666d65e23 100644 --- a/dbms/src/Storages/StorageMaterializedView.cpp +++ b/dbms/src/Storages/StorageMaterializedView.cpp @@ -5,6 +5,7 @@ #include #include +#include namespace DB @@ -61,6 +62,20 @@ StorageMaterializedView::StorageMaterializedView(const String & table_name_, con } } +NameAndTypePair StorageMaterializedView::getColumn(const String & column_name) const +{ + auto type = VirtualColumnFactory::tryGetType(column_name); + if (type) + return NameAndTypePair(column_name, type); + + return getRealColumn(column_name); +} + +bool StorageMaterializedView::hasColumn(const String & column_name) const +{ + return VirtualColumnFactory::hasColumn(column_name) || hasRealColumn(column_name); +} + BlockInputStreams StorageMaterializedView::read( const Names & column_names, ASTPtr query, From 306f4059179ee430da67ea2ab63dff1af7ec5ff5 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Wed, 6 Aug 2014 13:25:38 +0400 Subject: [PATCH 032/127] Merge --- .../MergeTree/MergeTreePartChecker.cpp | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/dbms/src/Storages/MergeTree/MergeTreePartChecker.cpp b/dbms/src/Storages/MergeTree/MergeTreePartChecker.cpp index b7055da37f4..8dc761b26a4 100644 --- a/dbms/src/Storages/MergeTree/MergeTreePartChecker.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreePartChecker.cpp @@ -103,27 +103,37 @@ struct Stream readIntBinary(mrk_mark.offset_in_compressed_file, mrk_hashing_buf); readIntBinary(mrk_mark.offset_in_decompressed_block, mrk_hashing_buf); + bool has_alternative_mark = false; + MarkInCompressedFile alternative_data_mark; MarkInCompressedFile data_mark; + /// Если засечка 
должна быть ровно на границе блоков, нам подходит и засечка, указывающая на конец предыдущего блока, + /// и на начало следующего. if (uncompressed_hashing_buf.position() == uncompressed_hashing_buf.buffer().end()) { - /// Если засечка должна быть ровно на границе блоков, нам подходит и засечка, указывающая на конец предыдущего блока, - /// и на начало следующего. - data_mark.offset_in_compressed_file = compressed_hashing_buf.count() - uncompressing_buf.getSizeCompressed(); - data_mark.offset_in_decompressed_block = uncompressed_hashing_buf.offset(); + /// Получим засечку, указывающую на конец предыдущего блока. + has_alternative_mark = true; + alternative_data_mark.offset_in_compressed_file = compressed_hashing_buf.count() - uncompressing_buf.getSizeCompressed(); + alternative_data_mark.offset_in_decompressed_block = uncompressed_hashing_buf.offset(); - if (mrk_mark == data_mark) + if (mrk_mark == alternative_data_mark) return; uncompressed_hashing_buf.next(); + + /// В конце файла compressed_hashing_buf.count() указывает на конец файла даже до вызова next(), + /// и только что выполненная проверка работает неправильно. Для простоты не будем проверять последнюю засечку. + if (uncompressed_hashing_buf.eof()) + return; } data_mark.offset_in_compressed_file = compressed_hashing_buf.count() - uncompressing_buf.getSizeCompressed(); data_mark.offset_in_decompressed_block = uncompressed_hashing_buf.offset(); if (mrk_mark != data_mark) - throw Exception("Incorrect mark: " + data_mark.toString() + " in data, " + mrk_mark.toString() + " in .mrk file", - ErrorCodes::INCORRECT_MARK); + throw Exception("Incorrect mark: " + data_mark.toString() + + (has_alternative_mark ? 
" or " + alternative_data_mark.toString() : "") + " in data, " + + mrk_mark.toString() + " in .mrk file", ErrorCodes::INCORRECT_MARK); } void assertEnd(MergeTreeData::DataPart::Checksums & checksums) From dc16847ce112192db40619de66eb62b8c47d7851 Mon Sep 17 00:00:00 2001 From: Pavel Kartavyy Date: Wed, 6 Aug 2014 17:22:52 +0400 Subject: [PATCH 033/127] dbms: FileChecker migrated to json [#METR-11709] --- dbms/include/DB/Common/FileChecker.h | 60 ++++++++++++++-------------- dbms/src/Storages/StorageLog.cpp | 4 +- dbms/src/Storages/StorageTinyLog.cpp | 4 +- 3 files changed, 33 insertions(+), 35 deletions(-) diff --git a/dbms/include/DB/Common/FileChecker.h b/dbms/include/DB/Common/FileChecker.h index 194ac1effad..8d9f1b1a741 100644 --- a/dbms/include/DB/Common/FileChecker.h +++ b/dbms/include/DB/Common/FileChecker.h @@ -7,9 +7,8 @@ #include #include #include - -#include -#include +#include +#include namespace DB { @@ -20,10 +19,10 @@ class FileChecker { public: FileChecker(const std::string &file_info_path_, Storage & storage_) : - files_info_path(file_info_path_), files_info(), storage(storage_), log(&Logger::get("FileChecker")) + files_info_path(file_info_path_), storage(storage_), log(&Logger::get("FileChecker")) { - if (Poco::File(files_info_path).exists()) - boost::property_tree::read_xml(files_info_path, files_info); + std::ifstream istr(files_info_path); + files_info.parse(istr); } void setPath(const std::string & file_info_path_) @@ -46,50 +45,49 @@ public: saveTree(); } - /// Проверяем файлы, параметры которых указаны в sizes.txt + /// Проверяем файлы, параметры которых указаны в sizes.json bool check() const { bool correct = true; - if (!files_info.empty()) - for (auto & node : files_info.get_child("yandex")) + for (auto & node : files_info.kv_map()) + { + std::string filename = unescapeForFileName(node.first); + size_t expected_size = std::stoull(node.second->get().get("size")); + + Poco::File file(Poco::Path(files_info_path).parent().toString() + "/" + 
filename); + if (!file.exists()) { - std::string filename = unescapeForFileName(node.first); - size_t expected_size = std::stoull(node.second.get("size")); - - Poco::File file(Poco::Path(files_info_path).parent().toString() + "/" + filename); - if (!file.exists()) - { - LOG_ERROR(log, "File " << file.path() << " doesn't exists"); - correct = false; - continue; - } - - size_t real_size = file.getSize(); - if (real_size != expected_size) - { - LOG_ERROR(log, "Size of " << file.path() << " is wrong. Size is " << real_size << " but should be " << expected_size); - correct = false; - } + LOG_ERROR(log, "File " << file.path() << " doesn't exists"); + correct = false; + continue; } + + size_t real_size = file.getSize(); + if (real_size != expected_size) + { + LOG_ERROR(log, "Size of " << file.path() << " is wrong. Size is " << real_size << " but should be " << expected_size); + correct = false; + } + } return correct; } private: void updateTree(const Poco::File & file) { - files_info.put(std::string("yandex.") + escapeForFileName(Poco::Path(file.path()).getFileName()) + ".size", std::to_string(file.getSize())); + files_info.import(escapeForFileName(Poco::Path(file.path()).getFileName()), + jsonxx::Object("size", std::to_string(file.getSize()))); } void saveTree() { - boost::property_tree::write_xml(files_info_path, files_info, std::locale(), - boost::property_tree::xml_parser::xml_writer_settings('\t', 1)); + std::ofstream file(files_info_path, std::ofstream::trunc); + file << files_info.write(jsonxx::JSON); } std::string files_info_path; - using PropertyTree = boost::property_tree::ptree; - PropertyTree files_info; + jsonxx::Object files_info; Storage & storage; Logger * log; diff --git a/dbms/src/Storages/StorageLog.cpp b/dbms/src/Storages/StorageLog.cpp index e0045e3b72a..6a7cccc7de7 100644 --- a/dbms/src/Storages/StorageLog.cpp +++ b/dbms/src/Storages/StorageLog.cpp @@ -411,7 +411,7 @@ void LogBlockOutputStream::writeMarks(MarksForColumns marks) 
StorageLog::StorageLog(const std::string & path_, const std::string & name_, NamesAndTypesListPtr columns_, size_t max_compress_block_size_) : path(path_), name(name_), columns(columns_), loaded_marks(false), max_compress_block_size(max_compress_block_size_), - file_checker(path + escapeForFileName(name) + '/' + "sizes.txt", *this) + file_checker(path + escapeForFileName(name) + '/' + "sizes.json", *this) { if (columns->empty()) throw Exception("Empty list of columns passed to StorageLog constructor", ErrorCodes::EMPTY_LIST_OF_COLUMNS_PASSED); @@ -540,7 +540,7 @@ void StorageLog::rename(const String & new_path_to_db, const String & new_databa path = new_path_to_db; name = new_table_name; - file_checker.setPath(path + escapeForFileName(name) + '/' + "sizes.txt"); + file_checker.setPath(path + escapeForFileName(name) + '/' + "sizes.json"); for (Files_t::iterator it = files.begin(); it != files.end(); ++it) { diff --git a/dbms/src/Storages/StorageTinyLog.cpp b/dbms/src/Storages/StorageTinyLog.cpp index 0ca5c4fbf3a..d48cbea70e3 100644 --- a/dbms/src/Storages/StorageTinyLog.cpp +++ b/dbms/src/Storages/StorageTinyLog.cpp @@ -294,7 +294,7 @@ void TinyLogBlockOutputStream::write(const Block & block) StorageTinyLog::StorageTinyLog(const std::string & path_, const std::string & name_, NamesAndTypesListPtr columns_, bool attach, size_t max_compress_block_size_) : path(path_), name(name_), columns(columns_), max_compress_block_size(max_compress_block_size_), - file_checker(path + escapeForFileName(name) + '/' + "sizes.txt", *this), + file_checker(path + escapeForFileName(name) + '/' + "sizes.json", *this), log(&Logger::get("StorageTinyLog")) { if (columns->empty()) @@ -369,7 +369,7 @@ void StorageTinyLog::rename(const String & new_path_to_db, const String & new_da path = new_path_to_db; name = new_table_name; - file_checker.setPath(path + escapeForFileName(name) + "/" + "sizes.txt"); + file_checker.setPath(path + escapeForFileName(name) + "/" + "sizes.json"); for 
(Files_t::iterator it = files.begin(); it != files.end(); ++it) it->second.data_file = Poco::File(path + escapeForFileName(name) + '/' + Poco::Path(it->second.data_file.path()).getFileName()); From 98297fa475ac77da40ac70d49dbdd2192733edb2 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Tue, 5 Aug 2014 17:49:44 +0400 Subject: [PATCH 034/127] Merge --- .../DB/Storages/StorageReplicatedMergeTree.h | 30 +-- .../Storages/StorageReplicatedMergeTree.cpp | 182 +++++++++++++----- 2 files changed, 150 insertions(+), 62 deletions(-) diff --git a/dbms/include/DB/Storages/StorageReplicatedMergeTree.h b/dbms/include/DB/Storages/StorageReplicatedMergeTree.h index 5f82e157a82..f86f6c91f4d 100644 --- a/dbms/include/DB/Storages/StorageReplicatedMergeTree.h +++ b/dbms/include/DB/Storages/StorageReplicatedMergeTree.h @@ -111,7 +111,7 @@ private: { try { - Poco::ScopedLock lock(storage.queue_mutex); + std::unique_lock lock(storage.queue_mutex); if (!storage.future_parts.erase(part)) throw Exception("Untagging already untagged future part " + part + ". This is a bug.", ErrorCodes::LOGICAL_ERROR); } @@ -126,25 +126,29 @@ private: struct LogEntry { + typedef Poco::SharedPtr Ptr; + enum Type { - GET_PART, - MERGE_PARTS, + GET_PART, /// Получить кусок с другой реплики. + MERGE_PARTS, /// Слить куски. + DROP_RANGE, /// Удалить куски в указанном месяце в указанном диапазоне номеров. }; String znode_name; Type type; String source_replica; /// Пустая строка значит, что эта запись была добавлена сразу в очередь, а не скопирована из лога. - String new_part_name; + String new_part_name; /// Для DROP_RANGE имя несуществующего куска. Нужно удалить все куски, покрытые им. Strings parts_to_merge; FuturePartTaggerPtr future_part_tagger; - bool currently_executing = false; + bool currently_executing = false; /// Доступ под queue_mutex. + std::condition_variable execution_complete; /// Пробуждается когда currently_executing становится false. 
void addResultToVirtualParts(StorageReplicatedMergeTree & storage) { - if (type == MERGE_PARTS || type == GET_PART) + if (type == MERGE_PARTS || type == GET_PART || type == DROP_RANGE) storage.virtual_parts.add(new_part_name); } @@ -167,17 +171,19 @@ private: return s; } - static LogEntry parse(const String & s) + static Ptr parse(const String & s) { ReadBufferFromString in(s); - LogEntry res; - res.readText(in); + Ptr res = new LogEntry; + res->readText(in); assertEOF(in); return res; } }; - typedef std::list LogEntries; + typedef LogEntry::Ptr LogEntryPtr; + + typedef std::list LogEntries; typedef std::set StringSet; typedef std::list StringList; @@ -195,7 +201,7 @@ private: * В ZK записи в хронологическом порядке. Здесь - не обязательно. */ LogEntries queue; - Poco::FastMutex queue_mutex; + std::mutex queue_mutex; /** Куски, которые появятся в результате действий, выполняемых прямо сейчас фоновыми потоками (этих действий нет в очереди). * Использовать под залоченным queue_mutex. @@ -381,6 +387,8 @@ private: */ bool executeLogEntry(const LogEntry & entry, BackgroundProcessingPool::Context & pool_context); + bool executeDropRange(const LogEntry & entry); + /** Обновляет очередь. 
*/ void queueUpdatingThread(); diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index 8ded9c49f96..d79e13b7004 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -673,23 +673,23 @@ void StorageReplicatedMergeTree::clearOldBlocks() void StorageReplicatedMergeTree::loadQueue() { - Poco::ScopedLock lock(queue_mutex); + std::unique_lock lock(queue_mutex); Strings children = zookeeper->getChildren(replica_path + "/queue"); std::sort(children.begin(), children.end()); for (const String & child : children) { String s = zookeeper->get(replica_path + "/queue/" + child); - LogEntry entry = LogEntry::parse(s); - entry.znode_name = child; - entry.addResultToVirtualParts(*this); + LogEntryPtr entry = LogEntry::parse(s); + entry->znode_name = child; + entry->addResultToVirtualParts(*this); queue.push_back(entry); } } void StorageReplicatedMergeTree::pullLogsToQueue(zkutil::EventPtr next_update_event) { - Poco::ScopedLock lock(queue_mutex); + std::unique_lock lock(queue_mutex); String index_str = zookeeper->get(replica_path + "/log_pointer"); UInt64 index; @@ -716,7 +716,7 @@ void StorageReplicatedMergeTree::pullLogsToQueue(zkutil::EventPtr next_update_ev ++count; ++index; - LogEntry entry = LogEntry::parse(entry_str); + LogEntryPtr entry = LogEntry::parse(entry_str); /// Одновременно добавим запись в очередь и продвинем указатель на лог. 
zkutil::Ops ops; @@ -727,8 +727,8 @@ void StorageReplicatedMergeTree::pullLogsToQueue(zkutil::EventPtr next_update_ev auto results = zookeeper->multi(ops); String path_created = dynamic_cast(ops[0]).getPathCreated(); - entry.znode_name = path_created.substr(path_created.find_last_of('/') + 1); - entry.addResultToVirtualParts(*this); + entry->znode_name = path_created.substr(path_created.find_last_of('/') + 1); + entry->addResultToVirtualParts(*this); queue.push_back(entry); } @@ -778,6 +778,9 @@ bool StorageReplicatedMergeTree::shouldExecuteLogEntry(const LogEntry & entry) bool StorageReplicatedMergeTree::executeLogEntry(const LogEntry & entry, BackgroundProcessingPool::Context & pool_context) { + if (entry.type == LogEntry::DROP_RANGE) + return executeDropRange(entry); + if (entry.type == LogEntry::GET_PART || entry.type == LogEntry::MERGE_PARTS) { @@ -893,19 +896,19 @@ bool StorageReplicatedMergeTree::executeLogEntry(const LogEntry & entry, Backgro */ try { - Poco::ScopedLock lock(queue_mutex); + std::unique_lock lock(queue_mutex); /// Найдем действие по объединению этого куска с другими. Запомним других. 
StringSet parts_for_merge; LogEntries::iterator merge_entry; for (LogEntries::iterator it = queue.begin(); it != queue.end(); ++it) { - if (it->type == LogEntry::MERGE_PARTS) + if ((*it)->type == LogEntry::MERGE_PARTS) { - if (std::find(it->parts_to_merge.begin(), it->parts_to_merge.end(), entry.new_part_name) - != it->parts_to_merge.end()) + if (std::find((*it)->parts_to_merge.begin(), (*it)->parts_to_merge.end(), entry.new_part_name) + != (*it)->parts_to_merge.end()) { - parts_for_merge = StringSet(it->parts_to_merge.begin(), it->parts_to_merge.end()); + parts_for_merge = StringSet((*it)->parts_to_merge.begin(), (*it)->parts_to_merge.end()); merge_entry = it; break; } @@ -923,8 +926,8 @@ bool StorageReplicatedMergeTree::executeLogEntry(const LogEntry & entry, Backgro if (it0 == merge_entry) break; - if ((it0->type == LogEntry::MERGE_PARTS || it0->type == LogEntry::GET_PART) - && parts_for_merge.count(it0->new_part_name)) + if (((*it0)->type == LogEntry::MERGE_PARTS || (*it0)->type == LogEntry::GET_PART) + && parts_for_merge.count((*it0)->new_part_name)) { queue.splice(queue.end(), queue, it0, it); } @@ -956,6 +959,71 @@ bool StorageReplicatedMergeTree::executeLogEntry(const LogEntry & entry, Backgro return true; } +bool StorageReplicatedMergeTree::executeDropRange(const StorageReplicatedMergeTree::LogEntry & entry) +{ + LOG_INFO(log, "Removing parts covered by " << entry.new_part_name << "."); + + { + LogEntries to_wait; + size_t removed_entries = 0; + + /// Удалим из очереди операции с кусками, содержащимися в удаляемом диапазоне. 
+ std::unique_lock lock(queue_mutex); + for (LogEntries::iterator it = queue.begin(); it != queue.end();) + { + if (((*it)->type == LogEntry::GET_PART || (*it)->type == LogEntry::MERGE_PARTS) && + ActiveDataPartSet::contains(entry.new_part_name, (*it)->new_part_name)) + { + if ((*it)->currently_executing) + to_wait.push_back(*it); + auto code = zookeeper->tryRemove(replica_path + "/queue/" + (*it)->znode_name); + if (code != ZOK) + LOG_INFO(log, "Couldn't remove " << replica_path + "/queue/" + (*it)->znode_name << ": " + << zkutil::ZooKeeper::error2string(code)); + queue.erase(it++); + ++removed_entries; + } + else + ++it; + } + + LOG_DEBUG(log, "Removed " << removed_entries << " entries from queue. " + "Waiting for " << to_wait.size() << " entries that are currently executing."); + + /// Дождемся завершения операций с кусками, содержащимися в удаляемом диапазоне. + { + std::unique_lock lock(queue_mutex); + for (LogEntryPtr & entry : to_wait) + entry->execution_complete.wait(lock, [&entry] { return !entry->currently_executing; }); + } + } + + LOG_DEBUG(log, "Removing parts."); + size_t removed_parts = 0; + + /// Удалим куски, содержащиеся в удаляемом диапазоне. 
+ auto parts = data.getDataParts(); + for (const auto & part : parts) + { + if (!ActiveDataPartSet::contains(entry.new_part_name, part->name)) + continue; + LOG_DEBUG(log, "Removing part " << part->name); + ++removed_parts; + + zkutil::Ops ops; + ops.push_back(new zkutil::Op::Remove(replica_path + "/parts/" + part->name + "/columns", -1)); + ops.push_back(new zkutil::Op::Remove(replica_path + "/parts/" + part->name + "/checksums", -1)); + ops.push_back(new zkutil::Op::Remove(replica_path + "/parts/" + part->name, -1)); + zookeeper->multi(ops); + + data.replaceParts({part}, {}, false); + } + + LOG_INFO(log, "Finished removing parts covered by " << entry.new_part_name << "."); + + return true; +} + void StorageReplicatedMergeTree::queueUpdatingThread() { while (!shutdown_called) @@ -979,24 +1047,22 @@ void StorageReplicatedMergeTree::queueUpdatingThread() bool StorageReplicatedMergeTree::queueTask(BackgroundProcessingPool::Context & pool_context) { - LogEntry entry; - bool have_work = false; + LogEntryPtr entry; try { - Poco::ScopedLock lock(queue_mutex); + std::unique_lock lock(queue_mutex); bool empty = queue.empty(); if (!empty) { for (LogEntries::iterator it = queue.begin(); it != queue.end(); ++it) { - if (!it->currently_executing && shouldExecuteLogEntry(*it)) + if (!(*it)->currently_executing && shouldExecuteLogEntry(**it)) { entry = *it; - entry.tagPartAsFuture(*this); + entry->tagPartAsFuture(*this); queue.splice(queue.end(), queue, it); - it->currently_executing = true; - have_work = true; + entry->currently_executing = true; break; } } @@ -1007,7 +1073,7 @@ bool StorageReplicatedMergeTree::queueTask(BackgroundProcessingPool::Context & p tryLogCurrentException(__PRETTY_FUNCTION__); } - if (!have_work) + if (!entry) return false; bool exception = true; @@ -1015,12 +1081,12 @@ bool StorageReplicatedMergeTree::queueTask(BackgroundProcessingPool::Context & p try { - if (executeLogEntry(entry, pool_context)) + if (executeLogEntry(*entry, pool_context)) { - auto 
code = zookeeper->tryRemove(replica_path + "/queue/" + entry.znode_name); + auto code = zookeeper->tryRemove(replica_path + "/queue/" + entry->znode_name); if (code != ZOK) - LOG_ERROR(log, "Couldn't remove " << replica_path + "/queue/" + entry.znode_name << ": " + LOG_ERROR(log, "Couldn't remove " << replica_path + "/queue/" + entry->znode_name << ": " << zkutil::ZooKeeper::error2string(code) + ". This shouldn't happen often."); success = true; @@ -1041,20 +1107,25 @@ bool StorageReplicatedMergeTree::queueTask(BackgroundProcessingPool::Context & p tryLogCurrentException(__PRETTY_FUNCTION__); } - /// Удалим задание из очереди или отметим, что мы его больше не выполняем. - /// Нельзя просто обратиться по заранее сохраненному итератору, потому что задание мог успеть удалить кто-то другой. - entry.future_part_tagger = nullptr; - Poco::ScopedLock lock(queue_mutex); - for (LogEntries::iterator it = queue.end(); it != queue.begin();) + entry->future_part_tagger = nullptr; + + std::unique_lock lock(queue_mutex); + + entry->currently_executing = false; + entry->execution_complete.notify_all(); + + if (success) { - --it; - if (it->znode_name == entry.znode_name) + /// Удалим задание из очереди. + /// Нельзя просто обратиться по заранее сохраненному итератору, потому что задание мог успеть удалить кто-то другой. 
+ for (LogEntries::iterator it = queue.end(); it != queue.begin();) { - if (success) + --it; + if (*it == entry) + { queue.erase(it); - else - it->currently_executing = false; - break; + break; + } } } @@ -1086,17 +1157,17 @@ void StorageReplicatedMergeTree::mergeSelectingThread() if (!has_big_merge) { - Poco::ScopedLock lock(queue_mutex); + std::unique_lock lock(queue_mutex); for (const auto & entry : queue) { - if (entry.type == LogEntry::MERGE_PARTS) + if (entry->type == LogEntry::MERGE_PARTS) { ++merges_queued; if (!has_big_merge) { - for (const String & name : entry.parts_to_merge) + for (const String & name : entry->parts_to_merge) { MergeTreeData::DataPartPtr part = data.getActiveContainingPart(name); if (!part || part->name != name) @@ -1338,14 +1409,14 @@ void StorageReplicatedMergeTree::removePartAndEnqueueFetch(const String & part_n { String part_path = replica_path + "/parts/" + part_name; - LogEntry log_entry; - log_entry.type = LogEntry::GET_PART; - log_entry.source_replica = ""; - log_entry.new_part_name = part_name; + LogEntryPtr log_entry = new LogEntry; + log_entry->type = LogEntry::GET_PART; + log_entry->source_replica = ""; + log_entry->new_part_name = part_name; zkutil::Ops ops; ops.push_back(new zkutil::Op::Create( - replica_path + "/queue/queue-", log_entry.toString(), zookeeper->getDefaultACL(), + replica_path + "/queue/queue-", log_entry->toString(), zookeeper->getDefaultACL(), zkutil::CreateMode::PersistentSequential)); ops.push_back(new zkutil::Op::Remove(part_path + "/checksums", -1)); ops.push_back(new zkutil::Op::Remove(part_path + "/columns", -1)); @@ -1353,11 +1424,11 @@ void StorageReplicatedMergeTree::removePartAndEnqueueFetch(const String & part_n auto results = zookeeper->multi(ops); { - Poco::ScopedLock lock(queue_mutex); + std::unique_lock lock(queue_mutex); String path_created = dynamic_cast(ops[0]).getPathCreated(); - log_entry.znode_name = path_created.substr(path_created.find_last_of('/') + 1); - 
log_entry.addResultToVirtualParts(*this); + log_entry->znode_name = path_created.substr(path_created.find_last_of('/') + 1); + log_entry->addResultToVirtualParts(*this); queue.push_back(log_entry); } } @@ -1462,13 +1533,13 @@ void StorageReplicatedMergeTree::partCheckThread() bool was_in_queue = false; { - Poco::ScopedLock lock(queue_mutex); + std::unique_lock lock(queue_mutex); for (LogEntries::iterator it = queue.begin(); it != queue.end(); ) { - if (it->new_part_name == part_name) + if ((*it)->new_part_name == part_name) { - zookeeper->tryRemove(replica_path + "/queue/" + it->znode_name); + zookeeper->tryRemove(replica_path + "/queue/" + (*it)->znode_name); queue.erase(it++); was_in_queue = true; } @@ -2065,6 +2136,10 @@ void StorageReplicatedMergeTree::LogEntry::writeText(WriteBuffer & out) const writeString("into\n", out); writeString(new_part_name, out); break; + case DROP_RANGE: + writeString("drop\n", out); + writeString(new_part_name, out); + break; } writeString("\n", out); } @@ -2099,6 +2174,11 @@ void StorageReplicatedMergeTree::LogEntry::readText(ReadBuffer & in) } readString(new_part_name, in); } + else if (type_str == "drop") + { + type = DROP_RANGE; + readString(new_part_name, in); + } assertString("\n", in); } From e76a00fbd24619f2504bbfbfd9e0badd2eed0746 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Wed, 6 Aug 2014 13:24:30 +0400 Subject: [PATCH 035/127] dbms: parsing and formatting ALTER TABLE DROP PARTITION. Not interpreting it yet. 
[#METR-12037] --- dbms/include/DB/Parsers/ASTAlterQuery.h | 29 +++++++++++--- dbms/include/DB/Parsers/ParserAlterQuery.h | 1 + dbms/include/DB/Storages/AlterCommands.h | 2 +- dbms/include/DB/Storages/IStorage.h | 7 ++++ .../Interpreters/InterpreterAlterQuery.cpp | 6 +-- dbms/src/Parsers/ParserAlterQuery.cpp | 40 ++++++++++++++----- dbms/src/Parsers/formatAST.cpp | 11 +++-- 7 files changed, 74 insertions(+), 22 deletions(-) diff --git a/dbms/include/DB/Parsers/ASTAlterQuery.h b/dbms/include/DB/Parsers/ASTAlterQuery.h index 3454be99222..efc5b9b353d 100644 --- a/dbms/include/DB/Parsers/ASTAlterQuery.h +++ b/dbms/include/DB/Parsers/ASTAlterQuery.h @@ -9,7 +9,8 @@ namespace DB * ALTER TABLE [db.]name_type * ADD COLUMN col_name type [AFTER col_after], * DROP COLUMN col_drop, - * MODIFY COLUMN col_name type + * MODIFY COLUMN col_name type, + * DROP PARTITION partition * ... */ @@ -18,9 +19,10 @@ class ASTAlterQuery : public IAST public: enum ParameterType { - ADD, - DROP, - MODIFY, + ADD_COLUMN, + DROP_COLUMN, + MODIFY_COLUMN, + DROP_PARTITION, NO_TYPE }; @@ -40,12 +42,17 @@ public: */ ASTPtr column; + /** В запросе DROP PARTITION здесь хранится имя partition'а. + */ + ASTPtr partition; + /// deep copy void clone(Parameters & p) const { p.type = type; - p.column = column->clone(); p.name_type = name_type->clone(); + p.column = column->clone(); + p.partition = partition->clone(); } }; typedef std::vector ParameterContainer; @@ -54,6 +61,18 @@ public: String table; + void addParameters(const Parameters & params) + { + parameters.push_back(params); + if (params.name_type) + children.push_back(params.name_type); + if (params.column) + children.push_back(params.column); + if (params.partition) + children.push_back(params.partition); + } + + ASTAlterQuery(StringRange range_ = StringRange()) : IAST(range_) {}; /** Получить текст, который идентифицирует этот элемент. 
*/ diff --git a/dbms/include/DB/Parsers/ParserAlterQuery.h b/dbms/include/DB/Parsers/ParserAlterQuery.h index d027a976c87..d871a78ce55 100644 --- a/dbms/include/DB/Parsers/ParserAlterQuery.h +++ b/dbms/include/DB/Parsers/ParserAlterQuery.h @@ -10,6 +10,7 @@ namespace DB * [ADD COLUMN col_name type [AFTER col_after],] * [DROP COLUMN col_drop, ...] * [MODIFY COLUMN col_modify type, ...] + * [DROP PARTITION partition, ...] */ class ParserAlterQuery : public IParserBase { diff --git a/dbms/include/DB/Storages/AlterCommands.h b/dbms/include/DB/Storages/AlterCommands.h index c2dc7485fa2..50650a1bc56 100644 --- a/dbms/include/DB/Storages/AlterCommands.h +++ b/dbms/include/DB/Storages/AlterCommands.h @@ -6,7 +6,7 @@ namespace DB { -/// Операция из запроса ALTER. Добавление столбцов типа Nested не развернуто в добавление отдельных столбцов. +/// Операция из запроса ALTER (кроме DROP PARTITION). Добавление столбцов типа Nested не развернуто в добавление отдельных столбцов. struct AlterCommand { enum Type diff --git a/dbms/include/DB/Storages/IStorage.h b/dbms/include/DB/Storages/IStorage.h index 71ae27cf918..c7d084839a9 100644 --- a/dbms/include/DB/Storages/IStorage.h +++ b/dbms/include/DB/Storages/IStorage.h @@ -205,6 +205,13 @@ public: throw Exception("Method alter is not supported by storage " + getName(), ErrorCodes::NOT_IMPLEMENTED); } + /** Выполнить запрос DROP PARTITION. + */ + virtual void dropPartition(const Field & partition) + { + throw Exception("Method dropPartition is not supported by storage " + getName(), ErrorCodes::NOT_IMPLEMENTED); + } + /** Выполнить какую-либо фоновую работу. Например, объединение кусков в таблице типа MergeTree. * Возвращает - была ли выполнена какая-либо работа. 
*/ diff --git a/dbms/src/Interpreters/InterpreterAlterQuery.cpp b/dbms/src/Interpreters/InterpreterAlterQuery.cpp index ce5b8ba3294..a7b2f8fdebb 100644 --- a/dbms/src/Interpreters/InterpreterAlterQuery.cpp +++ b/dbms/src/Interpreters/InterpreterAlterQuery.cpp @@ -49,7 +49,7 @@ AlterCommands InterpreterAlterQuery::parseAlter( res.push_back(AlterCommand()); AlterCommand & command = res.back(); - if (params.type == ASTAlterQuery::ADD) + if (params.type == ASTAlterQuery::ADD_COLUMN) { command.type = AlterCommand::ADD; @@ -63,12 +63,12 @@ AlterCommands InterpreterAlterQuery::parseAlter( if (params.column) command.after_column = typeid_cast(*params.column).name; } - else if (params.type == ASTAlterQuery::DROP) + else if (params.type == ASTAlterQuery::DROP_COLUMN) { command.type = AlterCommand::DROP; command.column_name = typeid_cast(*(params.column)).name; } - else if (params.type == ASTAlterQuery::MODIFY) + else if (params.type == ASTAlterQuery::MODIFY_COLUMN) { command.type = AlterCommand::MODIFY; diff --git a/dbms/src/Parsers/ParserAlterQuery.cpp b/dbms/src/Parsers/ParserAlterQuery.cpp index 4fce37d466d..f6c225b79a1 100644 --- a/dbms/src/Parsers/ParserAlterQuery.cpp +++ b/dbms/src/Parsers/ParserAlterQuery.cpp @@ -23,11 +23,13 @@ bool ParserAlterQuery::parseImpl(Pos & pos, Pos end, ASTPtr & node, Expected & e ParserString s_modify("MODIFY", true, true); ParserString s_drop("DROP", true, true); + ParserString s_partition("PARTITION", true, true); ParserString s_comma(","); ParserIdentifier table_parser; ParserCompoundIdentifier parser_name; ParserCompoundNameTypePair parser_name_type; + ParserLiteral parser_literal; ASTPtr table; ASTPtr database; @@ -75,7 +77,8 @@ bool ParserAlterQuery::parseImpl(Pos & pos, Pos end, ASTPtr & node, Expected & e if (s_add.ignore(pos, end, expected)) { ws.ignore(pos, end); - s_column.ignore(pos, end, expected); + if (!s_column.ignore(pos, end, expected)) + return false; ws.ignore(pos, end); parser_name_type.parse(pos, end, 
params.name_type, expected); @@ -89,29 +92,46 @@ bool ParserAlterQuery::parseImpl(Pos & pos, Pos end, ASTPtr & node, Expected & e return false; } - params.type = ASTAlterQuery::ADD; + params.type = ASTAlterQuery::ADD_COLUMN; } else if (s_drop.ignore(pos, end, expected)) { ws.ignore(pos, end); - s_column.ignore(pos, end, expected); - ws.ignore(pos, end); - parser_name.parse(pos, end, params.column, expected); + if (s_partition.ignore(pos, end, expected)) + { + ws.ignore(pos, end); - params.type = ASTAlterQuery::DROP; + if (!parser_literal.parse(pos, end, params.partition, expected)) + return false; + + params.type = ASTAlterQuery::DROP_PARTITION; + } + else if (s_column.ignore(pos, end, expected)) + { + ws.ignore(pos, end); + + if (!parser_name.parse(pos, end, params.column, expected)) + return false; + + params.type = ASTAlterQuery::DROP_COLUMN; + } + else + return false; } else if (s_modify.ignore(pos, end, expected)) { ws.ignore(pos, end); - s_column.ignore(pos, end, expected); + if (!s_column.ignore(pos, end, expected)) + return false; ws.ignore(pos, end); - parser_name_type.parse(pos, end, params.name_type, expected); + if (!parser_name_type.parse(pos, end, params.name_type, expected)) + return false; ws.ignore(pos, end); - params.type = ASTAlterQuery::MODIFY; + params.type = ASTAlterQuery::MODIFY_COLUMN; } else return false; @@ -124,7 +144,7 @@ bool ParserAlterQuery::parseImpl(Pos & pos, Pos end, ASTPtr & node, Expected & e parsing_finished = true; } - query->parameters.push_back(params); + query->addParameters(params); } while (!parsing_finished); diff --git a/dbms/src/Parsers/formatAST.cpp b/dbms/src/Parsers/formatAST.cpp index 51edde02123..3aa488e8070 100644 --- a/dbms/src/Parsers/formatAST.cpp +++ b/dbms/src/Parsers/formatAST.cpp @@ -721,7 +721,7 @@ void formatAST(const ASTAlterQuery & ast, std::ostream & s, size_t indent, bo { const ASTAlterQuery::Parameters &p = ast.parameters[i]; - if (p.type == ASTAlterQuery::ADD) + if (p.type == 
ASTAlterQuery::ADD_COLUMN) { s << (hilite ? hilite_keyword : "") << indent_str << "ADD COLUMN " << (hilite ? hilite_none : ""); formatAST(*p.name_type, s, indent, hilite, true); @@ -733,16 +733,21 @@ void formatAST(const ASTAlterQuery & ast, std::ostream & s, size_t indent, bo formatAST(*p.column, s, indent, hilite, one_line); } } - else if (p.type == ASTAlterQuery::DROP) + else if (p.type == ASTAlterQuery::DROP_COLUMN) { s << (hilite ? hilite_keyword : "") << indent_str << "DROP COLUMN " << (hilite ? hilite_none : ""); formatAST(*p.column, s, indent, hilite, true); } - else if (p.type == ASTAlterQuery::MODIFY) + else if (p.type == ASTAlterQuery::MODIFY_COLUMN) { s << (hilite ? hilite_keyword : "") << indent_str << "MODIFY COLUMN " << (hilite ? hilite_none : ""); formatAST(*p.name_type, s, indent, hilite, true); } + else if (p.type == ASTAlterQuery::DROP_PARTITION) + { + s << (hilite ? hilite_keyword : "") << indent_str << "DROP PARTITION " << (hilite ? hilite_none : ""); + formatAST(*p.partition, s, indent, hilite, true); + } else throw Exception("Unexpected type of ALTER", ErrorCodes::UNEXPECTED_AST_STRUCTURE); From 00fefb3df2cbd154e9724c342497aa9c624a0079 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Wed, 6 Aug 2014 14:26:35 +0400 Subject: [PATCH 036/127] dbms: interpreting ALTER TABLE DROP PARTITION. Not storage supports it yet. [#METR-12037] --- .../DB/Interpreters/InterpreterAlterQuery.h | 7 ++-- .../Interpreters/InterpreterAlterQuery.cpp | 34 +++++++++++++------ 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/dbms/include/DB/Interpreters/InterpreterAlterQuery.h b/dbms/include/DB/Interpreters/InterpreterAlterQuery.h index 8c2a635d535..6775db9e442 100644 --- a/dbms/include/DB/Interpreters/InterpreterAlterQuery.h +++ b/dbms/include/DB/Interpreters/InterpreterAlterQuery.h @@ -22,11 +22,14 @@ public: /** Изменяет список столбцов в метаданных таблицы на диске. Нужно вызывать под TableStructureLock соответствующей таблицы. 
*/ static void updateMetadata(const String & database, const String & table, const NamesAndTypesList & columns, Context & context); - - static AlterCommands parseAlter(const ASTAlterQuery::ParameterContainer & params, const DataTypeFactory & data_type_factory); private: + typedef std::vector Partitions; + ASTPtr query_ptr; Context context; + + static void parseAlter(const ASTAlterQuery::ParameterContainer & params, const DataTypeFactory & data_type_factory, + AlterCommands & out_commands, Partitions & out_partitions_to_drop); }; } diff --git a/dbms/src/Interpreters/InterpreterAlterQuery.cpp b/dbms/src/Interpreters/InterpreterAlterQuery.cpp index a7b2f8fdebb..da26eae779b 100644 --- a/dbms/src/Interpreters/InterpreterAlterQuery.cpp +++ b/dbms/src/Interpreters/InterpreterAlterQuery.cpp @@ -33,24 +33,27 @@ void InterpreterAlterQuery::execute() ASTAlterQuery & alter = typeid_cast(*query_ptr); String & table_name = alter.table; String database_name = alter.database.empty() ? context.getCurrentDatabase() : alter.database; - AlterCommands commands = parseAlter(alter.parameters, context.getDataTypeFactory()); + AlterCommands commands; + Partitions partitions_to_drop; + parseAlter(alter.parameters, context.getDataTypeFactory(), commands, partitions_to_drop); StoragePtr table = context.getTable(database_name, table_name); + + for (const Field & partition : partitions_to_drop) + table->dropPartition(partition); + table->alter(commands, database_name, table_name, context); } -AlterCommands InterpreterAlterQuery::parseAlter( - const ASTAlterQuery::ParameterContainer & params_container, const DataTypeFactory & data_type_factory) +void InterpreterAlterQuery::parseAlter( + const ASTAlterQuery::ParameterContainer & params_container, const DataTypeFactory & data_type_factory, + AlterCommands & out_commands, Partitions & out_partitions_to_drop) { - AlterCommands res; - for (const auto & params : params_container) { - res.push_back(AlterCommand()); - AlterCommand & command = 
res.back(); - if (params.type == ASTAlterQuery::ADD_COLUMN) { + AlterCommand command; command.type = AlterCommand::ADD; const ASTNameTypePair & ast_name_type = typeid_cast(*params.name_type); @@ -62,14 +65,20 @@ AlterCommands InterpreterAlterQuery::parseAlter( if (params.column) command.after_column = typeid_cast(*params.column).name; + + out_commands.push_back(command); } else if (params.type == ASTAlterQuery::DROP_COLUMN) { + AlterCommand command; command.type = AlterCommand::DROP; command.column_name = typeid_cast(*(params.column)).name; + + out_commands.push_back(command); } else if (params.type == ASTAlterQuery::MODIFY_COLUMN) { + AlterCommand command; command.type = AlterCommand::MODIFY; const ASTNameTypePair & ast_name_type = typeid_cast(*params.name_type); @@ -78,12 +87,17 @@ AlterCommands InterpreterAlterQuery::parseAlter( command.column_name = ast_name_type.name; command.data_type = data_type_factory.get(type_string); + + out_commands.push_back(command); + } + else if (params.type == ASTAlterQuery::DROP_PARTITION) + { + const Field & partition = dynamic_cast(params.partition).value; + out_partitions_to_drop.push_back(partition); } else throw Exception("Wrong parameter type in ALTER query", ErrorCodes::LOGICAL_ERROR); } - - return res; } void InterpreterAlterQuery::updateMetadata( From 20f8e17e847db3a91afecbcbfc49e40e9818f654 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Wed, 6 Aug 2014 14:28:36 +0400 Subject: [PATCH 037/127] dbms: interpreting ALTER TABLE DROP PARTITION. Not storage supports it yet. 
[#METR-12037] --- dbms/src/Interpreters/InterpreterAlterQuery.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Interpreters/InterpreterAlterQuery.cpp b/dbms/src/Interpreters/InterpreterAlterQuery.cpp index da26eae779b..489e3fe68dd 100644 --- a/dbms/src/Interpreters/InterpreterAlterQuery.cpp +++ b/dbms/src/Interpreters/InterpreterAlterQuery.cpp @@ -92,7 +92,7 @@ void InterpreterAlterQuery::parseAlter( } else if (params.type == ASTAlterQuery::DROP_PARTITION) { - const Field & partition = dynamic_cast(params.partition).value; + const Field & partition = dynamic_cast(*params.partition).value; out_partitions_to_drop.push_back(partition); } else From 7643a49b8e47eb58437eee399b3eb232cff7f499 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Thu, 7 Aug 2014 13:23:55 +0400 Subject: [PATCH 038/127] Merge --- dbms/include/DB/Core/ErrorCodes.h | 2 + .../DB/Interpreters/InterpreterAlterQuery.h | 22 +- dbms/include/DB/Parsers/ASTAlterQuery.h | 4 +- dbms/include/DB/Storages/IStorage.h | 2 +- .../MergeTree/AbandonableLockInZooKeeper.h | 14 +- .../DB/Storages/MergeTree/MergeTreeData.h | 4 +- .../ReplicatedMergeTreeBlockOutputStream.h | 21 +- .../DB/Storages/StorageReplicatedMergeTree.h | 17 ++ .../Interpreters/InterpreterAlterQuery.cpp | 28 ++- dbms/src/Parsers/ParserAlterQuery.cpp | 17 ++ dbms/src/Storages/MergeTree/MergeTreeData.cpp | 4 +- .../Storages/StorageReplicatedMergeTree.cpp | 212 +++++++++++++++++- 12 files changed, 301 insertions(+), 46 deletions(-) diff --git a/dbms/include/DB/Core/ErrorCodes.h b/dbms/include/DB/Core/ErrorCodes.h index 72b8f4cc309..73b54c31d5b 100644 --- a/dbms/include/DB/Core/ErrorCodes.h +++ b/dbms/include/DB/Core/ErrorCodes.h @@ -255,6 +255,8 @@ namespace ErrorCodes INVALID_NESTED_NAME, CORRUPTED_DATA, INCORRECT_MARK, + INVALID_PARTITION_NAME, + NOT_LEADER, POCO_EXCEPTION = 1000, STD_EXCEPTION, diff --git a/dbms/include/DB/Interpreters/InterpreterAlterQuery.h b/dbms/include/DB/Interpreters/InterpreterAlterQuery.h 
index 6775db9e442..e31d6d0ebd0 100644 --- a/dbms/include/DB/Interpreters/InterpreterAlterQuery.h +++ b/dbms/include/DB/Interpreters/InterpreterAlterQuery.h @@ -23,13 +23,31 @@ public: */ static void updateMetadata(const String & database, const String & table, const NamesAndTypesList & columns, Context & context); private: - typedef std::vector Partitions; + struct PartitionCommand + { + enum Type + { + DROP_PARTITION, + }; + + Type type; + + Field partition; + bool detach; /// true для DETACH PARTITION. + + static PartitionCommand dropPartition(const Field & partition, bool detach) + { + return {DROP_PARTITION, partition, detach}; + } + }; + + typedef std::vector PartitionCommands; ASTPtr query_ptr; Context context; static void parseAlter(const ASTAlterQuery::ParameterContainer & params, const DataTypeFactory & data_type_factory, - AlterCommands & out_commands, Partitions & out_partitions_to_drop); + AlterCommands & out_alter_commands, PartitionCommands & out_partition_commands); }; } diff --git a/dbms/include/DB/Parsers/ASTAlterQuery.h b/dbms/include/DB/Parsers/ASTAlterQuery.h index efc5b9b353d..e1b398d76d2 100644 --- a/dbms/include/DB/Parsers/ASTAlterQuery.h +++ b/dbms/include/DB/Parsers/ASTAlterQuery.h @@ -29,7 +29,7 @@ public: struct Parameters { Parameters() : type(NO_TYPE) {} - int type; + int type = NO_TYPE; /** В запросе ADD COLUMN здесь хранится имя и тип добавляемого столбца * В запросе DROP это поле не используется @@ -45,6 +45,7 @@ public: /** В запросе DROP PARTITION здесь хранится имя partition'а. */ ASTPtr partition; + bool detach = false; /// true для DETACH PARTITION. 
/// deep copy void clone(Parameters & p) const @@ -53,6 +54,7 @@ public: p.name_type = name_type->clone(); p.column = column->clone(); p.partition = partition->clone(); + p.detach = detach; } }; typedef std::vector ParameterContainer; diff --git a/dbms/include/DB/Storages/IStorage.h b/dbms/include/DB/Storages/IStorage.h index c7d084839a9..8b6e7155a9f 100644 --- a/dbms/include/DB/Storages/IStorage.h +++ b/dbms/include/DB/Storages/IStorage.h @@ -207,7 +207,7 @@ public: /** Выполнить запрос DROP PARTITION. */ - virtual void dropPartition(const Field & partition) + virtual void dropPartition(const Field & partition, bool detach) { throw Exception("Method dropPartition is not supported by storage " + getName(), ErrorCodes::NOT_IMPLEMENTED); } diff --git a/dbms/include/DB/Storages/MergeTree/AbandonableLockInZooKeeper.h b/dbms/include/DB/Storages/MergeTree/AbandonableLockInZooKeeper.h index 09e7a67c653..6f60cd37b94 100644 --- a/dbms/include/DB/Storages/MergeTree/AbandonableLockInZooKeeper.h +++ b/dbms/include/DB/Storages/MergeTree/AbandonableLockInZooKeeper.h @@ -13,7 +13,7 @@ namespace DB * При вызове деструктора или завершении сессии в ZooKeeper, переходит в состояние ABANDONED. * (В том числе при падении программы) */ -class AbandonableLockInZooKeeper +class AbandonableLockInZooKeeper : private boost::noncopyable { public: enum State @@ -34,6 +34,14 @@ public: path = zookeeper.create(path_prefix, holder_path, zkutil::CreateMode::PersistentSequential); } + AbandonableLockInZooKeeper(AbandonableLockInZooKeeper && rhs) + : zookeeper(rhs.zookeeper) + { + std::swap(path_prefix, rhs.path_prefix); + std::swap(path, rhs.path); + std::swap(holder_path, rhs.holder_path); + } + String getPath() { return path; @@ -49,6 +57,7 @@ public: { zookeeper.remove(path); zookeeper.remove(holder_path); + holder_path = ""; } /// Добавляет в список действия, эквивалентные unlock(). 
@@ -60,6 +69,9 @@ public: ~AbandonableLockInZooKeeper() { + if (holder_path.empty()) + return; + try { zookeeper.tryRemove(holder_path); diff --git a/dbms/include/DB/Storages/MergeTree/MergeTreeData.h b/dbms/include/DB/Storages/MergeTree/MergeTreeData.h index 0aa257df3a5..b211faab12e 100644 --- a/dbms/include/DB/Storages/MergeTree/MergeTreeData.h +++ b/dbms/include/DB/Storages/MergeTree/MergeTreeData.h @@ -630,10 +630,10 @@ public: */ void replaceParts(const DataPartsVector & remove, const DataPartsVector & add, bool clear_without_timeout); - /** Переименовывает кусок в prefix_кусок и убирает его из рабочего набора. + /** Переименовывает кусок в detached/prefix_кусок и забывает про него. Данные не будут удалены в clearOldParts. * Если restore_covered, добавляет в рабочий набор неактивные куски, слиянием которых получен удаляемый кусок. */ - void renameAndDetachPart(DataPartPtr part, const String & prefix, bool restore_covered = false); + void renameAndDetachPart(DataPartPtr part, const String & prefix = "", bool restore_covered = false); /** Возвращает старые неактуальные куски, которые можно удалить. Одновременно удаляет их из списка кусков, но не с диска. */ diff --git a/dbms/include/DB/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h b/dbms/include/DB/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h index d95de7fce4e..a6ac85af4aa 100644 --- a/dbms/include/DB/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h +++ b/dbms/include/DB/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h @@ -28,26 +28,7 @@ public: time_t min_date_time = DateLUT::instance().fromDayNum(DayNum_t(current_block.min_date)); String month_name = toString(Date2OrderedIdentifier(min_date_time) / 100); - String month_path = storage.zookeeper_path + "/block_numbers/" + month_name; - if (!storage.zookeeper->exists(month_path)) - { - /// Создадим в block_numbers ноду для месяца и пропустим в ней 200 значений инкремента. 
- /// Нужно, чтобы в будущем при необходимости можно было добавить данные в начало. - zkutil::Ops ops; - auto acl = storage.zookeeper->getDefaultACL(); - ops.push_back(new zkutil::Op::Create(month_path, "", acl, zkutil::CreateMode::Persistent)); - for (size_t i = 0; i < 200; ++i) - { - ops.push_back(new zkutil::Op::Create(month_path + "/skip_increment", "", acl, zkutil::CreateMode::Persistent)); - ops.push_back(new zkutil::Op::Remove(month_path + "/skip_increment", -1)); - } - /// Игнорируем ошибки - не получиться могло только если кто-то еще выполнил эту строчку раньше нас. - storage.zookeeper->tryMulti(ops); - } - - AbandonableLockInZooKeeper block_number_lock( - storage.zookeeper_path + "/block_numbers/" + month_name + "/block-", - storage.zookeeper_path + "/temp", *storage.zookeeper); + AbandonableLockInZooKeeper block_number_lock = storage.allocateBlockNumber(month_name); UInt64 part_number = block_number_lock.getNumber(); diff --git a/dbms/include/DB/Storages/StorageReplicatedMergeTree.h b/dbms/include/DB/Storages/StorageReplicatedMergeTree.h index f86f6c91f4d..73c00a3e1ba 100644 --- a/dbms/include/DB/Storages/StorageReplicatedMergeTree.h +++ b/dbms/include/DB/Storages/StorageReplicatedMergeTree.h @@ -6,6 +6,7 @@ #include #include #include +#include "MergeTree/AbandonableLockInZooKeeper.h" #include #include #include @@ -77,6 +78,8 @@ public: void alter(const AlterCommands & params, const String & database_name, const String & table_name, Context & context) override; + void dropPartition(const Field & partition, bool detach) override; + /** Удаляет реплику из ZooKeeper. Если других реплик нет, удаляет всю таблицу из ZooKeeper. */ void drop() override; @@ -139,9 +142,13 @@ private: Type type; String source_replica; /// Пустая строка значит, что эта запись была добавлена сразу в очередь, а не скопирована из лога. + String new_part_name; /// Для DROP_RANGE имя несуществующего куска. Нужно удалить все куски, покрытые им. 
+ Strings parts_to_merge; + bool detach = false; /// Для DROP_RANGE, true значит, что куски нужно не удалить, а перенести в директорию detached. + FuturePartTaggerPtr future_part_tagger; bool currently_executing = false; /// Доступ под queue_mutex. std::condition_variable execution_complete; /// Пробуждается когда currently_executing становится false. @@ -269,6 +276,7 @@ private: /// Поток, выбирающий куски для слияния. std::thread merge_selecting_thread; Poco::Event merge_selecting_event; + std::mutex merge_selecting_mutex; /// Берется на каждую итерацию выбора кусков для слияния. /// Поток, удаляющий старые куски, записи в логе и блоки. std::thread cleanup_thread; @@ -433,6 +441,15 @@ private: /** Скачать указанный кусок с указанной реплики. */ void fetchPart(const String & part_name, const String & replica_name); + + /// + + AbandonableLockInZooKeeper allocateBlockNumber(const String & month_name); + + /** Дождаться, пока все реплики, включая эту, выполнят указанное действие из лога. + * Если одновременно с этим добавляются реплики, может не дождаться добавленную реплику. + */ + void waitForAllReplicasToProcessLogEntry(const String & log_znode_path, const LogEntry & entry); }; } diff --git a/dbms/src/Interpreters/InterpreterAlterQuery.cpp b/dbms/src/Interpreters/InterpreterAlterQuery.cpp index 489e3fe68dd..e9c54622841 100644 --- a/dbms/src/Interpreters/InterpreterAlterQuery.cpp +++ b/dbms/src/Interpreters/InterpreterAlterQuery.cpp @@ -33,21 +33,27 @@ void InterpreterAlterQuery::execute() ASTAlterQuery & alter = typeid_cast(*query_ptr); String & table_name = alter.table; String database_name = alter.database.empty() ? 
context.getCurrentDatabase() : alter.database; - AlterCommands commands; - Partitions partitions_to_drop; - parseAlter(alter.parameters, context.getDataTypeFactory(), commands, partitions_to_drop); + AlterCommands alter_commands; + PartitionCommands partition_commands; + parseAlter(alter.parameters, context.getDataTypeFactory(), alter_commands, partition_commands); StoragePtr table = context.getTable(database_name, table_name); - for (const Field & partition : partitions_to_drop) - table->dropPartition(partition); + for (const PartitionCommand & command : partition_commands) + { + if (command.type == PartitionCommand::DROP_PARTITION) + table->dropPartition(command.partition, command.detach); + else + throw Exception("Bad PartitionCommand::Type: " + toString(command.type), ErrorCodes::ARGUMENT_OUT_OF_BOUND); + } - table->alter(commands, database_name, table_name, context); + if (!alter_commands.empty()) + table->alter(alter_commands, database_name, table_name, context); } void InterpreterAlterQuery::parseAlter( const ASTAlterQuery::ParameterContainer & params_container, const DataTypeFactory & data_type_factory, - AlterCommands & out_commands, Partitions & out_partitions_to_drop) + AlterCommands & out_alter_commands, PartitionCommands & out_partition_commands) { for (const auto & params : params_container) { @@ -66,7 +72,7 @@ void InterpreterAlterQuery::parseAlter( if (params.column) command.after_column = typeid_cast(*params.column).name; - out_commands.push_back(command); + out_alter_commands.push_back(command); } else if (params.type == ASTAlterQuery::DROP_COLUMN) { @@ -74,7 +80,7 @@ void InterpreterAlterQuery::parseAlter( command.type = AlterCommand::DROP; command.column_name = typeid_cast(*(params.column)).name; - out_commands.push_back(command); + out_alter_commands.push_back(command); } else if (params.type == ASTAlterQuery::MODIFY_COLUMN) { @@ -88,12 +94,12 @@ void InterpreterAlterQuery::parseAlter( command.column_name = ast_name_type.name; command.data_type 
= data_type_factory.get(type_string); - out_commands.push_back(command); + out_alter_commands.push_back(command); } else if (params.type == ASTAlterQuery::DROP_PARTITION) { const Field & partition = dynamic_cast(*params.partition).value; - out_partitions_to_drop.push_back(partition); + out_partition_commands.push_back(PartitionCommand::dropPartition(partition, params.detach)); } else throw Exception("Wrong parameter type in ALTER query", ErrorCodes::LOGICAL_ERROR); diff --git a/dbms/src/Parsers/ParserAlterQuery.cpp b/dbms/src/Parsers/ParserAlterQuery.cpp index f6c225b79a1..00a5229ddde 100644 --- a/dbms/src/Parsers/ParserAlterQuery.cpp +++ b/dbms/src/Parsers/ParserAlterQuery.cpp @@ -23,6 +23,7 @@ bool ParserAlterQuery::parseImpl(Pos & pos, Pos end, ASTPtr & node, Expected & e ParserString s_modify("MODIFY", true, true); ParserString s_drop("DROP", true, true); + ParserString s_detach("DETACH", true, true); ParserString s_partition("PARTITION", true, true); ParserString s_comma(","); @@ -115,10 +116,26 @@ bool ParserAlterQuery::parseImpl(Pos & pos, Pos end, ASTPtr & node, Expected & e return false; params.type = ASTAlterQuery::DROP_COLUMN; + params.detach = false; } else return false; } + else if (s_detach.ignore(pos, end, expected)) + { + ws.ignore(pos, end); + + if (!s_partition.ignore(pos, end, expected)) + return false; + + ws.ignore(pos, end); + + if (!parser_literal.parse(pos, end, params.partition, expected)) + return false; + + params.type = ASTAlterQuery::DROP_PARTITION; + params.detach = true; + } else if (s_modify.ignore(pos, end, expected)) { ws.ignore(pos, end); diff --git a/dbms/src/Storages/MergeTree/MergeTreeData.cpp b/dbms/src/Storages/MergeTree/MergeTreeData.cpp index e44a196bf94..3b06ecdd3d8 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeData.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeData.cpp @@ -41,6 +41,7 @@ MergeTreeData::MergeTreeData( { /// создаём директорию, если её нет Poco::File(full_path).createDirectories(); + 
Poco::File(full_path + "detached").createDirectory(); /// инициализируем описание сортировки sort_descr.reserve(primary_expr_ast->children.size()); @@ -104,6 +105,7 @@ void MergeTreeData::loadDataParts() if (0 == file_name.compare(0, strlen("tmp_"), "tmp_")) continue; + /// TODO: Это можно удалить, если нигде больше не осталось директорий old_* (их давно никто не пишет). if (0 == file_name.compare(0, strlen("old_"), "old_")) { String new_file_name = file_name.substr(strlen("old_")); @@ -731,7 +733,7 @@ void MergeTreeData::renameAndDetachPart(DataPartPtr part, const String & prefix, throw Exception("No such data part", ErrorCodes::NO_SUCH_DATA_PART); data_parts.erase(part); - part->renameAddPrefix(prefix); + part->renameAddPrefix("detached/" + prefix); if (restore_covered) { diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index d79e13b7004..885c8f57dfd 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -961,7 +961,7 @@ bool StorageReplicatedMergeTree::executeLogEntry(const LogEntry & entry, Backgro bool StorageReplicatedMergeTree::executeDropRange(const StorageReplicatedMergeTree::LogEntry & entry) { - LOG_INFO(log, "Removing parts covered by " << entry.new_part_name << "."); + LOG_INFO(log, (entry.detach ? "Detaching" : "Removing") << " parts inside " << entry.new_part_name << "."); { LogEntries to_wait; @@ -998,7 +998,7 @@ bool StorageReplicatedMergeTree::executeDropRange(const StorageReplicatedMergeTr } } - LOG_DEBUG(log, "Removing parts."); + LOG_DEBUG(log, (entry.detach ? "Detaching" : "Removing") << " parts."); size_t removed_parts = 0; /// Удалим куски, содержащиеся в удаляемом диапазоне. 
@@ -1010,16 +1010,40 @@ bool StorageReplicatedMergeTree::executeDropRange(const StorageReplicatedMergeTr LOG_DEBUG(log, "Removing part " << part->name); ++removed_parts; + /// Если кусок удалять не нужно, надежнее переместить директорию до изменений в ZooKeeper. + if (entry.detach) + data.renameAndDetachPart(part); + zkutil::Ops ops; ops.push_back(new zkutil::Op::Remove(replica_path + "/parts/" + part->name + "/columns", -1)); ops.push_back(new zkutil::Op::Remove(replica_path + "/parts/" + part->name + "/checksums", -1)); ops.push_back(new zkutil::Op::Remove(replica_path + "/parts/" + part->name, -1)); zookeeper->multi(ops); - data.replaceParts({part}, {}, false); + /// Если кусок нужно удалить не нужно, надежнее удалить директорию после изменений в ZooKeeper. + if (!entry.detach) + data.replaceParts({part}, {}, false); } - LOG_INFO(log, "Finished removing parts covered by " << entry.new_part_name << "."); + LOG_INFO(log, (entry.detach ? "Detached" : "Removed") << removed_parts << " parts inside " << entry.new_part_name << "."); + + if (unreplicated_data) + { + removed_parts = 0; + parts = unreplicated_data->getDataParts(); + for (const auto & part : parts) + { + if (!ActiveDataPartSet::contains(entry.new_part_name, part->name)) + continue; + LOG_DEBUG(log, "Removing unreplicated part " << part->name); + ++removed_parts; + + if (entry.detach) + unreplicated_data->renameAndDetachPart(part, ""); + else + unreplicated_data->replaceParts({part}, {}, false); + } + } return true; } @@ -1143,6 +1167,8 @@ void StorageReplicatedMergeTree::mergeSelectingThread() try { + std::unique_lock merge_selecting_lock(merge_selecting_mutex); + if (need_pull) { /// Нужно загрузить новую запись в очередь перед тем, как выбирать куски для слияния. 
@@ -1266,6 +1292,9 @@ void StorageReplicatedMergeTree::cleanupThread() { clearOldParts(); + if (unreplicated_data) + unreplicated_data->clearOldParts(); + if (is_leader_node) { clearOldLogs(); @@ -1989,7 +2018,8 @@ bool StorageReplicatedMergeTree::optimize() return true; } -void StorageReplicatedMergeTree::alter(const AlterCommands & params, const String & database_name, const String & table_name, Context & context) +void StorageReplicatedMergeTree::alter(const AlterCommands & params, + const String & database_name, const String & table_name, Context & context) { LOG_DEBUG(log, "Doing ALTER"); @@ -2077,6 +2107,90 @@ void StorageReplicatedMergeTree::alter(const AlterCommands & params, const Strin LOG_DEBUG(log, "ALTER finished"); } +static bool isValidMonthName(const String & s) +{ + if (s.size() != 6) + return false; + if (!std::all_of(s.begin(), s.end(), isdigit)) + return false; + DayNum_t date = DateLUT::instance().toDayNum(OrderedIdentifier2Date(s + "01")); + /// Не можем просто сравнить date с нулем, потому что 0 тоже валидный DayNum. + return s == toString(Date2OrderedIdentifier(DateLUT::instance().fromDayNum(date)) / 100); +} + +/// Название воображаемого куска, покрывающего все возможные куски в указанном месяце с номерами в указанном диапазоне. +static String getFakePartNameForDrop(const String & month_name, UInt64 left, UInt64 right) +{ + /// Диапазон дат - весь месяц. + DateLUT & lut = DateLUT::instance(); + time_t start_time = OrderedIdentifier2Date(month_name + "01"); + DayNum_t left_date = lut.toDayNum(start_time); + DayNum_t right_date = DayNum_t(static_cast(left_date) + lut.daysInMonth(start_time) - 1); + + /// Уровень - right-left+1: кусок не мог образоваться в результате такого или большего количества слияний. 
+ return ActiveDataPartSet::getPartName(left_date, right_date, left, right, right - left + 1); +} + +void StorageReplicatedMergeTree::dropPartition(const Field & field, bool detach) +{ + String month_name; + + if (field.getType() == Field::Types::UInt64) + month_name = toString(field.get()); + else + month_name = field.safeGet(); + + if (!isValidMonthName(month_name)) + throw Exception("Invalid partition format: " + month_name + ". Partition should consist of 6 digits: YYYYMM", + ErrorCodes::INVALID_PARTITION_NAME); + + /// TODO: Делать запрос в лидера по TCP. + if (!is_leader_node) + throw Exception("DROP PARTITION can only be done on leader replica.", ErrorCodes::NOT_LEADER); + + + /** Пропустим один номер в block_numbers для удаляемого месяца, и будем удалять только куски до этого номера. + * Это запретит мерджи удаляемых кусков с новыми вставляемыми данными. + * Инвариант: в логе не появятся слияния удаляемых кусков с другими кусками. + * NOTE: Если понадобится аналогично поддержать запрос DROP PART, для него придется придумать какой-нибудь новый механизм, + * чтобы гарантировать этот инвариант. + */ + UInt64 right; + + { + AbandonableLockInZooKeeper block_number_lock = allocateBlockNumber(month_name); + right = block_number_lock.getNumber(); + block_number_lock.unlock(); + } + + /// Такого никогда не должно происходить. + if (right == 0) + return; + --right; + + String fake_part_name = getFakePartNameForDrop(month_name, 0, right); + + /** Запретим выбирать для слияния удаляемые куски - сделаем вид, что их всех уже собираются слить в fake_part_name. + * Инвариант: после появления в логе записи DROP_RANGE, в логе не появятся слияния удаляемых кусков. + */ + { + std::unique_lock merge_selecting_lock(merge_selecting_mutex); + + virtual_parts.add(fake_part_name); + } + + /// Наконец, добившись нужны инвариантов, можно положить запись в лог. 
+ LogEntry entry; + entry.type = LogEntry::DROP_RANGE; + entry.source_replica = replica_name; + entry.new_part_name = fake_part_name; + entry.detach = detach; + String log_znode_path = zookeeper->create(zookeeper_path + "/log/log-", entry.toString(), zkutil::CreateMode::PersistentSequential); + + /// Дождемся, пока все реплики выполнят дроп. + waitForAllReplicasToProcessLogEntry(log_znode_path, entry); +} + void StorageReplicatedMergeTree::drop() { if (!zookeeper) @@ -2114,6 +2228,86 @@ void StorageReplicatedMergeTree::rename(const String & new_path_to_db, const Str /// TODO: Можно обновить названия логгеров. } +AbandonableLockInZooKeeper StorageReplicatedMergeTree::allocateBlockNumber(const String & month_name) +{ + String month_path = zookeeper_path + "/block_numbers/" + month_name; + if (!zookeeper->exists(month_path)) + { + /// Создадим в block_numbers ноду для месяца и пропустим в ней 200 значений инкремента. + /// Нужно, чтобы в будущем при необходимости можно было добавить данные в начало. + zkutil::Ops ops; + auto acl = zookeeper->getDefaultACL(); + ops.push_back(new zkutil::Op::Create(month_path, "", acl, zkutil::CreateMode::Persistent)); + for (size_t i = 0; i < 200; ++i) + { + ops.push_back(new zkutil::Op::Create(month_path + "/skip_increment", "", acl, zkutil::CreateMode::Persistent)); + ops.push_back(new zkutil::Op::Remove(month_path + "/skip_increment", -1)); + } + /// Игнорируем ошибки - не получиться могло только если кто-то еще выполнил эту строчку раньше нас. 
+ zookeeper->tryMulti(ops); + } + + return AbandonableLockInZooKeeper( + zookeeper_path + "/block_numbers/" + month_name + "/block-", + zookeeper_path + "/temp", *zookeeper); +} + +void StorageReplicatedMergeTree::waitForAllReplicasToProcessLogEntry(const String & log_znode_path, const LogEntry & entry) +{ + UInt64 log_index = parse(log_znode_path.substr(log_znode_path.size() - 10)); + String log_entry_str = entry.toString(); + + Strings replicas = zookeeper->getChildren(zookeeper_path + "/replicas"); + for (const String & replica : replicas) + { + /// Дождемся, пока запись попадет в очередь реплики. + while (true) + { + zkutil::EventPtr event = new Poco::Event; + + String pointer = zookeeper->get(zookeeper_path + "/replicas/" + replica + "/log_pointer", nullptr, event); + if (!pointer.empty() && parse(pointer) > log_index) + break; + + event->wait(); + } + + /// Найдем запись в очереди реплики. + Strings queue_entries = zookeeper->getChildren(zookeeper_path + "/replicas/" + replica + "/queue"); + String entry_to_wait_for; + + for (const String & entry_name : queue_entries) + { + String queue_entry_str; + auto code = zookeeper->tryGet(zookeeper_path + "/replicas/" + replica + "/queue/" + entry_name, queue_entry_str); + if (code == ZOK && queue_entry_str == log_entry_str) + { + entry_to_wait_for = entry_name; + break; + } + } + + /// Пока искали запись, ее уже выполнили и удалили. + if (entry_to_wait_for.empty()) + continue; + + /// Дождемся, пока запись исчезнет из очереди реплики. + while (true) + { + zkutil::EventPtr event = new Poco::Event; + + String unused; + /// get вместо exists, чтобы не утек watch, если ноды уже нет. 
+ auto code = zookeeper->tryGet(zookeeper_path + "/replicas/" + replica + "/queue/" + entry_to_wait_for, unused, nullptr, event); + if (code == ZNONODE) + break; + + event->wait(); + } + } +} + + void StorageReplicatedMergeTree::LogEntry::writeText(WriteBuffer & out) const { writeString("format version: 1\n", out); @@ -2137,7 +2331,10 @@ void StorageReplicatedMergeTree::LogEntry::writeText(WriteBuffer & out) const writeString(new_part_name, out); break; case DROP_RANGE: - writeString("drop\n", out); + if (detach) + writeString("detach\n", out); + else + writeString("drop\n", out); writeString(new_part_name, out); break; } @@ -2174,9 +2371,10 @@ void StorageReplicatedMergeTree::LogEntry::readText(ReadBuffer & in) } readString(new_part_name, in); } - else if (type_str == "drop") + else if (type_str == "drop" || type_str == "detach") { type = DROP_RANGE; + detach = type_str == "detach"; readString(new_part_name, in); } assertString("\n", in); From 9bec7e4a28bcda7c589118a639ce7dcbdf46477a Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Thu, 7 Aug 2014 15:46:01 +0400 Subject: [PATCH 039/127] Merge --- .../DB/Interpreters/InterpreterAlterQuery.h | 9 ++++++ dbms/include/DB/Parsers/ASTAlterQuery.h | 7 +++-- dbms/include/DB/Storages/IStorage.h | 9 +++++- .../DB/Storages/StorageReplicatedMergeTree.h | 1 + .../Interpreters/InterpreterAlterQuery.cpp | 7 +++++ dbms/src/Parsers/ParserAlterQuery.cpp | 25 +++++++++++++++ dbms/src/Parsers/formatAST.cpp | 9 +++++- dbms/src/Storages/MergeTree/MergeTreeData.cpp | 2 -- .../Storages/StorageReplicatedMergeTree.cpp | 31 ++++++++++++++----- 9 files changed, 86 insertions(+), 14 deletions(-) diff --git a/dbms/include/DB/Interpreters/InterpreterAlterQuery.h b/dbms/include/DB/Interpreters/InterpreterAlterQuery.h index e31d6d0ebd0..0b0416f0941 100644 --- a/dbms/include/DB/Interpreters/InterpreterAlterQuery.h +++ b/dbms/include/DB/Interpreters/InterpreterAlterQuery.h @@ -28,6 +28,7 @@ private: enum Type { DROP_PARTITION, + 
ATTACH_PARTITION, }; Type type; @@ -35,10 +36,18 @@ private: Field partition; bool detach; /// true для DETACH PARTITION. + bool unreplicated; + bool part; + static PartitionCommand dropPartition(const Field & partition, bool detach) { return {DROP_PARTITION, partition, detach}; } + + static PartitionCommand attachPartition(const Field & partition, bool unreplicated, bool part) + { + return {ATTACH_PARTITION, partition, false, part, unreplicated}; + } }; typedef std::vector PartitionCommands; diff --git a/dbms/include/DB/Parsers/ASTAlterQuery.h b/dbms/include/DB/Parsers/ASTAlterQuery.h index e1b398d76d2..897db72a0b9 100644 --- a/dbms/include/DB/Parsers/ASTAlterQuery.h +++ b/dbms/include/DB/Parsers/ASTAlterQuery.h @@ -23,6 +23,7 @@ public: DROP_COLUMN, MODIFY_COLUMN, DROP_PARTITION, + ATTACH_PARTITION, NO_TYPE }; @@ -47,14 +48,16 @@ public: ASTPtr partition; bool detach = false; /// true для DETACH PARTITION. + bool part = false; /// true для ATTACH [UNREPLICATED] PART + bool unreplicated = false; /// true для ATTACH UNREPLICATED ... + /// deep copy void clone(Parameters & p) const { - p.type = type; + p = *this; p.name_type = name_type->clone(); p.column = column->clone(); p.partition = partition->clone(); - p.detach = detach; } }; typedef std::vector ParameterContainer; diff --git a/dbms/include/DB/Storages/IStorage.h b/dbms/include/DB/Storages/IStorage.h index 8b6e7155a9f..70ed1b158b4 100644 --- a/dbms/include/DB/Storages/IStorage.h +++ b/dbms/include/DB/Storages/IStorage.h @@ -205,13 +205,20 @@ public: throw Exception("Method alter is not supported by storage " + getName(), ErrorCodes::NOT_IMPLEMENTED); } - /** Выполнить запрос DROP PARTITION. + /** Выполнить запрос (DROP|DETACH) PARTITION. */ virtual void dropPartition(const Field & partition, bool detach) { throw Exception("Method dropPartition is not supported by storage " + getName(), ErrorCodes::NOT_IMPLEMENTED); } + /** Выполнить запрос ATTACH [UNREPLICATED] (PART|PARTITION). 
+ */ + virtual void attachPartition(const Field & partition, bool unreplicated, bool part) + { + throw Exception("Method attachPartition is not supported by storage " + getName(), ErrorCodes::NOT_IMPLEMENTED); + } + /** Выполнить какую-либо фоновую работу. Например, объединение кусков в таблице типа MergeTree. * Возвращает - была ли выполнена какая-либо работа. */ diff --git a/dbms/include/DB/Storages/StorageReplicatedMergeTree.h b/dbms/include/DB/Storages/StorageReplicatedMergeTree.h index 73c00a3e1ba..38bc341e68c 100644 --- a/dbms/include/DB/Storages/StorageReplicatedMergeTree.h +++ b/dbms/include/DB/Storages/StorageReplicatedMergeTree.h @@ -79,6 +79,7 @@ public: void alter(const AlterCommands & params, const String & database_name, const String & table_name, Context & context) override; void dropPartition(const Field & partition, bool detach) override; + void attachPartition(const Field & partition, bool unreplicated, bool part) override; /** Удаляет реплику из ZooKeeper. Если других реплик нет, удаляет всю таблицу из ZooKeeper. 
*/ diff --git a/dbms/src/Interpreters/InterpreterAlterQuery.cpp b/dbms/src/Interpreters/InterpreterAlterQuery.cpp index e9c54622841..4135b01bb6e 100644 --- a/dbms/src/Interpreters/InterpreterAlterQuery.cpp +++ b/dbms/src/Interpreters/InterpreterAlterQuery.cpp @@ -43,6 +43,8 @@ void InterpreterAlterQuery::execute() { if (command.type == PartitionCommand::DROP_PARTITION) table->dropPartition(command.partition, command.detach); + else if (command.type == PartitionCommand::ATTACH_PARTITION) + table->attachPartition(command.partition, command.unreplicated, command.part); else throw Exception("Bad PartitionCommand::Type: " + toString(command.type), ErrorCodes::ARGUMENT_OUT_OF_BOUND); } @@ -101,6 +103,11 @@ void InterpreterAlterQuery::parseAlter( const Field & partition = dynamic_cast(*params.partition).value; out_partition_commands.push_back(PartitionCommand::dropPartition(partition, params.detach)); } + else if (params.type == ASTAlterQuery::ATTACH_PARTITION) + { + const Field & partition = dynamic_cast(*params.partition).value; + out_partition_commands.push_back(PartitionCommand::attachPartition(partition, params.unreplicated, params.part)); + } else throw Exception("Wrong parameter type in ALTER query", ErrorCodes::LOGICAL_ERROR); } diff --git a/dbms/src/Parsers/ParserAlterQuery.cpp b/dbms/src/Parsers/ParserAlterQuery.cpp index 00a5229ddde..b07554add15 100644 --- a/dbms/src/Parsers/ParserAlterQuery.cpp +++ b/dbms/src/Parsers/ParserAlterQuery.cpp @@ -24,6 +24,9 @@ bool ParserAlterQuery::parseImpl(Pos & pos, Pos end, ASTPtr & node, Expected & e ParserString s_drop("DROP", true, true); ParserString s_detach("DETACH", true, true); + ParserString s_attach("ATTACH", true, true); + ParserString s_unreplicated("UNREPLICATED", true, true); + ParserString s_part("PART", true, true); ParserString s_partition("PARTITION", true, true); ParserString s_comma(","); @@ -136,6 +139,28 @@ bool ParserAlterQuery::parseImpl(Pos & pos, Pos end, ASTPtr & node, Expected & e params.type = 
ASTAlterQuery::DROP_PARTITION; params.detach = true; } + else if (s_attach.ignore(pos, end, expected)) + { + ws.ignore(pos, end); + + if (s_unreplicated.ignore(pos, end, expected)) + { + params.unreplicated = true; + ws.ignore(pos, end); + } + + if (s_part.ignore(pos, end, expected)) + params.part = true; + else if (!s_partition.ignore(pos, end, expected)) + return false; + + ws.ignore(pos, end); + + if (!parser_literal.parse(pos, end, params.partition, expected)) + return false; + + params.type = ASTAlterQuery::ATTACH_PARTITION; + } else if (s_modify.ignore(pos, end, expected)) { ws.ignore(pos, end); diff --git a/dbms/src/Parsers/formatAST.cpp b/dbms/src/Parsers/formatAST.cpp index 3aa488e8070..1d0c08351e3 100644 --- a/dbms/src/Parsers/formatAST.cpp +++ b/dbms/src/Parsers/formatAST.cpp @@ -745,7 +745,14 @@ void formatAST(const ASTAlterQuery & ast, std::ostream & s, size_t indent, bo } else if (p.type == ASTAlterQuery::DROP_PARTITION) { - s << (hilite ? hilite_keyword : "") << indent_str << "DROP PARTITION " << (hilite ? hilite_none : ""); + s << (hilite ? hilite_keyword : "") << indent_str << (p.detach ? "DETACH" : "DROP") << " PARTITION " + << (hilite ? hilite_none : ""); + formatAST(*p.partition, s, indent, hilite, true); + } + else if (p.type == ASTAlterQuery::ATTACH_PARTITION) + { + s << (hilite ? hilite_keyword : "") << indent_str << "ATTACH " << (p.unreplicated ? "UNREPLICATED" : "") + << (p.part ? " PART " : " PARTITION ") << (hilite ? 
hilite_none : ""); formatAST(*p.partition, s, indent, hilite, true); } else diff --git a/dbms/src/Storages/MergeTree/MergeTreeData.cpp b/dbms/src/Storages/MergeTree/MergeTreeData.cpp index 3b06ecdd3d8..c7eced0b86c 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeData.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeData.cpp @@ -707,8 +707,6 @@ MergeTreeData::DataPartsVector MergeTreeData::renameTempPartAndReplace( void MergeTreeData::replaceParts(const DataPartsVector & remove, const DataPartsVector & add, bool clear_without_timeout) { - LOG_TRACE(log, "Removing " << remove.size() << " parts and adding " << add.size() << " parts."); - Poco::ScopedLock lock(data_parts_mutex); for (const DataPartPtr & part : remove) diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index 885c8f57dfd..49bd9f7ba44 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -991,11 +991,8 @@ bool StorageReplicatedMergeTree::executeDropRange(const StorageReplicatedMergeTr "Waiting for " << to_wait.size() << " entries that are currently executing."); /// Дождемся завершения операций с кусками, содержащимися в удаляемом диапазоне. - { - std::unique_lock lock(queue_mutex); - for (LogEntryPtr & entry : to_wait) - entry->execution_complete.wait(lock, [&entry] { return !entry->currently_executing; }); - } + for (LogEntryPtr & entry : to_wait) + entry->execution_complete.wait(lock, [&entry] { return !entry->currently_executing; }); } LOG_DEBUG(log, (entry.detach ? "Detaching" : "Removing") << " parts."); @@ -1020,15 +1017,17 @@ bool StorageReplicatedMergeTree::executeDropRange(const StorageReplicatedMergeTr ops.push_back(new zkutil::Op::Remove(replica_path + "/parts/" + part->name, -1)); zookeeper->multi(ops); - /// Если кусок нужно удалить не нужно, надежнее удалить директорию после изменений в ZooKeeper. 
+ /// Если кусок нужно удалить, надежнее удалить директорию после изменений в ZooKeeper. if (!entry.detach) data.replaceParts({part}, {}, false); } - LOG_INFO(log, (entry.detach ? "Detached" : "Removed") << removed_parts << " parts inside " << entry.new_part_name << "."); + LOG_INFO(log, (entry.detach ? "Detached " : "Removed ") << removed_parts << " parts inside " << entry.new_part_name << "."); if (unreplicated_data) { + Poco::ScopedLock unreplicated_lock(unreplicated_mutex); + removed_parts = 0; parts = unreplicated_data->getDataParts(); for (const auto & part : parts) @@ -2179,18 +2178,24 @@ void StorageReplicatedMergeTree::dropPartition(const Field & field, bool detach) virtual_parts.add(fake_part_name); } - /// Наконец, добившись нужны инвариантов, можно положить запись в лог. + /// Наконец, добившись нужных инвариантов, можно положить запись в лог. LogEntry entry; entry.type = LogEntry::DROP_RANGE; entry.source_replica = replica_name; entry.new_part_name = fake_part_name; entry.detach = detach; String log_znode_path = zookeeper->create(zookeeper_path + "/log/log-", entry.toString(), zkutil::CreateMode::PersistentSequential); + entry.znode_name = log_znode_path.substr(log_znode_path.find_last_of('/') + 1); /// Дождемся, пока все реплики выполнят дроп. 
waitForAllReplicasToProcessLogEntry(log_znode_path, entry); } +void StorageReplicatedMergeTree::attachPartition(const Field& partition, bool unreplicated, bool part) +{ + throw Exception("Not implemented", ErrorCodes::NOT_IMPLEMENTED); +} + void StorageReplicatedMergeTree::drop() { if (!zookeeper) @@ -2254,12 +2259,16 @@ AbandonableLockInZooKeeper StorageReplicatedMergeTree::allocateBlockNumber(const void StorageReplicatedMergeTree::waitForAllReplicasToProcessLogEntry(const String & log_znode_path, const LogEntry & entry) { + LOG_DEBUG(log, "Waiting for all replicas to process " << entry.znode_name); + UInt64 log_index = parse(log_znode_path.substr(log_znode_path.size() - 10)); String log_entry_str = entry.toString(); Strings replicas = zookeeper->getChildren(zookeeper_path + "/replicas"); for (const String & replica : replicas) { + LOG_DEBUG(log, "Waiting for " << replica << " to pull " << entry.znode_name << " to queue"); + /// Дождемся, пока запись попадет в очередь реплики. while (true) { @@ -2272,6 +2281,8 @@ void StorageReplicatedMergeTree::waitForAllReplicasToProcessLogEntry(const Strin event->wait(); } + LOG_DEBUG(log, "Looking for " << entry.znode_name << " in " << replica << " queue"); + /// Найдем запись в очереди реплики. Strings queue_entries = zookeeper->getChildren(zookeeper_path + "/replicas/" + replica + "/queue"); String entry_to_wait_for; @@ -2291,6 +2302,8 @@ void StorageReplicatedMergeTree::waitForAllReplicasToProcessLogEntry(const Strin if (entry_to_wait_for.empty()) continue; + LOG_DEBUG(log, "Waiting for " << entry_to_wait_for << " to disappear from " << replica << " queue"); + /// Дождемся, пока запись исчезнет из очереди реплики. 
while (true) { @@ -2305,6 +2318,8 @@ void StorageReplicatedMergeTree::waitForAllReplicasToProcessLogEntry(const Strin event->wait(); } } + + LOG_DEBUG(log, "Finished waiting for all replicas to process " << entry.znode_name); } From a33ab0b36ce0ad07f1d406ced77c05aef10dd8c8 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Thu, 7 Aug 2014 16:25:48 +0400 Subject: [PATCH 040/127] Merge --- dbms/src/Storages/StorageReplicatedMergeTree.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index 49bd9f7ba44..3e24fbe758c 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -2290,8 +2290,8 @@ void StorageReplicatedMergeTree::waitForAllReplicasToProcessLogEntry(const Strin for (const String & entry_name : queue_entries) { String queue_entry_str; - auto code = zookeeper->tryGet(zookeeper_path + "/replicas/" + replica + "/queue/" + entry_name, queue_entry_str); - if (code == ZOK && queue_entry_str == log_entry_str) + bool exists = zookeeper->tryGet(zookeeper_path + "/replicas/" + replica + "/queue/" + entry_name, queue_entry_str); + if (exists && queue_entry_str == log_entry_str) { entry_to_wait_for = entry_name; break; @@ -2311,8 +2311,7 @@ void StorageReplicatedMergeTree::waitForAllReplicasToProcessLogEntry(const Strin String unused; /// get вместо exists, чтобы не утек watch, если ноды уже нет. 
- auto code = zookeeper->tryGet(zookeeper_path + "/replicas/" + replica + "/queue/" + entry_to_wait_for, unused, nullptr, event); - if (code == ZNONODE) + if (!zookeeper->tryGet(zookeeper_path + "/replicas/" + replica + "/queue/" + entry_to_wait_for, unused, nullptr, event)) break; event->wait(); From 857f7db99d2202ce7f228088e3dba313358033d2 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 7 Aug 2014 23:20:30 +0400 Subject: [PATCH 041/127] Merge --- dbms/include/DB/Storages/MergeTree/MergeTreeData.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/include/DB/Storages/MergeTree/MergeTreeData.h b/dbms/include/DB/Storages/MergeTree/MergeTreeData.h index b211faab12e..ad5521b0e1a 100644 --- a/dbms/include/DB/Storages/MergeTree/MergeTreeData.h +++ b/dbms/include/DB/Storages/MergeTree/MergeTreeData.h @@ -110,7 +110,7 @@ struct MergeTreeSettings double insert_delay_step = 1.1; /// Для скольки последних блоков хранить хеши в ZooKeeper. - size_t replicated_deduplication_window = 1000; + size_t replicated_deduplication_window = 100; /// Хранить примерно столько последних записей в логе в ZooKeeper, даже если они никому уже не нужны. /// Не влияет на работу таблиц; используется только чтобы успеть посмотреть на лог в ZooKeeper глазами прежде, чем его очистят. 
From beca178f2fb15bcc5630f0948f375cba7fb347fc Mon Sep 17 00:00:00 2001 From: Andrey Mironov Date: Mon, 4 Aug 2014 19:25:38 +0400 Subject: [PATCH 042/127] dbms: support WITH TOTALS without GROUP BY --- .../DB/Interpreters/ExpressionAnalyzer.h | 4 +-- .../DB/Parsers/ExpressionElementParsers.h | 2 +- dbms/src/Interpreters/ExpressionAnalyzer.cpp | 31 +++++++++++++++++-- .../Interpreters/InterpreterSelectQuery.cpp | 4 +++ dbms/src/Parsers/ExpressionElementParsers.cpp | 2 +- dbms/src/Parsers/ParserSelectQuery.cpp | 18 +++++------ dbms/src/Parsers/formatAST.cpp | 6 ++-- dbms/src/Server/TCPHandler.cpp | 4 ++- 8 files changed, 51 insertions(+), 20 deletions(-) diff --git a/dbms/include/DB/Interpreters/ExpressionAnalyzer.h b/dbms/include/DB/Interpreters/ExpressionAnalyzer.h index 4d058e69b8d..0fec5ffdea2 100644 --- a/dbms/include/DB/Interpreters/ExpressionAnalyzer.h +++ b/dbms/include/DB/Interpreters/ExpressionAnalyzer.h @@ -220,8 +220,8 @@ private: void normalizeTree(); void normalizeTreeImpl(ASTPtr & ast, MapOfASTs & finished_asts, SetOfASTs & current_asts, std::string current_alias); - /// Eliminates injective function calls from group by statement - void eliminateInjectives(); + /// Eliminates injective function calls and constant expressions from group by statement + void optimizeGroupBy(); /// Превратить перечисление значений или подзапрос в ASTSet. node - функция in или notIn. 
void makeSet(ASTFunction * node, const Block & sample_block); diff --git a/dbms/include/DB/Parsers/ExpressionElementParsers.h b/dbms/include/DB/Parsers/ExpressionElementParsers.h index 65b7ec53b0e..ccbd68bbd75 100644 --- a/dbms/include/DB/Parsers/ExpressionElementParsers.h +++ b/dbms/include/DB/Parsers/ExpressionElementParsers.h @@ -22,7 +22,7 @@ protected: class ParserParenthesisExpression : public IParserBase { protected: - const char * getName() const { return "expression in parenthesis"; } + const char * getName() const { return "parenthesized expression"; } bool parseImpl(Pos & pos, Pos end, ASTPtr & node, Expected & expected); }; diff --git a/dbms/src/Interpreters/ExpressionAnalyzer.cpp b/dbms/src/Interpreters/ExpressionAnalyzer.cpp index 364d7b7e884..9aad597d3b0 100644 --- a/dbms/src/Interpreters/ExpressionAnalyzer.cpp +++ b/dbms/src/Interpreters/ExpressionAnalyzer.cpp @@ -69,7 +69,7 @@ void ExpressionAnalyzer::init() normalizeTree(); /// GROUP BY injective function elimination - eliminateInjectives(); + optimizeGroupBy(); /// array_join_alias_to_name, array_join_result_to_source. 
getArrayJoinedColumns(); @@ -122,7 +122,7 @@ void ExpressionAnalyzer::analyzeAggregation() if (select_query->group_expression_list) { NameSet unique_keys; - const ASTs & group_asts = select_query->group_expression_list->children; + auto & group_asts = select_query->group_expression_list->children; for (size_t i = 0; i < group_asts.size(); ++i) { getRootActions(group_asts[i], true, false, temp_actions); @@ -135,6 +135,17 @@ void ExpressionAnalyzer::analyzeAggregation() const auto & col = block.getByName(column_name); + /// constant expressions have non-null column pointer at this stage + if (const auto is_constexpr = col.column) + { + if (i < group_asts.size() - 1) + group_asts[i] = std::move(group_asts.back()); + + group_asts.pop_back(); + i -= 1; + continue; + } + NameAndTypePair key{column_name, col.type}; aggregation_keys.push_back(key); @@ -145,6 +156,12 @@ void ExpressionAnalyzer::analyzeAggregation() aggregated_columns.push_back(std::move(key)); } } + + if (group_asts.empty()) + { + select_query->group_expression_list = nullptr; + has_aggregation = select_query->having_expression || aggregate_descriptions.size(); + } } for (size_t i = 0; i < aggregate_descriptions.size(); ++i) @@ -426,7 +443,7 @@ void ExpressionAnalyzer::normalizeTreeImpl(ASTPtr & ast, MapOfASTs & finished_as } -void ExpressionAnalyzer::eliminateInjectives() +void ExpressionAnalyzer::optimizeGroupBy() { if (!(select_query && select_query->group_expression_list)) return; @@ -469,7 +486,15 @@ void ExpressionAnalyzer::eliminateInjectives() std::back_inserter(group_exprs), is_literal ); } + else if (is_literal(group_exprs[i])) + { + remove_expr_at_index(i); + i -= 1; + } } + + if (group_exprs.empty()) + select_query->group_expression_list = nullptr; } diff --git a/dbms/src/Interpreters/InterpreterSelectQuery.cpp b/dbms/src/Interpreters/InterpreterSelectQuery.cpp index 359b2fa050e..c63ca92f7ae 100644 --- a/dbms/src/Interpreters/InterpreterSelectQuery.cpp +++ 
b/dbms/src/Interpreters/InterpreterSelectQuery.cpp @@ -346,6 +346,10 @@ BlockInputStreamPtr InterpreterSelectQuery::execute() need_second_distinct_pass = streams.size() > 1; } + else if (query.group_by_with_totals && !aggregate_final) + { + executeTotalsAndHaving(streams, false, nullptr, aggregate_overflow_row); + } if (has_order_by) executeOrder(streams); diff --git a/dbms/src/Parsers/ExpressionElementParsers.cpp b/dbms/src/Parsers/ExpressionElementParsers.cpp index 8c5932c773a..9ac5019d12e 100644 --- a/dbms/src/Parsers/ExpressionElementParsers.cpp +++ b/dbms/src/Parsers/ExpressionElementParsers.cpp @@ -77,7 +77,7 @@ bool ParserParenthesisExpression::parseImpl(Pos & pos, Pos end, ASTPtr & node, E /// пустое выражение в скобках недопустимо if (expr_list.children.empty()) { - expected = "not empty list of expressions in parenthesis"; + expected = "non-empty parenthesized list of expressions"; return false; } diff --git a/dbms/src/Parsers/ParserSelectQuery.cpp b/dbms/src/Parsers/ParserSelectQuery.cpp index 9c2bf4ea4a8..a997628372f 100644 --- a/dbms/src/Parsers/ParserSelectQuery.cpp +++ b/dbms/src/Parsers/ParserSelectQuery.cpp @@ -198,18 +198,18 @@ bool ParserSelectQuery::parseImpl(Pos & pos, Pos end, ASTPtr & node, Expected & return false; ws.ignore(pos, end); + } - /// WITH TOTALS - if (s_with.ignore(pos, end, expected)) - { - ws.ignore(pos, end); - if (!s_totals.ignore(pos, end, expected)) - return false; + /// WITH TOTALS + if (s_with.ignore(pos, end, expected)) + { + ws.ignore(pos, end); + if (!s_totals.ignore(pos, end, expected)) + return false; - select_query->group_by_with_totals = true; + select_query->group_by_with_totals = true; - ws.ignore(pos, end); - } + ws.ignore(pos, end); } /// HAVING expr diff --git a/dbms/src/Parsers/formatAST.cpp b/dbms/src/Parsers/formatAST.cpp index 1d0c08351e3..050bff00caf 100644 --- a/dbms/src/Parsers/formatAST.cpp +++ b/dbms/src/Parsers/formatAST.cpp @@ -195,11 +195,11 @@ void formatAST(const ASTSelectQuery & ast, std::ostream 
& s, size_t indent, bo one_line ? formatAST(*ast.group_expression_list, s, indent, hilite, one_line) : formatExpressionListMultiline(typeid_cast(*ast.group_expression_list), s, indent, hilite); - - if (ast.group_by_with_totals) - s << (hilite ? hilite_keyword : "") << nl_or_ws << indent_str << (one_line ? "" : " ") << "WITH TOTALS" << (hilite ? hilite_none : ""); } + if (ast.group_by_with_totals) + s << (hilite ? hilite_keyword : "") << nl_or_ws << indent_str << (one_line ? "" : " ") << "WITH TOTALS" << (hilite ? hilite_none : ""); + if (ast.having_expression) { s << (hilite ? hilite_keyword : "") << nl_or_ws << indent_str << "HAVING " << (hilite ? hilite_none : ""); diff --git a/dbms/src/Server/TCPHandler.cpp b/dbms/src/Server/TCPHandler.cpp index c111853da8e..596a69d8e38 100644 --- a/dbms/src/Server/TCPHandler.cpp +++ b/dbms/src/Server/TCPHandler.cpp @@ -85,7 +85,9 @@ void TCPHandler::runImpl() sendHello(); - connection_context.setProgressCallback(boost::bind(&TCPHandler::updateProgress, this, _1, _2)); + connection_context.setProgressCallback([this] (const size_t rows, const size_t bytes) { + return this->updateProgress(rows, bytes); + }); while (1) { From e8fe836a1ba29083396556e0e3306b7a431011df Mon Sep 17 00:00:00 2001 From: Andrey Mironov Date: Fri, 8 Aug 2014 10:08:17 +0400 Subject: [PATCH 043/127] dbms: more sound solution for backtracking for-loop. 
[#METR-12108] --- dbms/src/Interpreters/ExpressionAnalyzer.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/dbms/src/Interpreters/ExpressionAnalyzer.cpp b/dbms/src/Interpreters/ExpressionAnalyzer.cpp index 9aad597d3b0..1190e1eb175 100644 --- a/dbms/src/Interpreters/ExpressionAnalyzer.cpp +++ b/dbms/src/Interpreters/ExpressionAnalyzer.cpp @@ -463,13 +463,16 @@ void ExpressionAnalyzer::optimizeGroupBy() }; /// iterate over each GROUP BY expression, eliminate injective function calls and literals - for (size_t i = 0; i < group_exprs.size(); ++i) + for (size_t i = 0; i < group_exprs.size();) { if (const auto function = typeid_cast(group_exprs[i].get())) { /// assert function is injective if (!injectiveFunctionNames.count(function->name)) + { + ++i; continue; + } /// copy shared pointer to args in order to ensure lifetime auto args_ast = function->arguments; @@ -478,7 +481,6 @@ void ExpressionAnalyzer::optimizeGroupBy() * next iteration does not skip not yet processed data */ remove_expr_at_index(i); - i -= 1; /// copy non-literal arguments std::remove_copy_if( @@ -489,7 +491,11 @@ void ExpressionAnalyzer::optimizeGroupBy() else if (is_literal(group_exprs[i])) { remove_expr_at_index(i); - i -= 1; + } + else + { + /// if neither a function nor literal - advance to next expression + ++i; } } From 37298bbe01fcda0135e2cb71e3a7e18cd8c855b1 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Fri, 8 Aug 2014 13:04:06 +0400 Subject: [PATCH 044/127] dbms: a bit more efficient virtual columns for some queries. 
[#METR-2807] --- dbms/src/Common/VirtualColumnUtils.cpp | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/dbms/src/Common/VirtualColumnUtils.cpp b/dbms/src/Common/VirtualColumnUtils.cpp index 1f09475a46e..1d5bf3998ee 100644 --- a/dbms/src/Common/VirtualColumnUtils.cpp +++ b/dbms/src/Common/VirtualColumnUtils.cpp @@ -108,18 +108,15 @@ static bool isValidFunction(ASTPtr expression, const NameSet & columns) /// Извлечь все подфункции главной конъюнкции, но зависящие только от заданных столбцов static void extractFunctions(ASTPtr expression, const NameSet & columns, std::vector & result) { - if (const ASTFunction * function = typeid_cast(&* expression)) + const ASTFunction * function = typeid_cast(&* expression); + if (function && function->name == "and") { - if (function->name == "and") - { - for (size_t i = 0; i < function->arguments->children.size(); ++i) - extractFunctions(function->arguments->children[i], columns, result); - } - else - { - if (isValidFunction(expression, columns)) - result.push_back(expression->clone()); - } + for (size_t i = 0; i < function->arguments->children.size(); ++i) + extractFunctions(function->arguments->children[i], columns, result); + } + else if (isValidFunction(expression, columns)) + { + result.push_back(expression->clone()); } } From 336f9f5beb955773dbbf85dcb42dc2ab09a3f5c0 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Fri, 8 Aug 2014 13:04:26 +0400 Subject: [PATCH 045/127] zkutil_test: fixed set query. 
[#METR-2807] --- libs/libzkutil/src/tests/zkutil_test.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libs/libzkutil/src/tests/zkutil_test.cpp b/libs/libzkutil/src/tests/zkutil_test.cpp index fc28c1e084b..0693ff93c94 100644 --- a/libs/libzkutil/src/tests/zkutil_test.cpp +++ b/libs/libzkutil/src/tests/zkutil_test.cpp @@ -180,6 +180,8 @@ int main(int argc, char ** argv) DB::assertString("set", in); DB::skipWhitespaceIfAny(in); + DB::assertString(path, in); + DB::skipWhitespaceIfAny(in); readMaybeQuoted(data, in); DB::skipWhitespaceIfAny(in); From 4025face46ffbbd7024860ace60cac620e669751 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Fri, 8 Aug 2014 13:58:56 +0400 Subject: [PATCH 046/127] Merge --- dbms/src/Storages/StorageReplicatedMergeTree.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index 3e24fbe758c..64b80e8fb23 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -1044,6 +1044,9 @@ bool StorageReplicatedMergeTree::executeDropRange(const StorageReplicatedMergeTr } } + /// На месте удаленных кусков могут появиться новые, с другими данными. 
+ context.resetCaches(); + return true; } From 83812f5ed5760b236b4185536884db749a39dd21 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Fri, 8 Aug 2014 14:02:54 +0400 Subject: [PATCH 047/127] Merge --- dbms/src/Storages/StorageReplicatedMergeTree.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index 64b80e8fb23..a6f359b7cc1 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -174,7 +174,6 @@ void StorageReplicatedMergeTree::createTable() zookeeper->create(zookeeper_path + "/nonincrement_block_numbers", "", zkutil::CreateMode::Persistent); zookeeper->create(zookeeper_path + "/leader_election", "", zkutil::CreateMode::Persistent); zookeeper->create(zookeeper_path + "/temp", "", zkutil::CreateMode::Persistent); - zookeeper->create(zookeeper_path + "/flags", "", zkutil::CreateMode::Persistent); /// Создадим replicas в последнюю очередь, чтобы нельзя было добавить реплику, пока все остальные ноды не созданы. zookeeper->create(zookeeper_path + "/replicas", "", zkutil::CreateMode::Persistent); } @@ -242,6 +241,7 @@ void StorageReplicatedMergeTree::createReplica() zookeeper->create(replica_path + "/log_pointer", "", zkutil::CreateMode::Persistent); zookeeper->create(replica_path + "/queue", "", zkutil::CreateMode::Persistent); zookeeper->create(replica_path + "/parts", "", zkutil::CreateMode::Persistent); + zookeeper->create(replica_path + "/flags", "", zkutil::CreateMode::Persistent); /** Нужно изменить данные ноды /replicas на что угодно, чтобы поток, удаляющий старые записи в логе, * споткнулся об это изменение и не удалил записи, которые мы еще не прочитали. 
From 71b90ea1d46fae5097bec889a29c58b76f5f344c Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Fri, 8 Aug 2014 12:28:13 +0400 Subject: [PATCH 048/127] Merge --- dbms/include/DB/Core/ErrorCodes.h | 1 + .../DB/Interpreters/InterpreterAlterQuery.h | 2 +- .../DB/Storages/MergeTree/ActiveDataPartSet.h | 6 +- .../DB/Storages/MergeTree/MergeTreeData.h | 45 +++- .../Storages/MergeTree/MergeTreePartChecker.h | 19 +- .../DB/Storages/StorageReplicatedMergeTree.h | 24 +- .../Storages/MergeTree/ActiveDataPartSet.cpp | 16 +- dbms/src/Storages/MergeTree/MergeTreeData.cpp | 66 ++++- .../MergeTree/MergeTreePartChecker.cpp | 42 ++-- .../ReplicatedMergeTreePartsExchange.cpp | 4 +- .../Storages/StorageReplicatedMergeTree.cpp | 229 +++++++++++++++--- dbms/src/Storages/tests/part_checker.cpp | 10 +- 12 files changed, 372 insertions(+), 92 deletions(-) diff --git a/dbms/include/DB/Core/ErrorCodes.h b/dbms/include/DB/Core/ErrorCodes.h index 73b54c31d5b..4d01ba598a6 100644 --- a/dbms/include/DB/Core/ErrorCodes.h +++ b/dbms/include/DB/Core/ErrorCodes.h @@ -257,6 +257,7 @@ namespace ErrorCodes INCORRECT_MARK, INVALID_PARTITION_NAME, NOT_LEADER, + NOT_ENOUGH_BLOCK_NUMBERS, POCO_EXCEPTION = 1000, STD_EXCEPTION, diff --git a/dbms/include/DB/Interpreters/InterpreterAlterQuery.h b/dbms/include/DB/Interpreters/InterpreterAlterQuery.h index 0b0416f0941..e3f44a0efd8 100644 --- a/dbms/include/DB/Interpreters/InterpreterAlterQuery.h +++ b/dbms/include/DB/Interpreters/InterpreterAlterQuery.h @@ -46,7 +46,7 @@ private: static PartitionCommand attachPartition(const Field & partition, bool unreplicated, bool part) { - return {ATTACH_PARTITION, partition, false, part, unreplicated}; + return {ATTACH_PARTITION, partition, false, unreplicated, part}; } }; diff --git a/dbms/include/DB/Storages/MergeTree/ActiveDataPartSet.h b/dbms/include/DB/Storages/MergeTree/ActiveDataPartSet.h index 9543aa991e9..807087f7da9 100644 --- a/dbms/include/DB/Storages/MergeTree/ActiveDataPartSet.h +++ 
b/dbms/include/DB/Storages/MergeTree/ActiveDataPartSet.h @@ -63,12 +63,14 @@ public: void add(const String & name); String getContainingPart(const String & name) const; - Strings getParts() const; + Strings getParts() const; /// В порядке возрастания месяца и номера блока. + + size_t size() const; static String getPartName(DayNum_t left_date, DayNum_t right_date, UInt64 left_id, UInt64 right_id, UInt64 level); /// Возвращает true если имя директории совпадает с форматом имени директории кусочков - static bool isPartDirectory(const String & dir_name, Poco::RegularExpression::MatchVec & matches); + static bool isPartDirectory(const String & dir_name, Poco::RegularExpression::MatchVec * out_matches = nullptr); /// Кладет в DataPart данные из имени кусочка. static void parsePartName(const String & file_name, Part & part, const Poco::RegularExpression::MatchVec * matches = nullptr); diff --git a/dbms/include/DB/Storages/MergeTree/MergeTreeData.h b/dbms/include/DB/Storages/MergeTree/MergeTreeData.h index ad5521b0e1a..208e51f3939 100644 --- a/dbms/include/DB/Storages/MergeTree/MergeTreeData.h +++ b/dbms/include/DB/Storages/MergeTree/MergeTreeData.h @@ -307,17 +307,22 @@ public: Poco::File(to).remove(true); } - /// Переименовывает кусок, дописав к имени префикс. - void renameAddPrefix(const String & prefix) const + void renameTo(const String & new_name) const { String from = storage.full_path + name + "/"; - String to = storage.full_path + prefix + name + "/"; + String to = storage.full_path + new_name + "/"; Poco::File f(from); f.setLastModified(Poco::Timestamp::fromEpochTime(time(0))); f.renameTo(to); } + /// Переименовывает кусок, дописав к имени префикс. + void renameAddPrefix(const String & prefix) const + { + renameTo(prefix + name); + } + /// Загрузить индекс и вычислить размер. Если size=0, вычислить его тоже. void loadIndex() { @@ -344,12 +349,12 @@ public: } /// Прочитать контрольные суммы, если есть. 
- void loadChecksums() + void loadChecksums(bool require) { String path = storage.full_path + name + "/checksums.txt"; if (!Poco::File(path).exists()) { - if (storage.require_part_metadata) + if (require) throw Exception("No checksums.txt in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART); return; @@ -359,16 +364,21 @@ public: assertEOF(file); } - void loadColumns() + void loadColumns(bool require) { String path = storage.full_path + name + "/columns.txt"; if (!Poco::File(path).exists()) { - if (storage.require_part_metadata) + if (require) throw Exception("No columns.txt in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART); - columns = *storage.columns; /// Если нет файла со списком столбцов, запишем его. + for (const NameAndTypePair & column : *storage.columns) + { + if (Poco::File(storage.full_path + name + "/" + escapeForFileName(column.name) + ".bin").exists()) + columns.push_back(column); + } + { WriteBufferFromFile out(path + ".tmp", 4096); columns.writeText(out); @@ -382,7 +392,7 @@ public: columns.readText(file, storage.context.getDataTypeFactory()); } - void checkNotBroken() + void checkNotBroken(bool require_part_metadata) { String path = storage.full_path + name; @@ -391,7 +401,7 @@ public: if (!checksums.files.count("primary.idx")) throw Exception("No checksum for primary.idx", ErrorCodes::NO_FILE_IN_DATA_PART); - if (storage.require_part_metadata) + if (require_part_metadata) { for (const NameAndTypePair & it : columns) { @@ -625,15 +635,23 @@ public: */ DataPartsVector renameTempPartAndReplace(MutableDataPartPtr part, Increment * increment = nullptr, Transaction * out_transaction = nullptr); - /** Убирает из рабочего набора куски remove и добавляет куски add. + /** Убирает из рабочего набора куски remove и добавляет куски add. add должны уже быть в all_data_parts. * Если clear_without_timeout, данные будут удалены при следующем clearOldParts, игнорируя old_parts_lifetime. 
*/ void replaceParts(const DataPartsVector & remove, const DataPartsVector & add, bool clear_without_timeout); + /** Добавляет новый кусок в список известных кусков и в рабочий набор. + */ + void attachPart(DataPartPtr part); + /** Переименовывает кусок в detached/prefix_кусок и забывает про него. Данные не будут удалены в clearOldParts. * Если restore_covered, добавляет в рабочий набор неактивные куски, слиянием которых получен удаляемый кусок. */ - void renameAndDetachPart(DataPartPtr part, const String & prefix = "", bool restore_covered = false); + void renameAndDetachPart(DataPartPtr part, const String & prefix = "", bool restore_covered = false, bool move_to_detached = true); + + /** Убирает кусок из списка кусков (включая all_data_parts), но не перемещщает директорию. + */ + void detachPartInPlace(DataPartPtr part); /** Возвращает старые неактуальные куски, которые можно удалить. Одновременно удаляет их из списка кусков, но не с диска. */ @@ -685,6 +703,9 @@ public: ExpressionActionsPtr getPrimaryExpression() const { return primary_expr; } SortDescription getSortDescription() const { return sort_descr; } + /// Проверить, что кусок не сломан и посчитать для него чексуммы, если их нет. + MutableDataPartPtr loadPartAndFixMetadata(const String & relative_path); + const Context & context; const String date_column_name; const ASTPtr sampling_expression; diff --git a/dbms/include/DB/Storages/MergeTree/MergeTreePartChecker.h b/dbms/include/DB/Storages/MergeTree/MergeTreePartChecker.h index 4490cd9ebdb..417ddd6d7f8 100644 --- a/dbms/include/DB/Storages/MergeTree/MergeTreePartChecker.h +++ b/dbms/include/DB/Storages/MergeTree/MergeTreePartChecker.h @@ -9,16 +9,27 @@ namespace DB class MergeTreePartChecker { public: + struct Settings + { + bool verbose = false; /// Пишет в stderr прогресс и ошибки, и не останавливается при первой ошибке. + bool require_checksums = false; /// Требует, чтобы был columns.txt. 
+ bool require_column_files = false; /// Требует, чтобы для всех столбцов из columns.txt были файлы. + size_t index_granularity = 8192; + + Settings & setVerbose(bool verbose_) { verbose = verbose_; return *this; } + Settings & setRequireChecksums(bool require_checksums_) { require_checksums = require_checksums_; return *this; } + Settings & setRequireColumnFiles(bool require_column_files_) { require_column_files = require_column_files_; return *this; } + Settings & setIndexGranularity(bool index_granularity_) { index_granularity = index_granularity_; return *this; } + }; + /** Полностью проверяет данные кусочка: * - Вычисляет контрольные суммы и сравнивает с checksums.txt. * - Для массивов и строк проверяет соответствие размеров и количества данных. * - Проверяет правильность засечек. * Бросает исключение, если кусок испорчен или если проверить не получилось (TODO: можно попробовать разделить эти случаи). - * Если strict, требует, чтобы для всех столбцов из columns.txt были файлы. - * Если verbose, пишет в stderr прогресс и ошибки, и не останавливается при первой ошибке. */ - static void checkDataPart(String path, size_t index_granularity, bool strict, const DataTypeFactory & data_type_factory, - bool verbose = false); + static void checkDataPart(String path, const Settings & settings, const DataTypeFactory & data_type_factory, + MergeTreeData::DataPart::Checksums * out_checksums = nullptr); }; } diff --git a/dbms/include/DB/Storages/StorageReplicatedMergeTree.h b/dbms/include/DB/Storages/StorageReplicatedMergeTree.h index 38bc341e68c..9c2d040fa1d 100644 --- a/dbms/include/DB/Storages/StorageReplicatedMergeTree.h +++ b/dbms/include/DB/Storages/StorageReplicatedMergeTree.h @@ -137,6 +137,7 @@ private: GET_PART, /// Получить кусок с другой реплики. MERGE_PARTS, /// Слить куски. DROP_RANGE, /// Удалить куски в указанном месяце в указанном диапазоне номеров. + ATTACH_PART, /// Перенести кусок из директории detached или unreplicated. 
}; String znode_name; @@ -144,11 +145,19 @@ private: Type type; String source_replica; /// Пустая строка значит, что эта запись была добавлена сразу в очередь, а не скопирована из лога. - String new_part_name; /// Для DROP_RANGE имя несуществующего куска. Нужно удалить все куски, покрытые им. + /// Имя куска, получающегося в результате. + /// Для DROP_RANGE имя несуществующего куска. Нужно удалить все куски, покрытые им. + String new_part_name; Strings parts_to_merge; - bool detach = false; /// Для DROP_RANGE, true значит, что куски нужно не удалить, а перенести в директорию detached. + /// Для DROP_RANGE, true значит, что куски нужно не удалить, а перенести в директорию detached. + bool detach = false; + + /// Для ATTACH_PART имя куска в директории detached или unreplicated. + String source_part_name; + /// Нужно переносить из директории unreplicated, а не detached. + bool attach_unreplicated; FuturePartTaggerPtr future_part_tagger; bool currently_executing = false; /// Доступ под queue_mutex. @@ -156,13 +165,13 @@ private: void addResultToVirtualParts(StorageReplicatedMergeTree & storage) { - if (type == MERGE_PARTS || type == GET_PART || type == DROP_RANGE) + if (type == MERGE_PARTS || type == GET_PART || type == DROP_RANGE || type == ATTACH_PART) storage.virtual_parts.add(new_part_name); } void tagPartAsFuture(StorageReplicatedMergeTree & storage) { - if (type == MERGE_PARTS || type == GET_PART) + if (type == MERGE_PARTS || type == GET_PART || type == ATTACH_PART) future_part_tagger = new FuturePartTagger(new_part_name, storage); } @@ -362,7 +371,7 @@ private: * Кладет в ops действия, добавляющие данные о куске в ZooKeeper. * Вызывать под TableStructureLock. */ - void checkPartAndAddToZooKeeper(MergeTreeData::DataPartPtr part, zkutil::Ops & ops); + void checkPartAndAddToZooKeeper(MergeTreeData::DataPartPtr part, zkutil::Ops & ops, String name_override = ""); /// Убирает кусок из ZooKeeper и добавляет в очередь задание скачать его. 
Предполагается это делать с битыми кусками. void removePartAndEnqueueFetch(const String & part_name); @@ -396,7 +405,8 @@ private: */ bool executeLogEntry(const LogEntry & entry, BackgroundProcessingPool::Context & pool_context); - bool executeDropRange(const LogEntry & entry); + void executeDropRange(const LogEntry & entry); + bool executeAttachPart(const LogEntry & entry); /// Возвращает false, если куска нет, и его нужно забрать с другой реплики. /** Обновляет очередь. */ @@ -450,7 +460,7 @@ private: /** Дождаться, пока все реплики, включая эту, выполнят указанное действие из лога. * Если одновременно с этим добавляются реплики, может не дождаться добавленную реплику. */ - void waitForAllReplicasToProcessLogEntry(const String & log_znode_path, const LogEntry & entry); + void waitForAllReplicasToProcessLogEntry(const LogEntry & entry); }; } diff --git a/dbms/src/Storages/MergeTree/ActiveDataPartSet.cpp b/dbms/src/Storages/MergeTree/ActiveDataPartSet.cpp index 8f6bd3f8eef..b8dbc8f4165 100644 --- a/dbms/src/Storages/MergeTree/ActiveDataPartSet.cpp +++ b/dbms/src/Storages/MergeTree/ActiveDataPartSet.cpp @@ -82,6 +82,12 @@ Strings ActiveDataPartSet::getParts() const return res; } +size_t ActiveDataPartSet::size() const +{ + Poco::ScopedLock lock(mutex); + return parts.size(); +} + String ActiveDataPartSet::getPartName(DayNum_t left_date, DayNum_t right_date, UInt64 left_id, UInt64 right_id, UInt64 level) @@ -110,10 +116,14 @@ String ActiveDataPartSet::getPartName(DayNum_t left_date, DayNum_t right_date, U return res; } -bool ActiveDataPartSet::isPartDirectory(const String & dir_name, Poco::RegularExpression::MatchVec & matches) +bool ActiveDataPartSet::isPartDirectory(const String & dir_name, Poco::RegularExpression::MatchVec * out_matches) { + Poco::RegularExpression::MatchVec matches; static Poco::RegularExpression file_name_regexp("^(\\d{8})_(\\d{8})_(\\d+)_(\\d+)_(\\d+)"); - return (file_name_regexp.match(dir_name, 0, matches) && 6 == matches.size()); + bool res 
= (file_name_regexp.match(dir_name, 0, matches) && 6 == matches.size()); + if (out_matches) + *out_matches = matches; + return res; } void ActiveDataPartSet::parsePartName(const String & file_name, Part & part, const Poco::RegularExpression::MatchVec * matches_p) @@ -121,7 +131,7 @@ void ActiveDataPartSet::parsePartName(const String & file_name, Part & part, con Poco::RegularExpression::MatchVec match_vec; if (!matches_p) { - if (!isPartDirectory(file_name, match_vec)) + if (!isPartDirectory(file_name, &match_vec)) throw Exception("Unexpected part name: " + file_name, ErrorCodes::BAD_DATA_PART_NAME); matches_p = &match_vec; } diff --git a/dbms/src/Storages/MergeTree/MergeTreeData.cpp b/dbms/src/Storages/MergeTree/MergeTreeData.cpp index c7eced0b86c..55d0805e65a 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeData.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeData.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -124,7 +125,7 @@ void MergeTreeData::loadDataParts() Poco::RegularExpression::MatchVec matches; for (const String & file_name : part_file_names) { - if (!ActiveDataPartSet::isPartDirectory(file_name, matches)) + if (!ActiveDataPartSet::isPartDirectory(file_name, &matches)) continue; MutableDataPartPtr part = std::make_shared(*this); @@ -135,10 +136,10 @@ void MergeTreeData::loadDataParts() try { - part->loadColumns(); - part->loadChecksums(); + part->loadColumns(require_part_metadata); + part->loadChecksums(require_part_metadata); part->loadIndex(); - part->checkNotBroken(); + part->checkNotBroken(require_part_metadata); } catch (...) 
{ @@ -167,7 +168,7 @@ void MergeTreeData::loadDataParts() { if (contained_name == file_name) continue; - if (!ActiveDataPartSet::isPartDirectory(contained_name, matches)) + if (!ActiveDataPartSet::isPartDirectory(contained_name, &matches)) continue; DataPart contained_part(*this); ActiveDataPartSet::parsePartName(contained_name, contained_part, &matches); @@ -720,7 +721,17 @@ void MergeTreeData::replaceParts(const DataPartsVector & remove, const DataParts } } -void MergeTreeData::renameAndDetachPart(DataPartPtr part, const String & prefix, bool restore_covered) +void MergeTreeData::attachPart(DataPartPtr part) +{ + Poco::ScopedLock lock(data_parts_mutex); + Poco::ScopedLock lock_all(all_data_parts_mutex); + + if (!all_data_parts.insert(part).second) + throw Exception("Part " + part->name + " is already attached", ErrorCodes::DUPLICATE_DATA_PART); + data_parts.insert(part); +} + +void MergeTreeData::renameAndDetachPart(DataPartPtr part, const String & prefix, bool restore_covered, bool move_to_detached) { LOG_INFO(log, "Renaming " << part->name << " to " << prefix << part->name << " and detaching it."); @@ -731,7 +742,8 @@ void MergeTreeData::renameAndDetachPart(DataPartPtr part, const String & prefix, throw Exception("No such data part", ErrorCodes::NO_SUCH_DATA_PART); data_parts.erase(part); - part->renameAddPrefix("detached/" + prefix); + if (move_to_detached || !prefix.empty()) + part->renameAddPrefix((move_to_detached ? 
"detached/" : "") + prefix); if (restore_covered) { @@ -783,6 +795,11 @@ void MergeTreeData::renameAndDetachPart(DataPartPtr part, const String & prefix, } } +void MergeTreeData::detachPartInPlace(DataPartPtr part) +{ + renameAndDetachPart(part, "", false, false); +} + MergeTreeData::DataParts MergeTreeData::getDataParts() { Poco::ScopedLock lock(data_parts_mutex); @@ -879,6 +896,41 @@ MergeTreeData::DataPartPtr MergeTreeData::getPartIfExists(const String & part_na return nullptr; } +MergeTreeData::MutableDataPartPtr MergeTreeData::loadPartAndFixMetadata(const String & relative_path) +{ + MutableDataPartPtr part = std::make_shared(*this); + part->name = relative_path; + + /// Раньше список столбцов записывался неправильно. Удалим его и создадим заново. + if (Poco::File(full_path + relative_path + "/columns.txt").exists()) + Poco::File(full_path + relative_path + "/columns.txt").remove(); + + part->loadColumns(false); + part->loadChecksums(false); + part->loadIndex(); + part->checkNotBroken(false); + + part->modification_time = Poco::File(full_path + relative_path).getLastModified().epochTime(); + + /// Если нет файла с чексуммами, посчитаем чексуммы и запишем. Заодно проверим данные. 
+ if (part->checksums.empty()) + { + MergeTreePartChecker::Settings settings; + settings.setIndexGranularity(index_granularity); + settings.setRequireColumnFiles(true); + MergeTreePartChecker::checkDataPart(full_path + relative_path, settings, context.getDataTypeFactory(), &part->checksums); + + { + WriteBufferFromFile out(full_path + relative_path + "/checksums.txt.tmp", 4096); + part->checksums.writeText(out); + } + + Poco::File(full_path + relative_path + "/checksums.txt.tmp").renameTo(full_path + relative_path + "/checksums.txt"); + } + + return part; +} + void MergeTreeData::DataPart::Checksums::Checksum::checkEqual(const Checksum & rhs, bool have_uncompressed, const String & name) const { diff --git a/dbms/src/Storages/MergeTree/MergeTreePartChecker.cpp b/dbms/src/Storages/MergeTree/MergeTreePartChecker.cpp index 8dc761b26a4..6382f2624bf 100644 --- a/dbms/src/Storages/MergeTree/MergeTreePartChecker.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreePartChecker.cpp @@ -97,7 +97,7 @@ struct Stream return size / sizeof(UInt64); } - void assertMark(bool strict) + void assertMark() { MarkInCompressedFile mrk_mark; readIntBinary(mrk_mark.offset_in_compressed_file, mrk_hashing_buf); @@ -152,7 +152,7 @@ struct Stream }; /// Возвращает количество строк. Добавляет в checksums чексуммы всех файлов столбца. 
-static size_t checkColumn(const String & path, const String & name, DataTypePtr type, size_t index_granularity, bool strict, +static size_t checkColumn(const String & path, const String & name, DataTypePtr type, const MergeTreePartChecker::Settings & settings, MergeTreeData::DataPart::Checksums & checksums) { size_t rows = 0; @@ -171,10 +171,10 @@ static size_t checkColumn(const String & path, const String & name, DataTypePtr if (sizes_stream.marksEOF()) break; - sizes_stream.assertMark(strict); - data_stream.assertMark(strict); + sizes_stream.assertMark(); + data_stream.assertMark(); - size_t cur_rows = sizes_stream.readUInt64(index_granularity, sizes); + size_t cur_rows = sizes_stream.readUInt64(settings.index_granularity, sizes); size_t sum = 0; for (size_t i = 0; i < cur_rows; ++i) @@ -188,7 +188,7 @@ static size_t checkColumn(const String & path, const String & name, DataTypePtr data_stream.read(sum); rows += cur_rows; - if (cur_rows < index_granularity) + if (cur_rows < settings.index_granularity) break; } @@ -207,12 +207,12 @@ static size_t checkColumn(const String & path, const String & name, DataTypePtr if (data_stream.marksEOF()) break; - data_stream.assertMark(strict); + data_stream.assertMark(); - size_t cur_rows = data_stream.read(index_granularity); + size_t cur_rows = data_stream.read(settings.index_granularity); rows += cur_rows; - if (cur_rows < index_granularity) + if (cur_rows < settings.index_granularity) break; } @@ -228,8 +228,8 @@ static size_t checkColumn(const String & path, const String & name, DataTypePtr } } -void MergeTreePartChecker::checkDataPart(String path, size_t index_granularity, bool strict, const DataTypeFactory & data_type_factory, - bool verbose) +void MergeTreePartChecker::checkDataPart(String path, const Settings & settings, const DataTypeFactory & data_type_factory, + MergeTreeData::DataPart::Checksums * out_checksums) { if (!path.empty() && *path.rbegin() != '/') path += "/"; @@ -243,7 +243,7 @@ void 
MergeTreePartChecker::checkDataPart(String path, size_t index_granularity, assertEOF(buf); } - if (strict || Poco::File(path + "checksums.txt").exists()) + if (settings.require_checksums || Poco::File(path + "checksums.txt").exists()) { ReadBufferFromFile buf(path + "checksums.txt"); checksums_txt.readText(buf); @@ -266,7 +266,7 @@ void MergeTreePartChecker::checkDataPart(String path, size_t index_granularity, for (const NameAndTypePair & column : columns) { - if (verbose) + if (settings.verbose) { std::cerr << column.name << ":"; std::cerr.flush(); @@ -275,14 +275,14 @@ void MergeTreePartChecker::checkDataPart(String path, size_t index_granularity, bool ok = false; try { - if (!strict && !Poco::File(path + escapeForFileName(column.name) + ".bin").exists()) + if (!settings.require_column_files && !Poco::File(path + escapeForFileName(column.name) + ".bin").exists()) { - if (verbose) + if (settings.verbose) std::cerr << " no files" << std::endl; continue; } - size_t cur_rows = checkColumn(path, column.name, column.type, index_granularity, strict, checksums_data); + size_t cur_rows = checkColumn(path, column.name, column.type, settings, checksums_data); if (first) { rows = cur_rows; @@ -298,7 +298,7 @@ void MergeTreePartChecker::checkDataPart(String path, size_t index_granularity, } catch (...) 
{ - if (!verbose) + if (!settings.verbose) throw; ExceptionPtr e = cloneCurrentException(); if (!first_exception) @@ -311,18 +311,18 @@ void MergeTreePartChecker::checkDataPart(String path, size_t index_granularity, std::cerr << std::endl; } - if (verbose && ok) + if (settings.verbose && ok) std::cerr << " ok" << std::endl; } if (first) throw Exception("No columns", ErrorCodes::EMPTY_LIST_OF_COLUMNS_PASSED); - if (primary_idx_size % ((rows - 1) / index_granularity + 1)) + if (primary_idx_size % ((rows - 1) / settings.index_granularity + 1)) throw Exception("primary.idx size (" + toString(primary_idx_size) + ") not divisible by number of marks (" - + toString(rows) + "/" + toString(index_granularity) + " rounded up)", ErrorCodes::CORRUPTED_DATA); + + toString(rows) + "/" + toString(settings.index_granularity) + " rounded up)", ErrorCodes::CORRUPTED_DATA); - if (strict || !checksums_txt.files.empty()) + if (settings.require_checksums || !checksums_txt.files.empty()) checksums_txt.checkEqual(checksums_data, true); if (first_exception) diff --git a/dbms/src/Storages/MergeTree/ReplicatedMergeTreePartsExchange.cpp b/dbms/src/Storages/MergeTree/ReplicatedMergeTreePartsExchange.cpp index da860c75357..e97ebc7381c 100644 --- a/dbms/src/Storages/MergeTree/ReplicatedMergeTreePartsExchange.cpp +++ b/dbms/src/Storages/MergeTree/ReplicatedMergeTreePartsExchange.cpp @@ -110,8 +110,8 @@ MergeTreeData::MutableDataPartPtr ReplicatedMergeTreePartsFetcher::fetchPart( ActiveDataPartSet::parsePartName(part_name, *new_data_part); new_data_part->modification_time = time(0); - new_data_part->loadColumns(); - new_data_part->loadChecksums(); + new_data_part->loadColumns(true); + new_data_part->loadChecksums(true); new_data_part->loadIndex(); new_data_part->checksums.checkEqual(checksums, false); diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index a6f359b7cc1..c79a52ec11a 100644 --- 
a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -17,6 +17,8 @@ const auto ERROR_SLEEP_MS = 1000; const auto MERGE_SELECTING_SLEEP_MS = 5 * 1000; const auto CLEANUP_SLEEP_MS = 30 * 1000; +const auto RESERVED_BLOCK_NUMBERS = 200; + /// Преобразовать число в строку формате суффиксов автоинкрементных нод в ZooKeeper. static String padIndex(UInt64 index) { @@ -401,8 +403,8 @@ void StorageReplicatedMergeTree::checkParts(bool skip_sanity_checks) bool insane = parts_to_add.size() > 2 || unexpected_parts.size() > 2 || - expected_parts.size() > 20 || - parts_to_fetch.size() > 2; + expected_parts.size() > 5 || + parts_to_fetch.size() > 30; if (insane && !skip_sanity_checks) { @@ -475,8 +477,11 @@ void StorageReplicatedMergeTree::initVirtualParts() } } -void StorageReplicatedMergeTree::checkPartAndAddToZooKeeper(MergeTreeData::DataPartPtr part, zkutil::Ops & ops) +void StorageReplicatedMergeTree::checkPartAndAddToZooKeeper(MergeTreeData::DataPartPtr part, zkutil::Ops & ops, String part_name) { + if (part_name.empty()) + part_name = part->name; + check(part->columns); int expected_columns_version = columns_version; @@ -488,22 +493,22 @@ void StorageReplicatedMergeTree::checkPartAndAddToZooKeeper(MergeTreeData::DataP { zkutil::Stat stat_before, stat_after; String columns_str; - if (!zookeeper->tryGet(zookeeper_path + "/replicas/" + replica + "/parts/" + part->name + "/columns", columns_str, &stat_before)) + if (!zookeeper->tryGet(zookeeper_path + "/replicas/" + replica + "/parts/" + part_name + "/columns", columns_str, &stat_before)) continue; if (columns_str != expected_columns_str) { - LOG_INFO(log, "Not checking checksums of part " << part->name << " with replica " << replica + LOG_INFO(log, "Not checking checksums of part " << part_name << " with replica " << replica << " because columns are different"); continue; } String checksums_str; /// Проверим, что версия ноды со столбцами не изменилась, пока мы читали 
checksums. /// Это гарантирует, что столбцы и чексуммы относятся к одним и тем же данным. - if (!zookeeper->tryGet(zookeeper_path + "/replicas/" + replica + "/parts/" + part->name + "/checksums", checksums_str) || - !zookeeper->exists(zookeeper_path + "/replicas/" + replica + "/parts/" + part->name + "/columns", &stat_after) || + if (!zookeeper->tryGet(zookeeper_path + "/replicas/" + replica + "/parts/" + part_name + "/checksums", checksums_str) || + !zookeeper->exists(zookeeper_path + "/replicas/" + replica + "/parts/" + part_name + "/columns", &stat_after) || stat_before.version != stat_after.version) { - LOG_INFO(log, "Not checking checksums of part " << part->name << " with replica " << replica + LOG_INFO(log, "Not checking checksums of part " << part_name << " with replica " << replica << " because part changed while we were reading its checksums"); continue; } @@ -512,9 +517,9 @@ void StorageReplicatedMergeTree::checkPartAndAddToZooKeeper(MergeTreeData::DataP checksums.checkEqual(part->checksums, true); } - if (zookeeper->exists(replica_path + "/parts/" + part->name)) + if (zookeeper->exists(replica_path + "/parts/" + part_name)) { - LOG_ERROR(log, "checkPartAndAddToZooKeeper: node " << replica_path + "/parts/" + part->name << " already exists"); + LOG_ERROR(log, "checkPartAndAddToZooKeeper: node " << replica_path + "/parts/" + part_name << " already exists"); return; } @@ -522,17 +527,17 @@ void StorageReplicatedMergeTree::checkPartAndAddToZooKeeper(MergeTreeData::DataP zookeeper_path + "/columns", expected_columns_version)); ops.push_back(new zkutil::Op::Create( - replica_path + "/parts/" + part->name, + replica_path + "/parts/" + part_name, "", zookeeper->getDefaultACL(), zkutil::CreateMode::Persistent)); ops.push_back(new zkutil::Op::Create( - replica_path + "/parts/" + part->name + "/columns", + replica_path + "/parts/" + part_name + "/columns", part->columns.toString(), zookeeper->getDefaultACL(), zkutil::CreateMode::Persistent)); ops.push_back(new 
zkutil::Op::Create( - replica_path + "/parts/" + part->name + "/checksums", + replica_path + "/parts/" + part_name + "/checksums", part->checksums.toString(), zookeeper->getDefaultACL(), zkutil::CreateMode::Persistent)); @@ -749,7 +754,8 @@ void StorageReplicatedMergeTree::pullLogsToQueue(zkutil::EventPtr next_update_ev bool StorageReplicatedMergeTree::shouldExecuteLogEntry(const LogEntry & entry) { - if ((entry.type == LogEntry::MERGE_PARTS || entry.type == LogEntry::GET_PART) && future_parts.count(entry.new_part_name)) + if ((entry.type == LogEntry::MERGE_PARTS || entry.type == LogEntry::GET_PART || entry.type == LogEntry::ATTACH_PART) + && future_parts.count(entry.new_part_name)) { LOG_DEBUG(log, "Not executing log entry for part " << entry.new_part_name << " because another log entry for the same part is being processed. This shouldn't happen often."); @@ -779,10 +785,14 @@ bool StorageReplicatedMergeTree::shouldExecuteLogEntry(const LogEntry & entry) bool StorageReplicatedMergeTree::executeLogEntry(const LogEntry & entry, BackgroundProcessingPool::Context & pool_context) { if (entry.type == LogEntry::DROP_RANGE) - return executeDropRange(entry); + { + executeDropRange(entry); + return true; + } if (entry.type == LogEntry::GET_PART || - entry.type == LogEntry::MERGE_PARTS) + entry.type == LogEntry::MERGE_PARTS || + entry.type == LogEntry::ATTACH_PART) { /// Если у нас уже есть этот кусок или покрывающий его кусок, ничего делать не нужно. 
MergeTreeData::DataPartPtr containing_part = data.getActiveContainingPart(entry.new_part_name); @@ -805,6 +815,10 @@ bool StorageReplicatedMergeTree::executeLogEntry(const LogEntry & entry, Backgro { do_fetch = true; } + else if (entry.type == LogEntry::ATTACH_PART) + { + do_fetch = !executeAttachPart(entry); + } else if (entry.type == LogEntry::MERGE_PARTS) { MergeTreeData::DataPartsVector parts; @@ -959,7 +973,7 @@ bool StorageReplicatedMergeTree::executeLogEntry(const LogEntry & entry, Backgro return true; } -bool StorageReplicatedMergeTree::executeDropRange(const StorageReplicatedMergeTree::LogEntry & entry) +void StorageReplicatedMergeTree::executeDropRange(const StorageReplicatedMergeTree::LogEntry & entry) { LOG_INFO(log, (entry.detach ? "Detaching" : "Removing") << " parts inside " << entry.new_part_name << "."); @@ -1019,7 +1033,7 @@ bool StorageReplicatedMergeTree::executeDropRange(const StorageReplicatedMergeTr /// Если кусок нужно удалить, надежнее удалить директорию после изменений в ZooKeeper. if (!entry.detach) - data.replaceParts({part}, {}, false); + data.replaceParts({part}, {}, true); } LOG_INFO(log, (entry.detach ? "Detached " : "Removed ") << removed_parts << " parts inside " << entry.new_part_name << "."); @@ -1043,6 +1057,45 @@ bool StorageReplicatedMergeTree::executeDropRange(const StorageReplicatedMergeTr unreplicated_data->replaceParts({part}, {}, false); } } +} + +bool StorageReplicatedMergeTree::executeAttachPart(const StorageReplicatedMergeTree::LogEntry & entry) +{ + String source_path = (entry.attach_unreplicated ? "unreplicated/" : "detached/") + entry.source_part_name; + + LOG_INFO(log, "Attaching part " << entry.source_part_name << " from " << source_path << " as " << entry.new_part_name); + + if (!Poco::File(data.getFullPath() + source_path).isDirectory()) + { + LOG_INFO(log, "No such directory. 
Will fetch " << entry.new_part_name << " instead"); + return false; + } + + LOG_DEBUG(log, "Checking data"); + MergeTreeData::MutableDataPartPtr part = data.loadPartAndFixMetadata(source_path); + + zkutil::Ops ops; + checkPartAndAddToZooKeeper(part, ops, entry.new_part_name); + + if (entry.attach_unreplicated && unreplicated_data) + { + MergeTreeData::DataPartPtr unreplicated_part = unreplicated_data->getPartIfExists(entry.source_part_name); + if (unreplicated_part) + unreplicated_data->detachPartInPlace(unreplicated_part); + else + LOG_WARNING(log, "Unreplicated part " << entry.source_part_name << " is already detached"); + } + + zookeeper->multi(ops); + + /// NOTE: Не можем использовать renameTempPartAndAdd, потому что кусок не временный - если что-то пойдет не так, его не нужно удалять. + part->renameTo(entry.new_part_name); + part->name = entry.new_part_name; + ActiveDataPartSet::parsePartName(part->name, *part); + + data.attachPart(part); + + LOG_INFO(log, "Finished attaching part " << entry.new_part_name); /// На месте удаленных кусков могут появиться новые, с другими данными. 
context.resetCaches(); @@ -1625,8 +1678,12 @@ void StorageReplicatedMergeTree::partCheckThread() if (part->columns != zk_columns) throw Exception("Columns of local part " + part_name + " are different from ZooKeeper"); + MergeTreePartChecker::Settings settings; + settings.setIndexGranularity(data.index_granularity); + settings.setRequireChecksums(true); + settings.setRequireColumnFiles(true); MergeTreePartChecker::checkDataPart( - data.getFullPath() + part_name, data.index_granularity, true, context.getDataTypeFactory()); + data.getFullPath() + part_name, settings, context.getDataTypeFactory()); LOG_INFO(log, "Part " << part_name << " looks good."); } @@ -2135,12 +2192,7 @@ static String getFakePartNameForDrop(const String & month_name, UInt64 left, UIn void StorageReplicatedMergeTree::dropPartition(const Field & field, bool detach) { - String month_name; - - if (field.getType() == Field::Types::UInt64) - month_name = toString(field.get()); - else - month_name = field.safeGet(); + String month_name = field.getType() == Field::Types::UInt64 ? toString(field.get()) : field.safeGet(); if (!isValidMonthName(month_name)) throw Exception("Invalid partition format: " + month_name + ". Partition should consist of 6 digits: YYYYMM", @@ -2191,12 +2243,101 @@ void StorageReplicatedMergeTree::dropPartition(const Field & field, bool detach) entry.znode_name = log_znode_path.substr(log_znode_path.find_last_of('/') + 1); /// Дождемся, пока все реплики выполнят дроп. - waitForAllReplicasToProcessLogEntry(log_znode_path, entry); + waitForAllReplicasToProcessLogEntry(entry); } -void StorageReplicatedMergeTree::attachPartition(const Field& partition, bool unreplicated, bool part) +void StorageReplicatedMergeTree::attachPartition(const Field & field, bool unreplicated, bool attach_part) { - throw Exception("Not implemented", ErrorCodes::NOT_IMPLEMENTED); + String partition = field.getType() == Field::Types::UInt64 ? 
toString(field.get()) : field.safeGet(); + + if (!attach_part && !isValidMonthName(partition)) + throw Exception("Invalid partition format: " + partition + ". Partition should consist of 6 digits: YYYYMM", + ErrorCodes::INVALID_PARTITION_NAME); + + String source_dir = (unreplicated ? "unreplicated/" : "detached/"); + + /// Составим список кусков, которые нужно добавить. + Strings parts; + if (attach_part) + { + parts.push_back(partition); + } + else + { + LOG_DEBUG(log, "Looking for parts for partition " << partition << " in " << source_dir); + ActiveDataPartSet active_parts; + for (Poco::DirectoryIterator it = Poco::DirectoryIterator(full_path + source_dir); it != Poco::DirectoryIterator(); ++it) + { + String name = it.name(); + if (!ActiveDataPartSet::isPartDirectory(name)) + continue; + if (name.substr(0, partition.size()) != partition) + continue; + LOG_DEBUG(log, "Found part " << name); + active_parts.add(name); + } + LOG_DEBUG(log, active_parts.size() << " of them are active"); + parts = active_parts.getParts(); + } + + /// Синхронно проверим, что добавляемые куски существуют и не испорчены хотя бы на этой реплике. Запишем checksums.txt, если его нет. + LOG_DEBUG(log, "Checking parts"); + for (const String & part : parts) + { + LOG_DEBUG(log, "Checking part " << part); + data.loadPartAndFixMetadata(source_dir + part); + } + + /// Выделим добавляемым кускам максимальные свободные номера, меньшие RESERVED_BLOCK_NUMBERS. + /// NOTE: Проверка свободности номеров никак не синхронизируется. Выполнять несколько запросов ATTACH/DETACH/DROP одновременно нельзя. 
+ UInt64 min_used_number = RESERVED_BLOCK_NUMBERS; + + { + auto existing_parts = data.getDataParts(); + for (const auto & part : existing_parts) + { + min_used_number = std::min(min_used_number, part->left); + } + } + + if (parts.size() > min_used_number) + throw Exception("Not enough free small block numbers for attaching parts: " + + toString(parts.size()) + " needed, " + toString(min_used_number) + " available", ErrorCodes::NOT_ENOUGH_BLOCK_NUMBERS); + + /// Добавим записи в лог. + std::reverse(parts.begin(), parts.end()); + std::list entries; + zkutil::Ops ops; + for (const String & part_name : parts) + { + ActiveDataPartSet::Part part; + ActiveDataPartSet::parsePartName(part_name, part); + part.left = part.right = --min_used_number; + String new_part_name = ActiveDataPartSet::getPartName(part.left_date, part.right_date, part.left, part.right, part.level); + + LOG_INFO(log, "Will attach " << part_name << " as " << new_part_name); + + entries.emplace_back(); + LogEntry & entry = entries.back(); + entry.type = LogEntry::ATTACH_PART; + entry.source_replica = replica_name; + entry.source_part_name = part_name; + entry.new_part_name = new_part_name; + entry.attach_unreplicated = unreplicated; + ops.push_back(new zkutil::Op::Create( + zookeeper_path + "/log/log-", entry.toString(), zookeeper->getDefaultACL(), zkutil::CreateMode::PersistentSequential)); + } + + LOG_DEBUG(log, "Adding attaches to log"); + zookeeper->multi(ops); + size_t i = 0; + for (LogEntry & entry : entries) + { + String log_znode_path = dynamic_cast(ops[i++]).getPathCreated(); + entry.znode_name = log_znode_path.substr(log_znode_path.find_last_of('/') + 1); + + waitForAllReplicasToProcessLogEntry(entry); + } } void StorageReplicatedMergeTree::drop() @@ -2246,7 +2387,7 @@ AbandonableLockInZooKeeper StorageReplicatedMergeTree::allocateBlockNumber(const zkutil::Ops ops; auto acl = zookeeper->getDefaultACL(); ops.push_back(new zkutil::Op::Create(month_path, "", acl, zkutil::CreateMode::Persistent)); - 
for (size_t i = 0; i < 200; ++i) + for (size_t i = 0; i < RESERVED_BLOCK_NUMBERS; ++i) { ops.push_back(new zkutil::Op::Create(month_path + "/skip_increment", "", acl, zkutil::CreateMode::Persistent)); ops.push_back(new zkutil::Op::Remove(month_path + "/skip_increment", -1)); @@ -2260,11 +2401,11 @@ AbandonableLockInZooKeeper StorageReplicatedMergeTree::allocateBlockNumber(const zookeeper_path + "/temp", *zookeeper); } -void StorageReplicatedMergeTree::waitForAllReplicasToProcessLogEntry(const String & log_znode_path, const LogEntry & entry) +void StorageReplicatedMergeTree::waitForAllReplicasToProcessLogEntry(const LogEntry & entry) { LOG_DEBUG(log, "Waiting for all replicas to process " << entry.znode_name); - UInt64 log_index = parse(log_znode_path.substr(log_znode_path.size() - 10)); + UInt64 log_index = parse(entry.znode_name.substr(entry.znode_name.size() - 10)); String log_entry_str = entry.toString(); Strings replicas = zookeeper->getChildren(zookeeper_path + "/replicas"); @@ -2354,6 +2495,16 @@ void StorageReplicatedMergeTree::LogEntry::writeText(WriteBuffer & out) const writeString("drop\n", out); writeString(new_part_name, out); break; + case ATTACH_PART: + writeString("attach\n", out); + if (attach_unreplicated) + writeString("unreplicated\n", out); + else + writeString("detached\n", out); + writeString(source_part_name, out); + writeString("\ninto\n", out); + writeString(new_part_name, out); + break; } writeString("\n", out); } @@ -2394,6 +2545,22 @@ void StorageReplicatedMergeTree::LogEntry::readText(ReadBuffer & in) detach = type_str == "detach"; readString(new_part_name, in); } + else if (type_str == "attach") + { + type = ATTACH_PART; + String source_type; + readString(source_type, in); + if (source_type == "unreplicated") + attach_unreplicated = true; + else if (source_type == "detached") + attach_unreplicated = false; + else + throw Exception("Bad format: expected 'unreplicated' or 'detached', found '" + source_type + "'", 
ErrorCodes::CANNOT_PARSE_TEXT); + assertString("\n", in); + readString(source_part_name, in); + assertString("\ninto\n", in); + readString(new_part_name, in); + } assertString("\n", in); } diff --git a/dbms/src/Storages/tests/part_checker.cpp b/dbms/src/Storages/tests/part_checker.cpp index be19a4c494f..d5376b23c37 100644 --- a/dbms/src/Storages/tests/part_checker.cpp +++ b/dbms/src/Storages/tests/part_checker.cpp @@ -15,8 +15,14 @@ int main(int argc, char ** argv) try { - DB::MergeTreePartChecker::checkDataPart(argv[1], argc == 4 ? DB::parse(argv[3]) : 8192ul, argv[2][0] == '1', - DB::DataTypeFactory(), true); + DB::MergeTreePartChecker::Settings settings; + if (argc == 4) + settings.setIndexGranularity(DB::parse(argv[3])); + settings.setRequireChecksums(argv[2][0] == '1'); + settings.setRequireColumnFiles(argv[2][0] == '1'); + settings.setVerbose(true); + + DB::MergeTreePartChecker::checkDataPart(argv[1], settings, DB::DataTypeFactory()); } catch (...) { From 08340ff6a0072ba8f92a6bf17ed899db607aa2fd Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Fri, 8 Aug 2014 14:28:46 +0400 Subject: [PATCH 049/127] Merge --- dbms/src/Storages/StorageReplicatedMergeTree.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index c79a52ec11a..0b2a83cbb74 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -1065,9 +1065,9 @@ bool StorageReplicatedMergeTree::executeAttachPart(const StorageReplicatedMergeT LOG_INFO(log, "Attaching part " << entry.source_part_name << " from " << source_path << " as " << entry.new_part_name); - if (!Poco::File(data.getFullPath() + source_path).isDirectory()) + if (!Poco::File(data.getFullPath() + source_path).exists()) { - LOG_INFO(log, "No such directory. Will fetch " << entry.new_part_name << " instead"); + LOG_INFO(log, "No part at " << source_path << ". 
Will fetch it instead"); return false; } From dbe4dc64f38d107769509d914db99ea474c929c1 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Fri, 8 Aug 2014 14:41:53 +0400 Subject: [PATCH 050/127] Merge --- .../MergeTree/ReplicatedMergeTreeBlockOutputStream.h | 11 ++++++----- dbms/src/Storages/StorageReplicatedMergeTree.cpp | 10 +++++++--- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/dbms/include/DB/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h b/dbms/include/DB/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h index a6ac85af4aa..4b16aa00d56 100644 --- a/dbms/include/DB/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h +++ b/dbms/include/DB/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h @@ -33,6 +33,7 @@ public: UInt64 part_number = block_number_lock.getNumber(); MergeTreeData::MutableDataPartPtr part = storage.writer.writeTempPart(current_block, part_number); + String part_name = ActiveDataPartSet::getPartName(part->left_date, part->right_date, part->left, part->right, part->level); /// Если в запросе не указан ID, возьмем в качестве ID хеш от данных. То есть, не вставляем одинаковые данные дважды. /// NOTE: Если такая дедупликация не нужна, можно вместо этого оставлять block_id пустым. @@ -42,13 +43,10 @@ public: LOG_DEBUG(log, "Wrote block " << part_number << " with ID " << block_id << ", " << current_block.block.rows() << " rows"); - MergeTreeData::Transaction transaction; /// Если не получится добавить кусок в ZK, снова уберем его из рабочего набора. - storage.data.renameTempPartAndAdd(part, nullptr, &transaction); - StorageReplicatedMergeTree::LogEntry log_entry; log_entry.type = StorageReplicatedMergeTree::LogEntry::GET_PART; log_entry.source_replica = storage.replica_name; - log_entry.new_part_name = part->name; + log_entry.new_part_name = part_name; /// Одновременно добавим информацию о куске во все нужные места в ZooKeeper и снимем block_number_lock. 
zkutil::Ops ops; @@ -75,7 +73,7 @@ public: storage.zookeeper->getDefaultACL(), zkutil::CreateMode::Persistent)); } - storage.checkPartAndAddToZooKeeper(part, ops); + storage.checkPartAndAddToZooKeeper(part, ops, part_name); ops.push_back(new zkutil::Op::Create( storage.zookeeper_path + "/log/log-", log_entry.toString(), @@ -83,6 +81,9 @@ public: zkutil::CreateMode::PersistentSequential)); block_number_lock.getUnlockOps(ops); + MergeTreeData::Transaction transaction; /// Если не получится добавить кусок в ZK, снова уберем его из рабочего набора. + storage.data.renameTempPartAndAdd(part, nullptr, &transaction); + try { auto code = storage.zookeeper->tryMulti(ops); diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index 0b2a83cbb74..f6932551256 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -868,6 +868,10 @@ bool StorageReplicatedMergeTree::executeLogEntry(const LogEntry & entry, Backgro zkutil::Ops ops; checkPartAndAddToZooKeeper(part, ops); + /** TODO: Переименование нового куска лучше делать здесь, а не пятью строчками выше, + * чтобы оно было как можно ближе к zookeeper->multi. + */ + zookeeper->multi(ops); /** При ZCONNECTIONLOSS или ZOPERATIONTIMEOUT можем зря откатить локальные изменения кусков. 
@@ -1807,12 +1811,12 @@ void StorageReplicatedMergeTree::fetchPart(const String & part_name, const Strin MergeTreeData::MutableDataPartPtr part = fetcher.fetchPart(part_name, zookeeper_path + "/replicas/" + replica_name, host, port); + zkutil::Ops ops; + checkPartAndAddToZooKeeper(part, ops, part_name); + MergeTreeData::Transaction transaction; auto removed_parts = data.renameTempPartAndReplace(part, nullptr, &transaction); - zkutil::Ops ops; - checkPartAndAddToZooKeeper(part, ops); - zookeeper->multi(ops); transaction.commit(); merge_selecting_event.set(); From 19ca334f9ce327de5ba30e2e230713df09bc185c Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Fri, 8 Aug 2014 14:42:27 +0400 Subject: [PATCH 051/127] Merge --- dbms/src/Storages/StorageReplicatedMergeTree.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index f6932551256..7b172e4817f 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -402,7 +402,7 @@ void StorageReplicatedMergeTree::checkParts(bool skip_sanity_checks) + toString(parts_to_fetch.size()) + " missing parts"; bool insane = parts_to_add.size() > 2 || - unexpected_parts.size() > 2 || + unexpected_parts.size() > 3 || expected_parts.size() > 5 || parts_to_fetch.size() > 30; From 3c75c3f774e1a6fa0e2663a2249e56cf52baa346 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Fri, 8 Aug 2014 14:46:38 +0400 Subject: [PATCH 052/127] Merge --- dbms/src/Storages/StorageReplicatedMergeTree.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index 7b172e4817f..488e7a84dc7 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -833,8 +833,8 @@ bool 
StorageReplicatedMergeTree::executeLogEntry(const LogEntry & entry, Backgro } if (part->name != name) { - LOG_ERROR(log, "Log and parts set look inconsistent: " << name << " is covered by " << part->name - << " but should be merged into " << entry.new_part_name); + LOG_WARNING(log, "Part " << name << " is covered by " << part->name + << " but should be merged into " << entry.new_part_name << ". This shouldn't happen often."); have_all_parts = false; break; } From 6189e536676dc2c918523896869b6c44ab4f5fc8 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Fri, 8 Aug 2014 14:57:33 +0400 Subject: [PATCH 053/127] Merge --- dbms/include/DB/Storages/MergeTree/MergeTreeData.h | 6 ++++++ dbms/src/Storages/StorageReplicatedMergeTree.cpp | 8 ++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/dbms/include/DB/Storages/MergeTree/MergeTreeData.h b/dbms/include/DB/Storages/MergeTree/MergeTreeData.h index 208e51f3939..b4eba1a978b 100644 --- a/dbms/include/DB/Storages/MergeTree/MergeTreeData.h +++ b/dbms/include/DB/Storages/MergeTree/MergeTreeData.h @@ -115,6 +115,12 @@ struct MergeTreeSettings /// Хранить примерно столько последних записей в логе в ZooKeeper, даже если они никому уже не нужны. /// Не влияет на работу таблиц; используется только чтобы успеть посмотреть на лог в ZooKeeper глазами прежде, чем его очистят. size_t replicated_logs_to_keep = 100; + + /// Максимальное количество ошибок при загрузке кусков, при котором ReplicatedMergeTree соглашается запускаться. 
+ size_t replicated_max_unexpected_parts = 3; + size_t replicated_max_unexpectedly_merged_parts = 2; + size_t replicated_max_missing_obsolete_parts = 5; + size_t replicated_max_missing_active_parts = 20; }; class MergeTreeData : public ITableDeclaration diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index 488e7a84dc7..c8f4f8b68f7 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -401,10 +401,10 @@ void StorageReplicatedMergeTree::checkParts(bool skip_sanity_checks) + toString(expected_parts.size()) + " missing obsolete parts, " + toString(parts_to_fetch.size()) + " missing parts"; bool insane = - parts_to_add.size() > 2 || - unexpected_parts.size() > 3 || - expected_parts.size() > 5 || - parts_to_fetch.size() > 30; + parts_to_add.size() > data.settings.replicated_max_unexpectedly_merged_parts || + unexpected_parts.size() > data.settings.replicated_max_unexpected_parts || + expected_parts.size() > data.settings.replicated_max_missing_obsolete_parts || + parts_to_fetch.size() > data.settings.replicated_max_missing_active_parts; if (insane && !skip_sanity_checks) { From 6c194f5a00994502d94660720cfed3f13b8ec818 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Fri, 8 Aug 2014 15:11:19 +0400 Subject: [PATCH 054/127] fixed PartChecker. 
[#METR-12155] --- dbms/include/DB/Storages/MergeTree/MergeTreePartChecker.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/include/DB/Storages/MergeTree/MergeTreePartChecker.h b/dbms/include/DB/Storages/MergeTree/MergeTreePartChecker.h index 417ddd6d7f8..bf653f2a7bb 100644 --- a/dbms/include/DB/Storages/MergeTree/MergeTreePartChecker.h +++ b/dbms/include/DB/Storages/MergeTree/MergeTreePartChecker.h @@ -19,7 +19,7 @@ public: Settings & setVerbose(bool verbose_) { verbose = verbose_; return *this; } Settings & setRequireChecksums(bool require_checksums_) { require_checksums = require_checksums_; return *this; } Settings & setRequireColumnFiles(bool require_column_files_) { require_column_files = require_column_files_; return *this; } - Settings & setIndexGranularity(bool index_granularity_) { index_granularity = index_granularity_; return *this; } + Settings & setIndexGranularity(size_t index_granularity_) { index_granularity = index_granularity_; return *this; } }; /** Полностью проверяет данные кусочка: From eaf4a79f1191c2ec2302541d8d577eaaeefdaead Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Fri, 8 Aug 2014 16:53:55 +0400 Subject: [PATCH 055/127] Merge --- .../DB/Storages/StorageReplicatedMergeTree.h | 2 +- .../src/Storages/StorageReplicatedMergeTree.cpp | 17 +++++++++++++---- libs/libzkutil/src/ZooKeeper.cpp | 6 ++++++ 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/dbms/include/DB/Storages/StorageReplicatedMergeTree.h b/dbms/include/DB/Storages/StorageReplicatedMergeTree.h index 9c2d040fa1d..09d4d3d1d74 100644 --- a/dbms/include/DB/Storages/StorageReplicatedMergeTree.h +++ b/dbms/include/DB/Storages/StorageReplicatedMergeTree.h @@ -293,6 +293,7 @@ private: /// Поток, обрабатывающий переподключение к ZooKeeper при истечении сессии (очень маловероятное событие). 
std::thread restarting_thread; + Poco::Event restarting_event; /// Поток, следящий за изменениями списка столбцов в ZooKeeper и обновляющий куски в соответствии с этими изменениями. std::thread alter_thread; @@ -311,7 +312,6 @@ private: Poco::Event shutdown_event; /// Нужно ли завершить restarting_thread. volatile bool permanent_shutdown_called = false; - Poco::Event permanent_shutdown_event; StorageReplicatedMergeTree( const String & zookeeper_path_, diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index c8f4f8b68f7..6c6ebe8f3d1 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -52,7 +52,7 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( std::bind(&StorageReplicatedMergeTree::enqueuePartForCheck, this, std::placeholders::_1)), reader(data), writer(data), merger(data), fetcher(data), log(&Logger::get(database_name_ + "." + table_name + " (StorageReplicatedMergeTree)")), - shutdown_event(false), permanent_shutdown_event(false) + shutdown_event(false) { if (!zookeeper) { @@ -1117,6 +1117,15 @@ void StorageReplicatedMergeTree::queueUpdatingThread() queue_updating_event->wait(); } + catch (zkutil::KeeperException & e) + { + if (e.code == ZINVALIDSTATE) + restarting_event.set(); + + tryLogCurrentException(__PRETTY_FUNCTION__); + + queue_updating_event->tryWait(ERROR_SLEEP_MS); + } catch (...) 
{ tryLogCurrentException(__PRETTY_FUNCTION__); @@ -1842,7 +1851,7 @@ void StorageReplicatedMergeTree::shutdown() } permanent_shutdown_called = true; - permanent_shutdown_event.set(); + restarting_event.set(); restarting_thread.join(); endpoint_holder = nullptr; @@ -1891,7 +1900,7 @@ void StorageReplicatedMergeTree::goReadOnly() is_read_only = true; permanent_shutdown_called = true; - permanent_shutdown_event.set(); + restarting_event.set(); partialShutdown(); } @@ -1942,7 +1951,7 @@ void StorageReplicatedMergeTree::restartingThread() startup(); } - permanent_shutdown_event.tryWait(60 * 1000); + restarting_event.tryWait(60 * 1000); } } catch (...) diff --git a/libs/libzkutil/src/ZooKeeper.cpp b/libs/libzkutil/src/ZooKeeper.cpp index 35fd1831873..b825c58bbe8 100644 --- a/libs/libzkutil/src/ZooKeeper.cpp +++ b/libs/libzkutil/src/ZooKeeper.cpp @@ -482,15 +482,21 @@ void ZooKeeper::tryRemoveRecursive(const std::string & path) ZooKeeper::~ZooKeeper() { + LOG_INFO(&Logger::get("~ZooKeeper"), "Closing ZooKeeper session"); + int code = zookeeper_close(impl); if (code != ZOK) { LOG_ERROR(&Logger::get("~ZooKeeper"), "Failed to close ZooKeeper session: " << zerror(code)); } + LOG_INFO(&Logger::get("~ZooKeeper"), "Removing " << watch_store.size() << " watches"); + /// удаляем WatchWithEvent которые уже никогда не будут обработаны for (WatchWithEvent * watch : watch_store) delete watch; + + LOG_INFO(&Logger::get("~ZooKeeper"), "Removed watches"); } ZooKeeperPtr ZooKeeper::startNewSession() const From 8038d2ab6676be5805b65d9724168a84c0558178 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 9 Aug 2014 05:51:03 +0400 Subject: [PATCH 056/127] dbms: added function bar [#METR-12249]. 
--- .../DB/Functions/FunctionsMiscellaneous.h | 220 +++++++++++++++++- dbms/src/Functions/FunctionFactory.cpp | 1 + 2 files changed, 211 insertions(+), 10 deletions(-) diff --git a/dbms/include/DB/Functions/FunctionsMiscellaneous.h b/dbms/include/DB/Functions/FunctionsMiscellaneous.h index 29191faadc7..fb3042bc047 100644 --- a/dbms/include/DB/Functions/FunctionsMiscellaneous.h +++ b/dbms/include/DB/Functions/FunctionsMiscellaneous.h @@ -50,6 +50,8 @@ namespace DB * не предназначена для пользователя, а используется только как prerequisites для функций высшего порядка. * * sleep(n) - спит n секунд каждый блок. + * + * bar(x, min, max, width) - рисует полосу из количества символов, пропорционального (x - min) и равного width при x == max. */ @@ -274,7 +276,7 @@ public: { const IColumn & argument = *block.getByPosition(arguments[0]).column; if (!argument.isConst()) - throw Exception("Argument for function 'materialize' must be constant.", ErrorCodes::ILLEGAL_COLUMN); + throw Exception("Argument for function " + getName() + " must be constant.", ErrorCodes::ILLEGAL_COLUMN); block.getByPosition(result).column = dynamic_cast(argument).convertToFullColumn(); } @@ -354,7 +356,7 @@ public: DataTypePtr getReturnType(const DataTypes & arguments) const { if (arguments.size() < 2) - throw Exception("Function tuple requires at least two arguments.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + throw Exception("Function " + getName() + " requires at least two arguments.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); return new DataTypeTuple(arguments); } @@ -386,18 +388,18 @@ public: ExpressionActions::Actions & out_prerequisites) { if (arguments.size() != 2) - throw Exception("Function tupleElement requires exactly two arguments: tuple and element index.", + throw Exception("Function " + getName() + " requires exactly two arguments: tuple and element index.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); const ColumnConstUInt8 * index_col = 
typeid_cast(&*arguments[1].column); if (!index_col) - throw Exception("Second argument to tupleElement must be a constant UInt8", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + throw Exception("Second argument to " + getName() + " must be a constant UInt8", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); size_t index = index_col->getData(); const DataTypeTuple * tuple = typeid_cast(&*arguments[0].type); if (!tuple) - throw Exception("First argument for function tupleElement must be tuple.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + throw Exception("First argument for function " + getName() + " must be tuple.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); if (index == 0) throw Exception("Indices in tuples are 1-based.", ErrorCodes::ILLEGAL_INDEX); @@ -417,10 +419,10 @@ public: const ColumnConstUInt8 * index_col = typeid_cast(&*block.getByPosition(arguments[1]).column); if (!tuple_col) - throw Exception("First argument for function tupleElement must be tuple.", ErrorCodes::ILLEGAL_COLUMN); + throw Exception("First argument for function " + getName() + " must be tuple.", ErrorCodes::ILLEGAL_COLUMN); if (!index_col) - throw Exception("Second argument for function tupleElement must be UInt8 constant literal.", ErrorCodes::ILLEGAL_COLUMN); + throw Exception("Second argument for function " + getName() + " must be UInt8 constant literal.", ErrorCodes::ILLEGAL_COLUMN); size_t index = index_col->getData(); if (index == 0) @@ -472,11 +474,11 @@ public: DataTypePtr getReturnType(const DataTypes & arguments) const { if (arguments.size() != 1) - throw Exception("Function arrayJoin requires exactly one argument.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + throw Exception("Function " + getName() + " requires exactly one argument.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); const DataTypeArray * arr = typeid_cast(&*arguments[0]); if (!arr) - throw Exception("Argument for function arrayJoin must be Array.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + throw Exception("Argument for function " + 
getName() + " must be Array.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); return arr->getNestedType()->clone(); } @@ -484,7 +486,7 @@ public: /// Выполнить функцию над блоком. void execute(Block & block, const ColumnNumbers & arguments, size_t result) { - throw Exception("Function arrayJoin must not be executed directly.", ErrorCodes::FUNCTION_IS_SPECIAL); + throw Exception("Function " + getName() + " must not be executed directly.", ErrorCodes::FUNCTION_IS_SPECIAL); } }; @@ -539,4 +541,202 @@ class FunctionReplicate : public IFunction } }; + +class FunctionBar : public IFunction +{ +public: + /// Получить имя функции. + String getName() const + { + return "bar"; + } + + /// Получить тип результата по типам аргументов. Если функция неприменима для данных аргументов - кинуть исключение. + DataTypePtr getReturnType(const DataTypes & arguments) const + { + if (arguments.size() != 3 && arguments.size() != 4) + throw Exception("Function " + getName() + " requires from 3 or 4 parameters: value, min_value, max_value, [max_width_of_bar = 80]. Passed " + + toString(arguments.size()) + ".", + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + + if (!arguments[0]->isNumeric() || !arguments[1]->isNumeric() || !arguments[2]->isNumeric() + || (arguments.size() == 4 && !arguments[3]->isNumeric())) + throw Exception("All arguments for function " + getName() + " must be numeric.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + return new DataTypeString; + } + + /// Выполнить функцию над блоком. + void execute(Block & block, const ColumnNumbers & arguments, size_t result) + { + Int64 min = extractConstant(block, arguments, 1, "Second"); /// Уровень значения, при котором полоска имеет нулевую длину. + Int64 max = extractConstant(block, arguments, 2, "Third"); /// Уровень значения, при котором полоска имеет максимальную длину. + + /// Максимальная ширина полоски в символах, по-умолчанию. + Float64 max_width = arguments.size() == 4 + ? 
extractConstant(block, arguments, 3, "Fourth") + : 80; + + if (max_width < 1) + throw Exception("Max_width argument must be >= 1.", ErrorCodes::ARGUMENT_OUT_OF_BOUND); + + if (max_width > 1000) + throw Exception("Too large max_width.", ErrorCodes::ARGUMENT_OUT_OF_BOUND); + + const auto & src = *block.getByPosition(arguments[0]).column; + + if (src.isConst()) + { + auto res_column = new ColumnConstString(block.rowsInFirstColumn(), ""); + block.getByPosition(result).column = res_column; + + if ( executeConstNumber (src, *res_column, min, max, max_width) + || executeConstNumber (src, *res_column, min, max, max_width) + || executeConstNumber (src, *res_column, min, max, max_width) + || executeConstNumber (src, *res_column, min, max, max_width) + || executeConstNumber (src, *res_column, min, max, max_width) + || executeConstNumber (src, *res_column, min, max, max_width) + || executeConstNumber (src, *res_column, min, max, max_width) + || executeConstNumber (src, *res_column, min, max, max_width) + || executeConstNumber (src, *res_column, min, max, max_width) + || executeConstNumber (src, *res_column, min, max, max_width)) + { + } + else + throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName() + + " of argument of function " + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } + else + { + auto res_column = new ColumnString; + block.getByPosition(result).column = res_column; + + if ( executeNumber (src, *res_column, min, max, max_width) + || executeNumber (src, *res_column, min, max, max_width) + || executeNumber (src, *res_column, min, max, max_width) + || executeNumber (src, *res_column, min, max, max_width) + || executeNumber (src, *res_column, min, max, max_width) + || executeNumber (src, *res_column, min, max, max_width) + || executeNumber (src, *res_column, min, max, max_width) + || executeNumber (src, *res_column, min, max, max_width) + || executeNumber (src, *res_column, min, max, max_width) + || executeNumber (src, *res_column, min, 
max, max_width)) + { + } + else + throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName() + + " of argument of function " + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } + } + +private: + template + T extractConstant(Block & block, const ColumnNumbers & arguments, size_t argument_pos, const char * which_argument) const + { + const auto & column = *block.getByPosition(arguments[argument_pos]).column; + + if (!column.isConst()) + throw Exception(which_argument + String(" argument for function ") + getName() + " must be constant.", ErrorCodes::ILLEGAL_COLUMN); + + return apply_visitor(FieldVisitorConvertToNumber(), column[0]); + } + + static constexpr size_t BAR_CHAR_SIZE = strlen("█"); + + template + static Float64 barWidth(T x, Int64 min, Int64 max, Float64 max_width) + { + if (x <= min) + return 0; + + if (x >= max) + return max_width; + + return (x - min) * max_width / (max - min); + } + + static size_t barWidthInBytes(Float64 width) + { + return ceil(width - 1.0 / 8) * BAR_CHAR_SIZE; + } + + /// В dst должно быть место для barWidthInBytes(width) символов и завершающего нуля. + static void renderBar(Float64 width, char * dst) + { + size_t floor_width = floor(width); + + for (size_t i = 0; i < floor_width; ++i) + { + memcpy(dst, "█", BAR_CHAR_SIZE); + dst += BAR_CHAR_SIZE; + } + + size_t remainder = floor((width - floor_width) * 8); + + if (remainder) + { + memcpy(dst, &"▏▎▍▌▋▋▊▉"[(remainder - 1) * BAR_CHAR_SIZE], BAR_CHAR_SIZE); + dst += BAR_CHAR_SIZE; + } + + *dst = 0; + } + + template + static void fill(const PODArray & src, ColumnString::Chars_t & dst_chars, ColumnString::Offsets_t & dst_offsets, + Int64 min, Int64 max, Float64 max_width) + { + size_t size = src.size(); + size_t current_offset = 0; + + dst_offsets.resize(size); + dst_chars.reserve(size * (barWidthInBytes(max_width) + 1)); /// строки 0-terminated. 
+ + for (size_t i = 0; i < size; ++i) + { + Float64 width = barWidth(src[i], min, max, max_width); + size_t next_size = current_offset + barWidthInBytes(width) + 1; + dst_chars.resize(next_size); + renderBar(width, reinterpret_cast(&dst_chars[current_offset])); + current_offset = next_size; + dst_offsets[i] = current_offset; + } + } + + template + static void fill(T src, String & dst_chars, + Int64 min, Int64 max, Float64 max_width) + { + Float64 width = barWidth(src, min, max, max_width); + dst_chars.resize(barWidthInBytes(width)); + renderBar(width, &dst_chars[0]); + } + + template + static bool executeNumber(const IColumn & src, ColumnString & dst, Int64 min, Int64 max, Float64 max_width) + { + if (const ColumnVector * col = typeid_cast *>(&src)) + { + fill(col->getData(), dst.getChars(), dst.getOffsets(), min, max, max_width); + return true; + } + else + return false; + } + + template + static bool executeConstNumber(const IColumn & src, ColumnConstString & dst, Int64 min, Int64 max, Float64 max_width) + { + if (const ColumnConst * col = typeid_cast *>(&src)) + { + fill(col->getData(), dst.getData(), min, max, max_width); + return true; + } + else + return false; + } +}; + } diff --git a/dbms/src/Functions/FunctionFactory.cpp b/dbms/src/Functions/FunctionFactory.cpp index 98529dc17e7..e5dcceec489 100644 --- a/dbms/src/Functions/FunctionFactory.cpp +++ b/dbms/src/Functions/FunctionFactory.cpp @@ -181,6 +181,7 @@ FunctionPtr FunctionFactory::get( else if (name == "hostName") return new FunctionHostName; else if (name == "visibleWidth") return new FunctionVisibleWidth; + else if (name == "bar") return new FunctionBar; else if (name == "toTypeName") return new FunctionToTypeName; else if (name == "blockSize") return new FunctionBlockSize; else if (name == "sleep") return new FunctionSleep; From 69ba035488c225b507f51fba63665b86554b5eb1 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 9 Aug 2014 06:29:03 +0400 Subject: [PATCH 057/127] dbms: fixed error with 
function negate on UInt32 [#METR-2944]. --- dbms/include/DB/Functions/FunctionsArithmetic.h | 2 +- dbms/tests/queries/0_stateless/00064_negate_bug.reference | 1 + dbms/tests/queries/0_stateless/00064_negate_bug.sql | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 dbms/tests/queries/0_stateless/00064_negate_bug.reference create mode 100644 dbms/tests/queries/0_stateless/00064_negate_bug.sql diff --git a/dbms/include/DB/Functions/FunctionsArithmetic.h b/dbms/include/DB/Functions/FunctionsArithmetic.h index 7996d8f5818..45c6834f316 100644 --- a/dbms/include/DB/Functions/FunctionsArithmetic.h +++ b/dbms/include/DB/Functions/FunctionsArithmetic.h @@ -221,7 +221,7 @@ struct NegateImpl static inline ResultType apply(A a) { - return -a; + return -static_cast(a); } }; diff --git a/dbms/tests/queries/0_stateless/00064_negate_bug.reference b/dbms/tests/queries/0_stateless/00064_negate_bug.reference new file mode 100644 index 00000000000..601e8c52f8b --- /dev/null +++ b/dbms/tests/queries/0_stateless/00064_negate_bug.reference @@ -0,0 +1 @@ +-1 Int64 diff --git a/dbms/tests/queries/0_stateless/00064_negate_bug.sql b/dbms/tests/queries/0_stateless/00064_negate_bug.sql new file mode 100644 index 00000000000..ba076770168 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00064_negate_bug.sql @@ -0,0 +1 @@ +SELECT -toUInt32(1) AS x, toTypeName(x) AS t From 9a946d73b9ffafafb1d15ee5a60c1e1ece71f12d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 10 Aug 2014 02:36:07 +0400 Subject: [PATCH 058/127] dbms: fixed error with formatting floating point literals in query [#METR-2944]. 
--- dbms/include/DB/Core/Field.h | 40 ++++++++++++++++--- .../00065_float_literals_formatting.reference | 2 + .../00065_float_literals_formatting.sql | 1 + 3 files changed, 37 insertions(+), 6 deletions(-) create mode 100644 dbms/tests/queries/0_stateless/00065_float_literals_formatting.reference create mode 100644 dbms/tests/queries/0_stateless/00065_float_literals_formatting.sql diff --git a/dbms/include/DB/Core/Field.h b/dbms/include/DB/Core/Field.h index 48558f9717e..7bf0fe6e288 100644 --- a/dbms/include/DB/Core/Field.h +++ b/dbms/include/DB/Core/Field.h @@ -582,11 +582,39 @@ private: writeQuoted(x, wb); return res; } + + /** В отличие от writeFloatText (и writeQuoted), если число после форматирования выглядит целым, всё равно добавляет десятичную точку. + * - для того, чтобы это число могло обратно распарситься как Float64 парсером запроса (иначе распарсится как целое). + * + * При этом, не оставляет завершающие нули справа. + * + * NOTE: При таком roundtrip-е, точность может теряться. 
+ */ + static inline String formatFloat(Float64 x) + { + char tmp[24]; + int res = std::snprintf(tmp, 23, "%.*g", WRITE_HELPERS_DEFAULT_FLOAT_PRECISION, x); + + if (res >= 23 || res <= 0) + throw Exception("Cannot print float or double number", ErrorCodes::CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER); + + size_t string_size = res; + + tmp[23] = '\0'; + if (string_size == strspn(tmp, "-0123456789")) + { + tmp[string_size] = '.'; + ++string_size; + } + + return {tmp, string_size}; + } + public: String operator() (const Null & x) const { return "NULL"; } String operator() (const UInt64 & x) const { return formatQuoted(x); } String operator() (const Int64 & x) const { return formatQuoted(x); } - String operator() (const Float64 & x) const { return formatQuoted(x); } + String operator() (const Float64 & x) const { return formatFloat(x); } String operator() (const String & x) const { return formatQuoted(x); } String operator() (const Array & x) const @@ -689,7 +717,7 @@ namespace DB { class ReadBuffer; class WriteBuffer; - + /// Предполагается что у всех элементов массива одинаковый тип. 
inline void readBinary(Array & x, ReadBuffer & buf) { @@ -745,7 +773,7 @@ namespace DB }; } } - + inline void readText(Array & x, ReadBuffer & buf) { throw Exception("Cannot read Array.", ErrorCodes::NOT_IMPLEMENTED); } inline void readQuoted(Array & x, ReadBuffer & buf) { throw Exception("Cannot read Array.", ErrorCodes::NOT_IMPLEMENTED); } @@ -758,7 +786,7 @@ namespace DB type = x.front().getType(); DB::writeBinary(type, buf); DB::writeBinary(size, buf); - + for (Array::const_iterator it = x.begin(); it != x.end(); ++it) { switch (type) @@ -792,13 +820,13 @@ namespace DB }; } } - + inline void writeText(const Array & x, WriteBuffer & buf) { DB::String res = apply_visitor(DB::FieldVisitorToString(), DB::Field(x)); buf.write(res.data(), res.size()); } - + inline void writeQuoted(const Array & x, WriteBuffer & buf) { throw Exception("Cannot write Array quoted.", ErrorCodes::NOT_IMPLEMENTED); } } diff --git a/dbms/tests/queries/0_stateless/00065_float_literals_formatting.reference b/dbms/tests/queries/0_stateless/00065_float_literals_formatting.reference new file mode 100644 index 00000000000..f3ba2eafb28 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00065_float_literals_formatting.reference @@ -0,0 +1,2 @@ +Float64 +Float64 diff --git a/dbms/tests/queries/0_stateless/00065_float_literals_formatting.sql b/dbms/tests/queries/0_stateless/00065_float_literals_formatting.sql new file mode 100644 index 00000000000..0b3bffdcf9d --- /dev/null +++ b/dbms/tests/queries/0_stateless/00065_float_literals_formatting.sql @@ -0,0 +1 @@ +SELECT toTypeName(1.0) FROM remote('127.0.0.{1,2}', system, one) From 9e0976aef48086782690a346d3142b54bc98b407 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 10 Aug 2014 02:40:09 +0400 Subject: [PATCH 059/127] dbms: updated tests [#METR-2944]. 
--- dbms/tests/queries/0_stateless/00031_parser_number.reference | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/tests/queries/0_stateless/00031_parser_number.reference b/dbms/tests/queries/0_stateless/00031_parser_number.reference index dba9f499bfc..aace798796f 100644 --- a/dbms/tests/queries/0_stateless/00031_parser_number.reference +++ b/dbms/tests/queries/0_stateless/00031_parser_number.reference @@ -1,3 +1,3 @@ -0 1 -1 128 -127 -128 255 -128 255 -127 65535 4294967295 12300 4656 -0 -0 0 18446744073709551615 2.09883e+19 -1.84467e+19 -9223372036854775807 -8.98847e+307 -2.22507e-308 inf -inf nan -nan 1e-302 UInt8 UInt8 Int8 UInt8 Int8 Int8 UInt8 Int8 UInt8 Int8 UInt16 UInt32 Float64 Float64 Float64 Float64 UInt8 UInt64 Float64 Float64 Int64 Float64 Float64 Float64 Float64 Float64 Float32 Float64 +0 1 -1 128 -127 -128 255 -128 255 -127 65535 4294967295 12300 4656 -0 -0 0 18446744073709551615 2.09883e+19 -1.84467e+19 -9223372036854775807 -8.98847e+307 -2.22507e-308 inf -inf nan -nan 1e-302 UInt8 UInt8 Int8 UInt8 Int8 Int8 UInt8 Int8 UInt8 Int8 UInt16 UInt32 Float64 Float64 Float64 Float64 Float64 UInt64 Float64 Float64 Int64 Float64 Float64 Float64 Float64 Float64 Float32 Float64 1e+308 -1e-307 From 971636f733497da1e20646a62bfa8c071631ac96 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Mon, 11 Aug 2014 14:59:56 +0400 Subject: [PATCH 060/127] added files missing in previous commit. 
[#METR-12236] --- .../src/tests/zkutil_expiration_test.cpp | 67 +++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 libs/libzkutil/src/tests/zkutil_expiration_test.cpp diff --git a/libs/libzkutil/src/tests/zkutil_expiration_test.cpp b/libs/libzkutil/src/tests/zkutil_expiration_test.cpp new file mode 100644 index 00000000000..96ab72c7114 --- /dev/null +++ b/libs/libzkutil/src/tests/zkutil_expiration_test.cpp @@ -0,0 +1,67 @@ +#include +#include + + +/// Проверяет, какие ошибки выдает ZooKeeper при попытке сделать какую-нибудь операцию через разное время после истечения сессии. +/// Спойлер: multi иногда падает с segfault, а до этого фейлится с marshalling error. +/// create всегда фейлится с invalid zhandle state. + +int main(int argc, char ** argv) +{ + try + { + if (argc != 2) + { + std::cerr << "usage: " << argv[0] << " hosts" << std::endl; + return 2; + } + + Poco::AutoPtr channel = new Poco::ConsoleChannel(std::cerr); + Logger::root().setChannel(channel); + Logger::root().setLevel("trace"); + + zkutil::ZooKeeper zk(argv[1]); + std::string unused; + zk.tryCreate("/test", "", zkutil::CreateMode::Persistent, unused); + + std::cerr << "Please run `./nozk.sh && sleep 40s && ./yeszk.sh`" << std::endl; + + time_t time0 = time(0); + + while (true) + { + { + zkutil::Ops ops; + ops.push_back(new zkutil::Op::Create("/test/zk_expiration_test", "hello", zk.getDefaultACL(), zkutil::CreateMode::Persistent)); + ops.push_back(new zkutil::Op::Remove("/test/zk_expiration_test", -1)); + + int code; + try + { + code = zk.tryMulti(ops);std::string unused; + //code = zk.tryCreate("/test", "", zkutil::CreateMode::Persistent, unused); + } + catch (zkutil::KeeperException & e) + { + code = e.code; + } + + std::cout << time(0) - time0 << "s: " << zkutil::ZooKeeper::error2string(code) << std::endl; + } + + sleep(1); + } + } + catch (zkutil::KeeperException & e) + { + std::cerr << "KeeperException: " << e.displayText() << std::endl; + return 1; + } + catch (...) 
+ { + std::cerr << "Some exception" << std::endl; + return 2; + } + + return 0; +} From dd90d58dffc38bfcedb9812b42577c4c1d05a675 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Mon, 11 Aug 2014 15:24:10 +0400 Subject: [PATCH 061/127] zkutil: avoiding segfault in multi on session expiration. [#METR-12236] --- libs/libzkutil/src/ZooKeeper.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/libs/libzkutil/src/ZooKeeper.cpp b/libs/libzkutil/src/ZooKeeper.cpp index b825c58bbe8..523ebd93114 100644 --- a/libs/libzkutil/src/ZooKeeper.cpp +++ b/libs/libzkutil/src/ZooKeeper.cpp @@ -369,6 +369,12 @@ int32_t ZooKeeper::multiImpl(const Ops & ops_, OpResultsPtr * out_results_) if (ops_.empty()) return ZOK; + /// Workaround ошибки в сишном клиенте ZooKeeper. Если сессия истекла, zoo_multi иногда падает с segfault. + /// Наверно, здесь есть race condition, и возможен segfault, если сессия истечет между этой проверкой и zoo_multi. + /// TODO: Посмотреть, не исправлено ли это в последней версии клиента, и исправить. 
+ if (expired()) + return ZINVALIDSTATE; + size_t count = ops_.size(); OpResultsPtr out_results(new OpResults(count)); From 7b11fc4f6c5736fe208acbc76c23fd29683108b2 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Mon, 11 Aug 2014 18:00:24 +0400 Subject: [PATCH 062/127] Merge --- dbms/include/DB/Core/ErrorCodes.h | 1 + .../DB/Storages/StorageReplicatedMergeTree.h | 2 +- .../Storages/StorageReplicatedMergeTree.cpp | 209 +++++++++++------- 3 files changed, 133 insertions(+), 79 deletions(-) diff --git a/dbms/include/DB/Core/ErrorCodes.h b/dbms/include/DB/Core/ErrorCodes.h index 4d01ba598a6..ac69ce20aaa 100644 --- a/dbms/include/DB/Core/ErrorCodes.h +++ b/dbms/include/DB/Core/ErrorCodes.h @@ -258,6 +258,7 @@ namespace ErrorCodes INVALID_PARTITION_NAME, NOT_LEADER, NOT_ENOUGH_BLOCK_NUMBERS, + NO_SUCH_REPLICA, POCO_EXCEPTION = 1000, STD_EXCEPTION, diff --git a/dbms/include/DB/Storages/StorageReplicatedMergeTree.h b/dbms/include/DB/Storages/StorageReplicatedMergeTree.h index 09d4d3d1d74..258c826a844 100644 --- a/dbms/include/DB/Storages/StorageReplicatedMergeTree.h +++ b/dbms/include/DB/Storages/StorageReplicatedMergeTree.h @@ -332,7 +332,7 @@ private: /** Создает минимальный набор нод в ZooKeeper. */ - void createTable(); + void createTableIfNotExists(); /** Создает реплику в ZooKeeper и добавляет в очередь все, что нужно, чтобы догнать остальные реплики. 
*/ diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index 6c6ebe8f3d1..f45faf40403 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -69,8 +69,7 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( if (!attach) { - if (!zookeeper->exists(zookeeper_path)) - createTable(); + createTableIfNotExists(); checkTableStructure(false); createReplica(); @@ -151,11 +150,12 @@ static String formattedAST(const ASTPtr & ast) return ss.str(); } -void StorageReplicatedMergeTree::createTable() +void StorageReplicatedMergeTree::createTableIfNotExists() { - LOG_DEBUG(log, "Creating table " << zookeeper_path); + if (zookeeper->exists(zookeeper_path)) + return; - zookeeper->create(zookeeper_path, "", zkutil::CreateMode::Persistent); + LOG_DEBUG(log, "Creating table " << zookeeper_path); /// Запишем метаданные таблицы, чтобы реплики могли сверять с ними параметры таблицы. 
std::stringstream metadata; @@ -167,17 +167,31 @@ void StorageReplicatedMergeTree::createTable() metadata << "sign column: " << data.sign_column << std::endl; metadata << "primary key: " << formattedAST(data.primary_expr_ast) << std::endl; - zookeeper->create(zookeeper_path + "/metadata", metadata.str(), zkutil::CreateMode::Persistent); - zookeeper->create(zookeeper_path + "/columns", data.getColumnsList().toString(), zkutil::CreateMode::Persistent); + zkutil::Ops ops; + ops.push_back(new zkutil::Op::Create(zookeeper_path, "", + zookeeper->getDefaultACL(), zkutil::CreateMode::Persistent)); + ops.push_back(new zkutil::Op::Create(zookeeper_path + "/metadata", metadata.str(), + zookeeper->getDefaultACL(), zkutil::CreateMode::Persistent)); + ops.push_back(new zkutil::Op::Create(zookeeper_path + "/columns", data.getColumnsList().toString(), + zookeeper->getDefaultACL(), zkutil::CreateMode::Persistent)); + ops.push_back(new zkutil::Op::Create(zookeeper_path + "/log", "", + zookeeper->getDefaultACL(), zkutil::CreateMode::Persistent)); + ops.push_back(new zkutil::Op::Create(zookeeper_path + "/blocks", "", + zookeeper->getDefaultACL(), zkutil::CreateMode::Persistent)); + ops.push_back(new zkutil::Op::Create(zookeeper_path + "/block_numbers", "", + zookeeper->getDefaultACL(), zkutil::CreateMode::Persistent)); + ops.push_back(new zkutil::Op::Create(zookeeper_path + "/nonincrement_block_numbers", "", + zookeeper->getDefaultACL(), zkutil::CreateMode::Persistent)); + ops.push_back(new zkutil::Op::Create(zookeeper_path + "/leader_election", "", + zookeeper->getDefaultACL(), zkutil::CreateMode::Persistent)); + ops.push_back(new zkutil::Op::Create(zookeeper_path + "/temp", "", + zookeeper->getDefaultACL(), zkutil::CreateMode::Persistent)); + ops.push_back(new zkutil::Op::Create(zookeeper_path + "/replicas", "", + zookeeper->getDefaultACL(), zkutil::CreateMode::Persistent)); - zookeeper->create(zookeeper_path + "/log", "", zkutil::CreateMode::Persistent); - 
zookeeper->create(zookeeper_path + "/blocks", "", zkutil::CreateMode::Persistent); - zookeeper->create(zookeeper_path + "/block_numbers", "", zkutil::CreateMode::Persistent); - zookeeper->create(zookeeper_path + "/nonincrement_block_numbers", "", zkutil::CreateMode::Persistent); - zookeeper->create(zookeeper_path + "/leader_election", "", zkutil::CreateMode::Persistent); - zookeeper->create(zookeeper_path + "/temp", "", zkutil::CreateMode::Persistent); - /// Создадим replicas в последнюю очередь, чтобы нельзя было добавить реплику, пока все остальные ноды не созданы. - zookeeper->create(zookeeper_path + "/replicas", "", zkutil::CreateMode::Persistent); + auto code = zookeeper->tryMulti(ops); + if (code != ZOK && code != ZNODEEXISTS) + throw zkutil::KeeperException(code); } /** Проверить, что список столбцов и настройки таблицы совпадают с указанными в ZK (/metadata). @@ -230,81 +244,120 @@ void StorageReplicatedMergeTree::createReplica() { LOG_DEBUG(log, "Creating replica " << replica_path); - /** Запомним список других реплик. - * NOTE: Здесь есть race condition. Если почти одновременно добавить нескольких реплик, сразу же начиная в них писать, - * небольшая часть данных может не реплицироваться. - */ - Strings replicas = zookeeper->getChildren(zookeeper_path + "/replicas"); - - /// Создадим пустую реплику. - zookeeper->create(replica_path, "", zkutil::CreateMode::Persistent); - zookeeper->create(replica_path + "/columns", data.getColumnsList().toString(), zkutil::CreateMode::Persistent); - zookeeper->create(replica_path + "/host", "", zkutil::CreateMode::Persistent); - zookeeper->create(replica_path + "/log_pointer", "", zkutil::CreateMode::Persistent); - zookeeper->create(replica_path + "/queue", "", zkutil::CreateMode::Persistent); - zookeeper->create(replica_path + "/parts", "", zkutil::CreateMode::Persistent); - zookeeper->create(replica_path + "/flags", "", zkutil::CreateMode::Persistent); + /// Создадим пустую реплику. 
Ноду columns создадим в конце - будем использовать ее в качестве признака, что создание реплики завершено. + zkutil::Ops ops; + ops.push_back(new zkutil::Op::Create(replica_path, "", zookeeper->getDefaultACL(), zkutil::CreateMode::Persistent)); + ops.push_back(new zkutil::Op::Create(replica_path + "/host", "", zookeeper->getDefaultACL(), zkutil::CreateMode::Persistent)); + ops.push_back(new zkutil::Op::Create(replica_path + "/log_pointer", "", zookeeper->getDefaultACL(), zkutil::CreateMode::Persistent)); + ops.push_back(new zkutil::Op::Create(replica_path + "/queue", "", zookeeper->getDefaultACL(), zkutil::CreateMode::Persistent)); + ops.push_back(new zkutil::Op::Create(replica_path + "/parts", "", zookeeper->getDefaultACL(), zkutil::CreateMode::Persistent)); + ops.push_back(new zkutil::Op::Create(replica_path + "/flags", "", zookeeper->getDefaultACL(), zkutil::CreateMode::Persistent)); + zookeeper->multi(ops); /** Нужно изменить данные ноды /replicas на что угодно, чтобы поток, удаляющий старые записи в логе, * споткнулся об это изменение и не удалил записи, которые мы еще не прочитали. */ zookeeper->set(zookeeper_path + "/replicas", "last added replica: " + replica_name); - if (replicas.empty()) + Strings replicas = zookeeper->getChildren(zookeeper_path + "/replicas"); + + /** "Эталонная" реплика, у которой мы возьмем информацию о множестве кусков, очередь и указатель на лог. + * Возьмем случайную из реплик, созданных раньше этой. 
+ */ + String source_replica; + + Stat stat; + zookeeper->exists(replica_path, &stat); + auto my_create_time = stat.czxid; + + std::random_shuffle(replicas.begin(), replicas.end()); + for (const String & replica : replicas) { - LOG_DEBUG(log, "No other replicas"); - return; + if (!zookeeper->exists(zookeeper_path + "/replicas/" + replica, &stat)) + throw Exception("Replica " + zookeeper_path + "/replicas/" + replica + " was removed from right under our feet.", + ErrorCodes::NO_SUCH_REPLICA); + if (stat.czxid < my_create_time) + { + source_replica = replica; + break; + } } - /// "Эталонная" реплика, у которой мы возьмем информацию о множестве кусков, очередь и указатель на лог. - String source_replica = replicas[rand() % replicas.size()]; - - LOG_INFO(log, "Will mimic " << source_replica); - - String source_path = zookeeper_path + "/replicas/" + source_replica; - - /// Порядок следующих трех действий важен. Записи в логе могут продублироваться, но не могут потеряться. - - /// Скопируем у эталонной реплики ссылку на лог. - zookeeper->set(replica_path + "/log_pointer", zookeeper->get(source_path + "/log_pointer")); - - /// Запомним очередь эталонной реплики. - Strings source_queue_names = zookeeper->getChildren(source_path + "/queue"); - std::sort(source_queue_names.begin(), source_queue_names.end()); - Strings source_queue; - for (const String & entry_name : source_queue_names) + if (source_replica.empty()) { - String entry; - if (!zookeeper->tryGet(source_path + "/queue/" + entry_name, entry)) - continue; - source_queue.push_back(entry); + LOG_INFO(log, "This is the first replica"); + } + else + { + LOG_INFO(log, "Will mimic " << source_replica); + + String source_path = zookeeper_path + "/replicas/" + source_replica; + + /** Если эталонная реплика еще не до конца создана, подождем. + * NOTE: Если при ее создании что-то пошло не так, можем провисеть тут вечно. 
+ * Можно создавать на время создания эфемерную ноду, чтобы быть уверенным, что реплика создается, а не заброшена. + * То же можно делать и для таблицы. Можно автоматически удалять ноду реплики/таблицы, + * если видно, что она создана не до конца, а создающий ее умер. + */ + while (!zookeeper->exists(source_path + "/columns")) + { + LOG_INFO(log, "Waiting for replica " << source_path << " to be fully created"); + + zkutil::EventPtr event = new Poco::Event; + if (zookeeper->exists(source_path + "/columns", nullptr, event)) + { + LOG_WARNING(log, "Oops, a watch has leaked"); + break; + } + + event->wait(); + } + + /// Порядок следующих трех действий важен. Записи в логе могут продублироваться, но не могут потеряться. + + /// Скопируем у эталонной реплики ссылку на лог. + zookeeper->set(replica_path + "/log_pointer", zookeeper->get(source_path + "/log_pointer")); + + /// Запомним очередь эталонной реплики. + Strings source_queue_names = zookeeper->getChildren(source_path + "/queue"); + std::sort(source_queue_names.begin(), source_queue_names.end()); + Strings source_queue; + for (const String & entry_name : source_queue_names) + { + String entry; + if (!zookeeper->tryGet(source_path + "/queue/" + entry_name, entry)) + continue; + source_queue.push_back(entry); + } + + /// Добавим в очередь задания на получение всех активных кусков, которые есть у эталонной реплики. 
+ Strings parts = zookeeper->getChildren(source_path + "/parts"); + ActiveDataPartSet active_parts_set; + for (const String & part : parts) + { + active_parts_set.add(part); + } + Strings active_parts = active_parts_set.getParts(); + for (const String & name : active_parts) + { + LogEntry log_entry; + log_entry.type = LogEntry::GET_PART; + log_entry.source_replica = ""; + log_entry.new_part_name = name; + + zookeeper->create(replica_path + "/queue/queue-", log_entry.toString(), zkutil::CreateMode::PersistentSequential); + } + LOG_DEBUG(log, "Queued " << active_parts.size() << " parts to be fetched"); + + /// Добавим в очередь содержимое очереди эталонной реплики. + for (const String & entry : source_queue) + { + zookeeper->create(replica_path + "/queue/queue-", entry, zkutil::CreateMode::PersistentSequential); + } + LOG_DEBUG(log, "Copied " << source_queue.size() << " queue entries"); } - /// Добавим в очередь задания на получение всех активных кусков, которые есть у эталонной реплики. - Strings parts = zookeeper->getChildren(source_path + "/parts"); - ActiveDataPartSet active_parts_set; - for (const String & part : parts) - { - active_parts_set.add(part); - } - Strings active_parts = active_parts_set.getParts(); - for (const String & name : active_parts) - { - LogEntry log_entry; - log_entry.type = LogEntry::GET_PART; - log_entry.source_replica = ""; - log_entry.new_part_name = name; - - zookeeper->create(replica_path + "/queue/queue-", log_entry.toString(), zkutil::CreateMode::PersistentSequential); - } - LOG_DEBUG(log, "Queued " << active_parts.size() << " parts to be fetched"); - - /// Добавим в очередь содержимое очереди эталонной реплики. 
- for (const String & entry : source_queue) - { - zookeeper->create(replica_path + "/queue/queue-", entry, zkutil::CreateMode::PersistentSequential); - } - LOG_DEBUG(log, "Copied " << source_queue.size() << " queue entries"); + zookeeper->create(replica_path + "/columns", data.getColumnsList().toString(), zkutil::CreateMode::Persistent); } void StorageReplicatedMergeTree::activateReplica() From 5e0e9bbc001497fe7acc92b1257a8b26048f47f4 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Mon, 11 Aug 2014 18:05:38 +0400 Subject: [PATCH 063/127] Merge --- dbms/src/Storages/StorageReplicatedMergeTree.cpp | 2 ++ libs/libzkutil/include/zkutil/ZooKeeper.h | 4 ++++ libs/libzkutil/src/ZooKeeper.cpp | 13 +++++++++++++ 3 files changed, 19 insertions(+) diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index f45faf40403..3ddb8c49400 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -157,6 +157,8 @@ void StorageReplicatedMergeTree::createTableIfNotExists() LOG_DEBUG(log, "Creating table " << zookeeper_path); + zookeeper->createAncestors(zookeeper_path); + /// Запишем метаданные таблицы, чтобы реплики могли сверять с ними параметры таблицы. std::stringstream metadata; metadata << "metadata format version: 1" << std::endl; diff --git a/libs/libzkutil/include/zkutil/ZooKeeper.h b/libs/libzkutil/include/zkutil/ZooKeeper.h index e45db4addb1..999c68319b6 100644 --- a/libs/libzkutil/include/zkutil/ZooKeeper.h +++ b/libs/libzkutil/include/zkutil/ZooKeeper.h @@ -84,6 +84,10 @@ public: */ void createIfNotExists(const std::string & path, const std::string & data); + /** Создает всех еще не существующих предков ноды, с пустыми данными. Саму указанную ноду не создает. + */ + void createAncestors(const std::string & path); + /** Удалить ноду, если ее версия равна version (если -1, подойдет любая версия). 
*/ void remove(const std::string & path, int32_t version = -1); diff --git a/libs/libzkutil/src/ZooKeeper.cpp b/libs/libzkutil/src/ZooKeeper.cpp index 523ebd93114..d0584bf2ea4 100644 --- a/libs/libzkutil/src/ZooKeeper.cpp +++ b/libs/libzkutil/src/ZooKeeper.cpp @@ -236,6 +236,19 @@ void ZooKeeper::createIfNotExists(const std::string & path, const std::string & throw KeeperException(code, path); } +void ZooKeeper::createAncestors(const std::string & path) +{ + size_t pos = 1; + while (true) + { + pos = path.find('/', pos); + if (pos == std::string::npos) + break; + createIfNotExists(path.substr(0, pos), ""); + ++pos; + } +} + int32_t ZooKeeper::removeImpl(const std::string & path, int32_t version) { int32_t code = zoo_delete(impl, path.c_str(), version); From 583f4c114e9c18f46907e416e11ac5092ac9097b Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Mon, 11 Aug 2014 19:59:01 +0400 Subject: [PATCH 064/127] Merge --- dbms/include/DB/Common/Macros.h | 27 +++++++++ dbms/include/DB/Interpreters/Context.h | 7 ++- dbms/src/Common/Macros.cpp | 60 +++++++++++++++++++ dbms/src/Interpreters/Context.cpp | 11 ++++ dbms/src/Server/Server.cpp | 3 + .../Storages/StorageReplicatedMergeTree.cpp | 5 +- 6 files changed, 110 insertions(+), 3 deletions(-) create mode 100644 dbms/include/DB/Common/Macros.h create mode 100644 dbms/src/Common/Macros.cpp diff --git a/dbms/include/DB/Common/Macros.h b/dbms/include/DB/Common/Macros.h new file mode 100644 index 00000000000..2855eb6b750 --- /dev/null +++ b/dbms/include/DB/Common/Macros.h @@ -0,0 +1,27 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ + +/** Раскрывает в строке макросы из конфига. + */ +class Macros +{ +public: + Macros(); + Macros(const Poco::Util::AbstractConfiguration & config, const String & key); + + /// Заменить в строке подстроки вида {macro_name} на значение для macro_name, полученное из конфига. 
+ String expand(const String & s) const; + +private: + typedef std::map MacroMap; + + MacroMap macros; +}; + +} diff --git a/dbms/include/DB/Interpreters/Context.h b/dbms/include/DB/Interpreters/Context.h index b837c7fc673..5dded8069ca 100644 --- a/dbms/include/DB/Interpreters/Context.h +++ b/dbms/include/DB/Interpreters/Context.h @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -95,8 +96,9 @@ struct ContextShared ViewDependencies view_dependencies; /// Текущие зависимости ConfigurationPtr users_config; /// Конфиг с секциями users, profiles и quotas. InterserverIOHandler interserver_io_handler; /// Обработчик для межсерверной передачи данных. - String default_replica_name; /// Имя реплики из конфига. + String default_replica_name; /// Имя реплики из конфига. DEPRECATED BackgroundProcessingPoolPtr background_pool; /// Пул потоков для фоновой работы, выполняемой таблицами. + Macros macros; /// Подстановки из конфига. /// Кластеры для distributed таблиц /// Создаются при создании Distributed таблиц, так как нужно дождаться пока будут выставлены Settings @@ -242,6 +244,9 @@ public: String getDefaultReplicaName() const; void setDefaultReplicaName(const String & name); + const Macros & getMacros() const; + void setMacros(Macros && macros); + Settings getSettings() const; void setSettings(const Settings & settings_); diff --git a/dbms/src/Common/Macros.cpp b/dbms/src/Common/Macros.cpp new file mode 100644 index 00000000000..49b76ebb11d --- /dev/null +++ b/dbms/src/Common/Macros.cpp @@ -0,0 +1,60 @@ +#include +#include +#include + +namespace DB +{ + +Macros::Macros() {} + +Macros::Macros(const Poco::Util::AbstractConfiguration & config, const String & root_key) +{ + Poco::Util::AbstractConfiguration::Keys keys; + config.keys(root_key, keys); + for (const String & key : keys) + { + macros[key] = config.getString(root_key + "." 
+ key); + } +} + +String Macros::expand(const String & s) const +{ + if (s.find('{') == String::npos) + return s; + + String res; + size_t pos = 0; + while (true) + { + size_t begin = s.find('{', pos); + + if (begin == String::npos) + { + res.append(s, pos, String::npos); + break; + } + else + { + res.append(s, pos, begin - pos); + } + + ++begin; + size_t end = s.find('}', begin); + if (end == String::npos) + throw Exception("Unbalanced { and } in string with macros: \"" + s + "\"", ErrorCodes::SYNTAX_ERROR); + + String macro_name = s.substr(begin, end - begin); + + auto it = macros.find(macro_name); + if (it == macros.end()) + throw Exception("No macro " + macro_name + " in config", ErrorCodes::SYNTAX_ERROR); + + res += it->second; + + pos = end + 1; + } + + return res; +} + +} diff --git a/dbms/src/Interpreters/Context.cpp b/dbms/src/Interpreters/Context.cpp index e35f923b089..1732c05280e 100644 --- a/dbms/src/Interpreters/Context.cpp +++ b/dbms/src/Interpreters/Context.cpp @@ -459,6 +459,17 @@ void Context::setDefaultReplicaName(const String & name) shared->default_replica_name = name; } +const Macros& Context::getMacros() const +{ + return shared->macros; +} + +void Context::setMacros(Macros && macros) +{ + /// Полагаемся, что это присваивание происходит один раз при старте сервера. Если это не так, нужно использовать мьютекс. 
+ shared->macros = macros; +} + Context & Context::getSessionContext() { diff --git a/dbms/src/Server/Server.cpp b/dbms/src/Server/Server.cpp index 7263154c7bf..ebae073cbd8 100644 --- a/dbms/src/Server/Server.cpp +++ b/dbms/src/Server/Server.cpp @@ -373,6 +373,9 @@ int Server::main(const std::vector & args) if (config().has("replica_name")) global_context->setDefaultReplicaName(config().getString("replica_name")); + if (config().has("macros")) + global_context->setMacros(Macros(config(), "macros")); + std::string users_config_path = config().getString("users_config", config().getString("config-file", "config.xml")); auto users_config_reloader = stdext::make_unique(users_config_path, global_context.get()); diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index 3ddb8c49400..fab1636d561 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -45,8 +45,9 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( const MergeTreeSettings & settings_) : context(context_), zookeeper(context.getZooKeeper()), database_name(database_name_), - table_name(name_), full_path(path_ + escapeForFileName(table_name) + '/'), zookeeper_path(zookeeper_path_), - replica_name(replica_name_), + table_name(name_), full_path(path_ + escapeForFileName(table_name) + '/'), + zookeeper_path(context.getMacros().expand(zookeeper_path_)), + replica_name(context.getMacros().expand(replica_name_)), data( full_path, columns_, context_, primary_expr_ast_, date_column_name_, sampling_expression_, index_granularity_, mode_, sign_column_, settings_, database_name_ + "." 
+ table_name, true, std::bind(&StorageReplicatedMergeTree::enqueuePartForCheck, this, std::placeholders::_1)), From 51f73643c635c2eda09b728e1addccc396945053 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 12 Aug 2014 04:15:17 +0400 Subject: [PATCH 065/127] =?UTF-8?q?=E2=96=88=E2=96=88=E2=96=88=E2=96=88?= =?UTF-8?q?=E2=96=88=E2=96=88=E2=96=88=E2=96=88=E2=96=88=E2=96=88=E2=96=88?= =?UTF-8?q?:=20fixed=20order=20of=20rows=20processing=20[#METR-12291].?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dbms/include/DB/DataStreams/RemoteBlockOutputStream.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dbms/include/DB/DataStreams/RemoteBlockOutputStream.h b/dbms/include/DB/DataStreams/RemoteBlockOutputStream.h index 712852b4941..4b88e15983b 100644 --- a/dbms/include/DB/DataStreams/RemoteBlockOutputStream.h +++ b/dbms/include/DB/DataStreams/RemoteBlockOutputStream.h @@ -14,8 +14,8 @@ namespace DB class RemoteBlockOutputStream : public IBlockOutputStream { public: - RemoteBlockOutputStream(Connection & connection_, const String & query_) - : connection(connection_), query(query_) + RemoteBlockOutputStream(Connection & connection_, const String & query_, Settings * settings_ = nullptr) + : connection(connection_), query(query_), settings(settings_) { } @@ -26,7 +26,7 @@ public: */ Block sendQueryAndGetSampleBlock() { - connection.sendQuery(query); + connection.sendQuery(query, "", QueryProcessingStage::Complete, settings); sent_query = true; Connection::Packet packet = connection.receivePacket(); @@ -95,6 +95,7 @@ public: private: Connection & connection; String query; + Settings * settings; Block sample_block; bool sent_query = false; From 51992989a6246f3536942a2ffa6c26b41bf3575e Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Tue, 12 Aug 2014 13:27:00 +0400 Subject: [PATCH 066/127] Merge --- .../DB/Storages/StorageReplicatedMergeTree.h | 4 +- .../Storages/StorageReplicatedMergeTree.cpp 
| 95 +++++++++++++------ 2 files changed, 68 insertions(+), 31 deletions(-) diff --git a/dbms/include/DB/Storages/StorageReplicatedMergeTree.h b/dbms/include/DB/Storages/StorageReplicatedMergeTree.h index 258c826a844..6dca2580cc4 100644 --- a/dbms/include/DB/Storages/StorageReplicatedMergeTree.h +++ b/dbms/include/DB/Storages/StorageReplicatedMergeTree.h @@ -358,11 +358,11 @@ private: void initVirtualParts(); /// Запустить или остановить фоновые потоки. Используется для частичной переинициализации при пересоздании сессии в ZooKeeper. - void startup(); + bool tryStartup(); /// Возвращает false, если недоступен ZooKeeper. void partialShutdown(); /// Запретить запись в таблицу и завершить все фоновые потоки. - void goReadOnly(); + void goReadOnlyPermanently(); /** Проверить, что чексумма куска совпадает с чексуммой того же куска на какой-нибудь другой реплике. diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index fab1636d561..7a0b94be517 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -60,7 +60,7 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( if (!attach) throw Exception("Can't create replicated table without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); - goReadOnly(); + goReadOnlyPermanently(); return; } @@ -1950,7 +1950,7 @@ void StorageReplicatedMergeTree::partialShutdown() LOG_TRACE(log, "Threads finished"); } -void StorageReplicatedMergeTree::goReadOnly() +void StorageReplicatedMergeTree::goReadOnlyPermanently() { LOG_INFO(log, "Going to read-only mode"); @@ -1961,33 +1961,55 @@ void StorageReplicatedMergeTree::goReadOnly() partialShutdown(); } -void StorageReplicatedMergeTree::startup() +bool StorageReplicatedMergeTree::tryStartup() { - shutdown_called = false; - shutdown_event.reset(); + try + { + activateReplica(); - merger.uncancelAll(); - if (unreplicated_merger) - unreplicated_merger->uncancelAll(); + leader_election 
= new zkutil::LeaderElection(zookeeper_path + "/leader_election", *zookeeper, + std::bind(&StorageReplicatedMergeTree::becomeLeader, this), replica_name); - activateReplica(); + /// Все, что выше, может бросить KeeperException, если что-то не так с ZK. + /// Все, что ниже, не должно бросать исключений. - leader_election = new zkutil::LeaderElection(zookeeper_path + "/leader_election", *zookeeper, - std::bind(&StorageReplicatedMergeTree::becomeLeader, this), replica_name); + shutdown_called = false; + shutdown_event.reset(); - queue_updating_thread = std::thread(&StorageReplicatedMergeTree::queueUpdatingThread, this); - cleanup_thread = std::thread(&StorageReplicatedMergeTree::cleanupThread, this); - alter_thread = std::thread(&StorageReplicatedMergeTree::alterThread, this); - part_check_thread = std::thread(&StorageReplicatedMergeTree::partCheckThread, this); - queue_task_handle = context.getBackgroundPool().addTask( - std::bind(&StorageReplicatedMergeTree::queueTask, this, std::placeholders::_1)); + merger.uncancelAll(); + if (unreplicated_merger) + unreplicated_merger->uncancelAll(); + + queue_updating_thread = std::thread(&StorageReplicatedMergeTree::queueUpdatingThread, this); + cleanup_thread = std::thread(&StorageReplicatedMergeTree::cleanupThread, this); + alter_thread = std::thread(&StorageReplicatedMergeTree::alterThread, this); + part_check_thread = std::thread(&StorageReplicatedMergeTree::partCheckThread, this); + queue_task_handle = context.getBackgroundPool().addTask( + std::bind(&StorageReplicatedMergeTree::queueTask, this, std::placeholders::_1)); + return true; + } + catch (zkutil::KeeperException & e) + { + replica_is_active_node = nullptr; + leader_election = nullptr; + LOG_ERROR(log, "Couldn't start replication: " << e.what() << ", " << e.displayText() << ", stack trace:\n" + << e.getStackTrace().toString()); + return false; + } + catch (...) 
+ { + replica_is_active_node = nullptr; + leader_election = nullptr; + throw; + } } void StorageReplicatedMergeTree::restartingThread() { try { - startup(); + while (!permanent_shutdown_called && !tryStartup()) + restarting_event.tryWait(10 * 1000); while (!permanent_shutdown_called) { @@ -1995,16 +2017,28 @@ void StorageReplicatedMergeTree::restartingThread() { LOG_WARNING(log, "ZooKeeper session has expired. Switching to a new session."); - /// Запретим писать в таблицу, пока подменяем zookeeper. - LOG_TRACE(log, "Locking INSERTs"); - auto structure_lock = lockDataForAlter(); - LOG_TRACE(log, "Locked INSERTs"); + { + /// Запретим писать в таблицу, пока подменяем zookeeper. + auto structure_lock = lockDataForAlter(); - partialShutdown(); + partialShutdown(); - zookeeper = context.getZooKeeper(); + zookeeper = context.getZooKeeper(); - startup(); + is_read_only = true; + } + + while (!permanent_shutdown_called && !tryStartup()) + restarting_event.tryWait(10 * 1000); + + if (permanent_shutdown_called) + break; + + { + auto structure_lock = lockDataForAlter(); + + is_read_only = false; + } } restarting_event.tryWait(60 * 1000); @@ -2013,8 +2047,8 @@ void StorageReplicatedMergeTree::restartingThread() catch (...) { tryLogCurrentException("StorageReplicatedMergeTree::restartingThread"); - LOG_ERROR(log, "Exception in restartingThread. The storage will be read-only until server restart."); - goReadOnly(); + LOG_ERROR(log, "Unexpected exception in restartingThread. 
The storage will be read-only until server restart."); + goReadOnlyPermanently(); LOG_DEBUG(log, "restarting thread finished"); return; } @@ -2159,6 +2193,9 @@ void StorageReplicatedMergeTree::alter(const AlterCommands & params, { auto table_lock = lockStructureForAlter(); + if (is_read_only) + throw Exception("Can't ALTER read-only table", ErrorCodes::TABLE_IS_READ_ONLY); + data.checkAlter(params); new_columns = data.getColumnsList(); @@ -2411,8 +2448,8 @@ void StorageReplicatedMergeTree::attachPartition(const Field & field, bool unrep void StorageReplicatedMergeTree::drop() { - if (!zookeeper) - throw Exception("Can't drop replicated table without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); + if (is_read_only) + throw Exception("Can't drop read-only replicated table (need to drop data in ZooKeeper as well)", ErrorCodes::TABLE_IS_READ_ONLY); shutdown(); From 5bbb6dc090734d423b22780bb8522bcf759332a7 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Tue, 12 Aug 2014 13:35:15 +0400 Subject: [PATCH 067/127] some grammar fixes. [#METR-2807] --- dbms/src/Functions/FunctionsMiscellaneous.cpp | 2 +- dbms/src/Interpreters/Aggregator.cpp | 2 +- dbms/src/Server/OLAPQueryParser.cpp | 2 +- dbms/src/Server/TCPHandler.cpp | 2 +- dbms/src/Storages/MergeTree/MergeTreeData.cpp | 2 +- libs/libmysqlxx/src/Connection.cpp | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dbms/src/Functions/FunctionsMiscellaneous.cpp b/dbms/src/Functions/FunctionsMiscellaneous.cpp index db77bed353e..8b42a218fda 100644 --- a/dbms/src/Functions/FunctionsMiscellaneous.cpp +++ b/dbms/src/Functions/FunctionsMiscellaneous.cpp @@ -226,7 +226,7 @@ void FunctionVisibleWidth::execute(Block & block, const ColumnNumbers & argument } else if (const ColumnTuple * col = typeid_cast(&*column)) { - /// Посчитаем видимую ширину для каждого вложенного столбца по-отдельности, и просуммируем. + /// Посчитаем видимую ширину для каждого вложенного столбца по отдельности, и просуммируем. 
Block nested_block = col->getData(); size_t columns = nested_block.columns(); diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp index 416e59d2562..43e56a77f2a 100644 --- a/dbms/src/Interpreters/Aggregator.cpp +++ b/dbms/src/Interpreters/Aggregator.cpp @@ -206,7 +206,7 @@ void Aggregator::executeImpl( if (overflow && !overflow_row) continue; - /// Если вставили новый ключ - инициализируем состояния агрегатных функций, и возможно, что-нибудь связанное с ключём. + /// Если вставили новый ключ - инициализируем состояния агрегатных функций, и возможно, что-нибудь связанное с ключом. if (inserted) { method.onNewKey(it, keys_size, i, keys, *aggregates_pool); diff --git a/dbms/src/Server/OLAPQueryParser.cpp b/dbms/src/Server/OLAPQueryParser.cpp index 30f6491d6ba..e21facfe2fa 100644 --- a/dbms/src/Server/OLAPQueryParser.cpp +++ b/dbms/src/Server/OLAPQueryParser.cpp @@ -166,7 +166,7 @@ QueryParseResult QueryParser::parse(std::istream & s) else if (settings_child_nodes->item(i)->nodeName() == "max_threads_per_counter") { /** Выставить локальное ограничение на максимальное количество обрабатываемых запросов - * Оно может быть больше, чем ограничение по-умолчанию. + * Оно может быть больше, чем ограничение по умолчанию. */ result.max_threads_per_counter = DB::parse(settings_child_nodes->item(i)->innerText()); } diff --git a/dbms/src/Server/TCPHandler.cpp b/dbms/src/Server/TCPHandler.cpp index 596a69d8e38..f975d06163f 100644 --- a/dbms/src/Server/TCPHandler.cpp +++ b/dbms/src/Server/TCPHandler.cpp @@ -68,7 +68,7 @@ void TCPHandler::runImpl() throw; } - /// При соединении может быть указана БД по-умолчанию. + /// При соединении может быть указана БД по умолчанию. 
if (!default_database.empty()) { if (!connection_context.isDatabaseExist(default_database)) diff --git a/dbms/src/Storages/MergeTree/MergeTreeData.cpp b/dbms/src/Storages/MergeTree/MergeTreeData.cpp index 55d0805e65a..c8a57091c2d 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeData.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeData.cpp @@ -153,7 +153,7 @@ void MergeTreeData::loadDataParts() if (part->level == 0) { /// Восстановить куски нулевого уровня невозможно. - LOG_ERROR(log, "Removing broken part " << full_path + file_name << " because is't impossible to repair."); + LOG_ERROR(log, "Removing broken part " << full_path + file_name << " because it's impossible to repair."); broken_parts_to_remove.push_back(part); } else diff --git a/libs/libmysqlxx/src/Connection.cpp b/libs/libmysqlxx/src/Connection.cpp index c3f423f7ff8..2029de28575 100644 --- a/libs/libmysqlxx/src/Connection.cpp +++ b/libs/libmysqlxx/src/Connection.cpp @@ -69,7 +69,7 @@ void Connection::connect(const char* db, if (!mysql_real_connect(&driver, server, user, password, db, port, nullptr, driver.client_flag)) throw ConnectionFailed(errorMessage(&driver), mysql_errno(&driver)); - /// Установим кодировки по-умолчанию - UTF-8. + /// Установим кодировки по умолчанию - UTF-8. 
if (mysql_set_character_set(&driver, "UTF8")) throw ConnectionFailed(errorMessage(&driver), mysql_errno(&driver)); From a37326766006fc5b9a7c778e20c43c4e51a19498 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Tue, 12 Aug 2014 16:41:39 +0400 Subject: [PATCH 068/127] Merge --- dbms/include/DB/Storages/StorageReplicatedMergeTree.h | 2 +- dbms/src/Storages/StorageReplicatedMergeTree.cpp | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dbms/include/DB/Storages/StorageReplicatedMergeTree.h b/dbms/include/DB/Storages/StorageReplicatedMergeTree.h index 6dca2580cc4..bd8fd81a525 100644 --- a/dbms/include/DB/Storages/StorageReplicatedMergeTree.h +++ b/dbms/include/DB/Storages/StorageReplicatedMergeTree.h @@ -345,7 +345,7 @@ private: /** Проверить, что список столбцов и настройки таблицы совпадают с указанными в ZK (/metadata). * Если нет - бросить исключение. */ - void checkTableStructure(bool skip_sanity_checks); + void checkTableStructure(bool skip_sanity_checks, bool allow_alter); /** Проверить, что множество кусков соответствует тому, что в ZK (/replicas/me/parts/). * Если каких-то кусков, описанных в ZK нет локально, бросить исключение. 
diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index 7a0b94be517..bc6c1465e26 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -72,7 +72,7 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( { createTableIfNotExists(); - checkTableStructure(false); + checkTableStructure(false, false); createReplica(); } else @@ -87,7 +87,7 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( << replica_path << "/flags/force_restore_data)."); } - checkTableStructure(skip_sanity_checks); + checkTableStructure(skip_sanity_checks, true); checkParts(skip_sanity_checks); } @@ -200,7 +200,7 @@ void StorageReplicatedMergeTree::createTableIfNotExists() /** Проверить, что список столбцов и настройки таблицы совпадают с указанными в ZK (/metadata). * Если нет - бросить исключение. */ -void StorageReplicatedMergeTree::checkTableStructure(bool skip_sanity_checks) +void StorageReplicatedMergeTree::checkTableStructure(bool skip_sanity_checks, bool allow_alter) { String metadata_str = zookeeper->get(zookeeper_path + "/metadata"); ReadBufferFromString buf(metadata_str); @@ -227,7 +227,7 @@ void StorageReplicatedMergeTree::checkTableStructure(bool skip_sanity_checks) columns_version = stat.version; if (columns != data.getColumnsList()) { - if (data.getColumnsList().sizeOfDifference(columns) <= 2 || skip_sanity_checks) + if (allow_alter && (data.getColumnsList().sizeOfDifference(columns) <= 2 || skip_sanity_checks)) { LOG_WARNING(log, "Table structure in ZooKeeper is a little different from local table structure. 
Assuming ALTER."); @@ -237,7 +237,7 @@ void StorageReplicatedMergeTree::checkTableStructure(bool skip_sanity_checks) } else { - throw Exception("Table structure in ZooKeeper is very different from local table structure.", + throw Exception("Table structure in ZooKeeper is too different from local table structure.", ErrorCodes::INCOMPATIBLE_COLUMNS); } } From d9ae9876cee0a740be256b16c23ef266c2df55c7 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Wed, 13 Aug 2014 12:07:52 +0400 Subject: [PATCH 069/127] Merge --- .../DB/Storages/MergeTree/MergeTreeData.h | 8 ++--- dbms/src/Storages/MergeTree/MergeTreeData.cpp | 16 +++++---- dbms/src/Storages/StorageMergeTree.cpp | 1 + .../Storages/StorageReplicatedMergeTree.cpp | 33 +++++++++++-------- 4 files changed, 34 insertions(+), 24 deletions(-) diff --git a/dbms/include/DB/Storages/MergeTree/MergeTreeData.h b/dbms/include/DB/Storages/MergeTree/MergeTreeData.h index b4eba1a978b..114158f5c65 100644 --- a/dbms/include/DB/Storages/MergeTree/MergeTreeData.h +++ b/dbms/include/DB/Storages/MergeTree/MergeTreeData.h @@ -100,7 +100,7 @@ struct MergeTreeSettings size_t max_rows_to_use_cache = 1024 * 1024; /// Через сколько секунд удалять ненужные куски. - time_t old_parts_lifetime = 5 * 60; + time_t old_parts_lifetime = 8 * 60; /// Если в таблице хотя бы столько активных кусков, искусственно замедлять вставки в таблицу. size_t parts_to_delay_insert = 150; @@ -576,6 +576,9 @@ public: bool require_part_metadata_, BrokenPartCallback broken_part_callback_ = &MergeTreeData::doNothing); + /// Загрузить множество кусков с данными с диска. Вызывается один раз - сразу после создания объекта. + void loadDataParts(bool skip_sanity_checks); + std::string getModePrefix() const; bool supportsSampling() const { return !!sampling_expression; } @@ -753,9 +756,6 @@ private: DataParts all_data_parts; Poco::FastMutex all_data_parts_mutex; - /// Загрузить множество кусков с данными с диска. Вызывается один раз - при создании объекта. 
- void loadDataParts(); - /** Выражение, преобразующее типы столбцов. * Если преобразований типов нет, out_expression=nullptr. * out_rename_map отображает файлы-столбцы на выходе выражения в новые файлы таблицы. diff --git a/dbms/src/Storages/MergeTree/MergeTreeData.cpp b/dbms/src/Storages/MergeTree/MergeTreeData.cpp index c8a57091c2d..3abf3cc735b 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeData.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeData.cpp @@ -56,8 +56,6 @@ MergeTreeData::MergeTreeData( ExpressionActionsPtr projected_expr = ExpressionAnalyzer(primary_expr_ast, context, *columns).getActions(true); primary_key_sample = projected_expr->getSampleBlock(); - - loadDataParts(); } UInt64 MergeTreeData::getMaxDataPartIndex() @@ -85,7 +83,7 @@ std::string MergeTreeData::getModePrefix() const } -void MergeTreeData::loadDataParts() +void MergeTreeData::loadDataParts(bool skip_sanity_checks) { LOG_DEBUG(log, "Loading data parts"); @@ -121,6 +119,8 @@ void MergeTreeData::loadDataParts() } DataPartsVector broken_parts_to_remove; + DataPartsVector broken_parts_to_detach; + size_t suspicious_broken_parts = 0; Poco::RegularExpression::MatchVec matches; for (const String & file_name : part_file_names) @@ -163,6 +163,7 @@ void MergeTreeData::loadDataParts() int contained_parts = 0; LOG_ERROR(log, "Part " << full_path + file_name << " is broken. Looking for parts to replace it."); + ++suspicious_broken_parts; for (const String & contained_name : part_file_names) { @@ -186,8 +187,9 @@ void MergeTreeData::loadDataParts() } else { - LOG_ERROR(log, "Not removing broken part " << full_path + file_name + LOG_ERROR(log, "Detaching broken part " << full_path + file_name << " because it covers less than 2 parts. 
You need to resolve this manually"); + broken_parts_to_detach.push_back(part); } } @@ -199,12 +201,14 @@ void MergeTreeData::loadDataParts() data_parts.insert(part); } - if (broken_parts_to_remove.size() > 2) - throw Exception("Suspiciously many (" + toString(broken_parts_to_remove.size()) + ") broken parts to remove.", + if (suspicious_broken_parts > 5 && !skip_sanity_checks) + throw Exception("Suspiciously many (" + toString(suspicious_broken_parts) + ") broken parts to remove.", ErrorCodes::TOO_MANY_UNEXPECTED_DATA_PARTS); for (const auto & part : broken_parts_to_remove) part->remove(); + for (const auto & part : broken_parts_to_detach) + part->renameAddPrefix("detached/"); all_data_parts = data_parts; diff --git a/dbms/src/Storages/StorageMergeTree.cpp b/dbms/src/Storages/StorageMergeTree.cpp index dc02a613588..3e159b128b4 100644 --- a/dbms/src/Storages/StorageMergeTree.cpp +++ b/dbms/src/Storages/StorageMergeTree.cpp @@ -27,6 +27,7 @@ StorageMergeTree::StorageMergeTree(const String & path_, const String & database { increment.fixIfBroken(data.getMaxDataPartIndex()); + data.loadDataParts(false); data.clearOldParts(); } diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index bc6c1465e26..a77059bcd7d 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -55,6 +55,23 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( log(&Logger::get(database_name_ + "." 
+ table_name + " (StorageReplicatedMergeTree)")), shutdown_event(false) { + if (!zookeeper_path.empty() && *zookeeper_path.rbegin() == '/') + zookeeper_path.erase(zookeeper_path.end() - 1); + replica_path = zookeeper_path + "/replicas/" + replica_name; + + bool skip_sanity_checks = false; + + if (zookeeper && zookeeper->exists(replica_path + "/flags/force_restore_data")) + { + skip_sanity_checks = true; + zookeeper->remove(replica_path + "/flags/force_restore_data"); + + LOG_WARNING(log, "Skipping the limits on severity of changes to data parts and columns (flag " + << replica_path << "/flags/force_restore_data)."); + } + + data.loadDataParts(skip_sanity_checks); + if (!zookeeper) { if (!attach) @@ -64,10 +81,6 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( return; } - if (!zookeeper_path.empty() && *zookeeper_path.rbegin() == '/') - zookeeper_path.erase(zookeeper_path.end() - 1); - replica_path = zookeeper_path + "/replicas/" + replica_name; - if (!attach) { createTableIfNotExists(); @@ -77,16 +90,6 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( } else { - bool skip_sanity_checks = false; - if (zookeeper->exists(replica_path + "/flags/force_restore_data")) - { - skip_sanity_checks = true; - zookeeper->remove(replica_path + "/flags/force_restore_data"); - - LOG_WARNING(log, "Skipping the limits on severity of changes to data parts and columns (flag " - << replica_path << "/flags/force_restore_data)."); - } - checkTableStructure(skip_sanity_checks, true); checkParts(skip_sanity_checks); } @@ -444,6 +447,7 @@ void StorageReplicatedMergeTree::checkParts(bool skip_sanity_checks) } else { + LOG_ERROR(log, "Fetching missing part " << missing_name); parts_to_fetch.push_back(missing_name); } } @@ -1986,6 +1990,7 @@ bool StorageReplicatedMergeTree::tryStartup() part_check_thread = std::thread(&StorageReplicatedMergeTree::partCheckThread, this); queue_task_handle = context.getBackgroundPool().addTask( std::bind(&StorageReplicatedMergeTree::queueTask, 
this, std::placeholders::_1)); + queue_task_handle->wake(); return true; } catch (zkutil::KeeperException & e) From f0d2d918c7f960e3630008b6f6982e5157e31e33 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Wed, 13 Aug 2014 12:53:38 +0400 Subject: [PATCH 070/127] Merge --- .../MergeTree/MergeTreePartChecker.cpp | 45 ++++++++++++++----- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/dbms/src/Storages/MergeTree/MergeTreePartChecker.cpp b/dbms/src/Storages/MergeTree/MergeTreePartChecker.cpp index 6382f2624bf..ef1b36776fd 100644 --- a/dbms/src/Storages/MergeTree/MergeTreePartChecker.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreePartChecker.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -14,6 +15,8 @@ namespace DB struct Stream { + static const size_t UNKNOWN = std::numeric_limits::max(); + DataTypePtr type; String path; String name; @@ -35,6 +38,12 @@ struct Stream return mrk_hashing_buf.eof(); } + void ignore() + { + uncompressed_hashing_buf.ignore(std::numeric_limits::max()); + mrk_hashing_buf.ignore(std::numeric_limits::max()); + } + size_t read(size_t rows) { if (dynamic_cast(&*type)) @@ -197,6 +206,12 @@ static size_t checkColumn(const String & path, const String & name, DataTypePtr return rows; } + else if (dynamic_cast(&*type)) + { + Stream data_stream(path, escapeForFileName(name), type); + data_stream.ignore(); + return Stream::UNKNOWN; + } else { Stream data_stream(path, escapeForFileName(name), type); @@ -211,7 +226,10 @@ static size_t checkColumn(const String & path, const String & name, DataTypePtr size_t cur_rows = data_stream.read(settings.index_granularity); - rows += cur_rows; + if (cur_rows == Stream::UNKNOWN) + rows = Stream::UNKNOWN; + else + rows += cur_rows; if (cur_rows < settings.index_granularity) break; } @@ -260,8 +278,8 @@ void MergeTreePartChecker::checkDataPart(String path, const Settings & settings, checksums_data.files["primary.idx"] = 
MergeTreeData::DataPart::Checksums::Checksum(primary_idx_size, hashing_buf.getHash()); } - bool first = true; - size_t rows = 0; + String any_column_name; + size_t rows = Stream::UNKNOWN; ExceptionPtr first_exception; for (const NameAndTypePair & column : columns) @@ -283,15 +301,18 @@ void MergeTreePartChecker::checkDataPart(String path, const Settings & settings, } size_t cur_rows = checkColumn(path, column.name, column.type, settings, checksums_data); - if (first) + if (cur_rows != Stream::UNKNOWN) { - rows = cur_rows; - first = false; - } - else if (rows != cur_rows) - { - throw Exception("Different number of rows in columns " + columns.begin()->name + " and " + column.name, - ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); + if (rows == Stream::UNKNOWN) + { + rows = cur_rows; + any_column_name = column.name; + } + else if (rows != cur_rows) + { + throw Exception("Different number of rows in columns " + any_column_name + " and " + column.name, + ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); + } } ok = true; @@ -315,7 +336,7 @@ void MergeTreePartChecker::checkDataPart(String path, const Settings & settings, std::cerr << " ok" << std::endl; } - if (first) + if (rows == Stream::UNKNOWN) throw Exception("No columns", ErrorCodes::EMPTY_LIST_OF_COLUMNS_PASSED); if (primary_idx_size % ((rows - 1) / settings.index_granularity + 1)) From cb94f1301c555f976b50e8dca1348c845885705c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 Aug 2014 05:37:06 +0400 Subject: [PATCH 071/127] dbms: removed comment [#METR-2944]. 
--- dbms/include/DB/IO/WriteHelpers.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/dbms/include/DB/IO/WriteHelpers.h b/dbms/include/DB/IO/WriteHelpers.h index 4b781f61253..42d2ad1ab96 100644 --- a/dbms/include/DB/IO/WriteHelpers.h +++ b/dbms/include/DB/IO/WriteHelpers.h @@ -149,7 +149,7 @@ inline void writeJSONString(const char * begin, const char * end, WriteBuffer & case '\t': writeChar('\\', buf); writeChar('t', buf); - break; + break; case '\\': writeChar('\\', buf); writeChar('\\', buf); @@ -167,10 +167,10 @@ inline void writeJSONString(const char * begin, const char * end, WriteBuffer & { char higher_half = (*it) >> 4; char lower_half = (*it) & 0xF; - + writeCString("\\u00", buf); writeChar('0' + higher_half, buf); - + if (0 <= lower_half && lower_half <= 9) writeChar('0' + lower_half, buf); else @@ -282,7 +282,6 @@ inline void writeQuotedString(const String & s, WriteBuffer & buf) writeAnyQuotedString<'\''>(s, buf); } -/// Совместимо с JSON. inline void writeDoubleQuotedString(const String & s, WriteBuffer & buf) { writeAnyQuotedString<'"'>(s, buf); @@ -335,7 +334,7 @@ inline void writeDateText(DayNum_t date, WriteBuffer & buf) s[6] += values.month % 10; s[8] += values.day_of_month / 10; s[9] += values.day_of_month % 10; - + buf.write(s, 10); } From 7af08f534719beb18987dcafdfc763f082cbeb51 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 Aug 2014 05:37:46 +0400 Subject: [PATCH 072/127] Merge --- .../MergeTree/ReplicatedMergeTreePartsExchange.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/dbms/src/Storages/MergeTree/ReplicatedMergeTreePartsExchange.cpp b/dbms/src/Storages/MergeTree/ReplicatedMergeTreePartsExchange.cpp index e97ebc7381c..8386a3640f4 100644 --- a/dbms/src/Storages/MergeTree/ReplicatedMergeTreePartsExchange.cpp +++ b/dbms/src/Storages/MergeTree/ReplicatedMergeTreePartsExchange.cpp @@ -73,8 +73,15 @@ MergeTreeData::MutableDataPartPtr 
ReplicatedMergeTreePartsFetcher::fetchPart( ReadBufferFromHTTP in(host, port, params); String part_path = data.getFullPath() + "tmp_" + part_name + "/"; - if (!Poco::File(part_path).createDirectory()) - throw Exception("Directory " + part_path + " already exists"); + Poco::File part_file(part_path); + + if (part_file.exists()) + { + LOG_ERROR(log, "Directory " + part_path + " already exists. Removing."); + part_file.remove(true); + } + + part_file.createDirectory(); MergeTreeData::MutableDataPartPtr new_data_part = std::make_shared(data); new_data_part->name = "tmp_" + part_name; From 1cd1e5f72427e3a068fb63e4b8e55b20bfc5b660 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 Aug 2014 05:48:50 +0400 Subject: [PATCH 073/127] dbms: BackgroundProcessingPool: less messy; fixed error(s) [#METR-12317]. --- .../MergeTree/BackgroundProcessingPool.h | 147 +++++++----------- 1 file changed, 53 insertions(+), 94 deletions(-) diff --git a/dbms/include/DB/Storages/MergeTree/BackgroundProcessingPool.h b/dbms/include/DB/Storages/MergeTree/BackgroundProcessingPool.h index de2b10cf11e..b846a1e037d 100644 --- a/dbms/include/DB/Storages/MergeTree/BackgroundProcessingPool.h +++ b/dbms/include/DB/Storages/MergeTree/BackgroundProcessingPool.h @@ -16,15 +16,21 @@ namespace DB { -/** В нескольких потоках в бесконечном цикле выполняет указанные функции. +/** Используя фиксированное количество потоков, выполнять произвольное количество задач в бесконечном цикле. + * Предназначена для задач, выполняющих постоянную фоновую работу (например, слияния). + * Задача - функция, возвращающая bool - сделала ли она какую-либо работу. + * Если сделала - надо выполнить ещё раз. Если нет - надо подождать несколько секунд, или до события wake, и выполнить ещё раз. + * + * Также, задача во время выполнения может временно увеличить какой-либо счётчик, относящийся ко всем задачам + * - например, число одновременно идующих слияний. 
*/ class BackgroundProcessingPool { public: typedef std::map Counters; - /** Используется изнутри таски. Позволяет инкрементировать какие-нибудь счетчики. - * После завершения таски, все изменения откатятся. + /** Используется изнутри задачи. Позволяет инкрементировать какие-нибудь счетчики. + * После завершения задачи, все изменения откатятся. * Например, чтобы можно было узнавать количество потоков, выполняющих большое слияние, * можно в таске, выполняющей большое слияние, инкрементировать счетчик. Декрементировать обратно его не нужно. */ @@ -57,10 +63,14 @@ public: /// Переставить таск в начало очереди и разбудить какой-нибудь поток. void wake() { + Poco::ScopedReadRWLock rlock(rwlock); + if (removed) + return; + std::unique_lock lock(pool.mutex); pool.tasks.splice(pool.tasks.begin(), pool.tasks, iterator); - /// Не очень надежно: если все потоки сейчас выполняют работу, этот вызов никого не разбудит, + /// Не очень надёжно: если все потоки сейчас выполняют работу, этот вызов никого не разбудит, /// и все будут спать в конце итерации. pool.wake_event.notify_one(); } @@ -70,50 +80,32 @@ public: BackgroundProcessingPool & pool; Task function; - Poco::RWLock lock; - volatile bool removed; + + /// При выполнении задачи, держится read lock. Переменная removed меняется под write lock-ом. 
+ Poco::RWLock rwlock; + volatile bool removed = false; + std::list>::iterator iterator; - TaskInfo(BackgroundProcessingPool & pool_, const Task & function_) : pool(pool_), function(function_), removed(false) {} + TaskInfo(BackgroundProcessingPool & pool_, const Task & function_) : pool(pool_), function(function_) {} }; typedef std::shared_ptr TaskHandle; - BackgroundProcessingPool(int size_) : size(size_), sleep_seconds(10), shutdown(false) {} - - void setNumberOfThreads(int size_) + BackgroundProcessingPool(int size_) : size(size_) { - if (size_ <= 0) - throw Exception("Invalid number of threads: " + toString(size_), ErrorCodes::ARGUMENT_OUT_OF_BOUND); - - std::unique_lock tlock(threads_mutex); - std::unique_lock lock(mutex); - - if (size_ == size) - return; - - if (threads.empty()) - { - size = size_; - return; - } - - throw Exception("setNumberOfThreads is not implemented for non-empty pool", ErrorCodes::NOT_IMPLEMENTED); + threads.resize(size); + for (auto & thread : threads) + thread = std::thread([this] { threadFunction(); }); } - int getNumberOfThreads() + + int getNumberOfThreads() const { - std::unique_lock lock(mutex); return size; } - void setSleepTime(double seconds) - { - std::unique_lock lock(mutex); - sleep_seconds = seconds; - } - int getCounter(const String & name) { std::unique_lock lock(mutex); @@ -122,8 +114,6 @@ public: TaskHandle addTask(const Task & task) { - std::unique_lock lock(threads_mutex); - TaskHandle res(new TaskInfo(*this, task)); { @@ -132,44 +122,22 @@ public: res->iterator = --tasks.end(); } - if (threads.empty()) - { - shutdown = false; - counters.clear(); - threads.resize(size); - for (std::thread & thread : threads) - thread = std::thread(std::bind(&BackgroundProcessingPool::threadFunction, this)); - } + wake_event.notify_all(); return res; } void removeTask(const TaskHandle & task) { - std::unique_lock tlock(threads_mutex); - - /// Дождемся завершения всех выполнений этой задачи. 
+ /// Дождёмся завершения всех выполнений этой задачи. { - Poco::ScopedWriteRWLock wlock(task->lock); + Poco::ScopedWriteRWLock wlock(task->rwlock); task->removed = true; } { std::unique_lock lock(mutex); - auto it = std::find(tasks.begin(), tasks.end(), task); - if (it == tasks.end()) - throw Exception("Task not found", ErrorCodes::LOGICAL_ERROR); - tasks.erase(it); - } - - if (tasks.empty()) - { - shutdown = true; - wake_event.notify_all(); - for (std::thread & thread : threads) - thread.join(); - threads.clear(); - counters.clear(); + tasks.erase(task->iterator); } } @@ -177,15 +145,10 @@ public: { try { - std::unique_lock lock(threads_mutex); - if (!threads.empty()) - { - LOG_ERROR(&Logger::get("~BackgroundProcessingPool"), "Destroying non-empty BackgroundProcessingPool"); - shutdown = true; - wake_event.notify_all(); - for (std::thread & thread : threads) - thread.join(); - } + shutdown = true; + wake_event.notify_all(); + for (std::thread & thread : threads) + thread.join(); } catch (...) { @@ -197,24 +160,25 @@ private: typedef std::list Tasks; typedef std::vector Threads; - std::mutex threads_mutex; - std::mutex mutex; - int size; - Tasks tasks; /// Таски в порядке, в котором мы планируем их выполнять. - Threads threads; - Counters counters; - double sleep_seconds; + const size_t size; + static constexpr double sleep_seconds = 10; - volatile bool shutdown; + Tasks tasks; /// Задачи в порядке, в котором мы планируем их выполнять. + Counters counters; + std::mutex mutex; /// Для работы со списком tasks, а также с counters (когда threads не пустой). 
+ + Threads threads; + + volatile bool shutdown = false; std::condition_variable wake_event; + void threadFunction() { while (!shutdown) { Counters counters_diff; bool need_sleep = false; - size_t tasks_count = 1; try { @@ -236,11 +200,12 @@ private: if (!task) { - std::this_thread::sleep_for(std::chrono::duration(sleep_seconds)); + std::unique_lock lock(mutex); + wake_event.wait_for(lock, std::chrono::duration(sleep_seconds)); continue; } - Poco::ScopedReadRWLock rlock(task->lock); + Poco::ScopedReadRWLock rlock(task->rwlock); if (task->removed) continue; @@ -248,15 +213,11 @@ private: if (task->function(context)) { - /// Если у таска получилось выполнить какую-то работу, запустим его снова без паузы. - std::unique_lock lock(mutex); + /// Если у задачи получилось выполнить какую-то работу, запустим её снова без паузы. + need_sleep = false; - auto it = std::find(tasks.begin(), tasks.end(), task); - if (it != tasks.end()) - { - need_sleep = false; - tasks.splice(tasks.begin(), tasks, it); - } + std::unique_lock lock(mutex); + tasks.splice(tasks.begin(), tasks, task->iterator); } } catch (...) @@ -265,14 +226,12 @@ private: tryLogCurrentException(__PRETTY_FUNCTION__); } - /// Вычтем все счетчики обратно. + /// Вычтем все счётчики обратно. if (!counters_diff.empty()) { std::unique_lock lock(mutex); for (const auto & it : counters_diff) - { counters[it.first] -= it.second; - } } if (shutdown) @@ -281,7 +240,7 @@ private: if (need_sleep) { std::unique_lock lock(mutex); - wake_event.wait_for(lock, std::chrono::duration(sleep_seconds / tasks_count)); + wake_event.wait_for(lock, std::chrono::duration(sleep_seconds)); } } } From fd9668be1208bf961ea9c82e3bd7b4a550b30482 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Aug 2014 00:00:19 +0400 Subject: [PATCH 074/127] dbms: Client: fixed error with part of JSON output after exception [#METR-12316]. 
--- dbms/src/Client/Client.cpp | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/dbms/src/Client/Client.cpp b/dbms/src/Client/Client.cpp index d44786ab023..9ecb779b97e 100644 --- a/dbms/src/Client/Client.cpp +++ b/dbms/src/Client/Client.cpp @@ -441,7 +441,7 @@ private: if (exit_strings.end() != exit_strings.find(line)) return false; - block_std_out = nullptr; + resetOutput(); watch.restart(); @@ -642,6 +642,14 @@ private: } + /** Сбросить все данные, что ещё остались в буферах. */ + void resetOutput() + { + block_std_out = nullptr; + std_out.next(); + } + + /** Получает и обрабатывает пакеты из сервера. * Также следит, не требуется ли прервать выполнение запроса. */ @@ -780,6 +788,11 @@ private: written_first_block = true; } + /** Это обычно приводит к тому, что полученный блок данных выводится клиенту. + * Но не всегда. Например, JSONRowOutputStream пишет данные сначала в WriteBufferValidUTF8, + * которые ещё немного буферизует данные перед записью в std_out. + * Поэтому, вызов std_out.next() может записать не все данные. + */ std_out.next(); } @@ -859,6 +872,8 @@ private: void onException(const Exception & e) { + resetOutput(); + std::cerr << "Received exception from server:" << std::endl << "Code: " << e.code() << ". " << e.displayText(); } @@ -876,7 +891,7 @@ private: if (block_std_out) block_std_out->writeSuffix(); - std_out.next(); + resetOutput(); if (is_interactive && !written_first_block) std::cout << "Ok." << std::endl; From 65cf115313b2e9b1a577ad5ee2c1d61fa6b52218 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Aug 2014 00:27:41 +0400 Subject: [PATCH 075/127] dbms: fixed error (progress message in the middle of data, in JSON formats) [#METR-11125]. 
--- .../DB/DataStreams/AddingDefaultBlockOutputStream.h | 2 ++ dbms/include/DB/DataStreams/BinaryRowOutputStream.h | 2 ++ .../DB/DataStreams/BlockOutputStreamFromRowOutputStream.h | 2 ++ dbms/include/DB/DataStreams/IBlockOutputStream.h | 4 ++++ dbms/include/DB/DataStreams/IRowOutputStream.h | 3 +++ dbms/include/DB/DataStreams/JSONRowOutputStream.h | 5 ++++- dbms/include/DB/DataStreams/NativeBlockOutputStream.h | 2 ++ dbms/include/DB/DataStreams/PrettyBlockOutputStream.h | 2 ++ .../DB/DataStreams/PushingToViewsBlockOutputStream.h | 4 ++-- .../DB/DataStreams/TabSeparatedBlockOutputStream.h | 2 ++ dbms/include/DB/DataStreams/TabSeparatedRowOutputStream.h | 2 ++ dbms/include/DB/DataStreams/ValuesRowOutputStream.h | 2 ++ dbms/include/DB/DataStreams/VerticalRowOutputStream.h | 2 ++ dbms/src/Client/Client.cpp | 8 ++------ dbms/src/DataStreams/JSONRowOutputStream.cpp | 2 +- 15 files changed, 34 insertions(+), 10 deletions(-) diff --git a/dbms/include/DB/DataStreams/AddingDefaultBlockOutputStream.h b/dbms/include/DB/DataStreams/AddingDefaultBlockOutputStream.h index 643a99270c4..c3a49cece4c 100644 --- a/dbms/include/DB/DataStreams/AddingDefaultBlockOutputStream.h +++ b/dbms/include/DB/DataStreams/AddingDefaultBlockOutputStream.h @@ -33,6 +33,8 @@ public: output->write(res); } + void flush() { output->flush(); } + private: BlockOutputStreamPtr output; NamesAndTypesListPtr required_columns; diff --git a/dbms/include/DB/DataStreams/BinaryRowOutputStream.h b/dbms/include/DB/DataStreams/BinaryRowOutputStream.h index df8ab4a4a99..09f4383437e 100644 --- a/dbms/include/DB/DataStreams/BinaryRowOutputStream.h +++ b/dbms/include/DB/DataStreams/BinaryRowOutputStream.h @@ -20,6 +20,8 @@ public: void writeField(const Field & field); void writeRowEndDelimiter(); + void flush() { ostr.next(); } + protected: WriteBuffer & ostr; const Block sample; diff --git a/dbms/include/DB/DataStreams/BlockOutputStreamFromRowOutputStream.h 
b/dbms/include/DB/DataStreams/BlockOutputStreamFromRowOutputStream.h index 70213b5d2d4..e6d15c7b323 100644 --- a/dbms/include/DB/DataStreams/BlockOutputStreamFromRowOutputStream.h +++ b/dbms/include/DB/DataStreams/BlockOutputStreamFromRowOutputStream.h @@ -17,6 +17,8 @@ public: void write(const Block & block); void writePrefix() { row_output->writePrefix(); } void writeSuffix() { row_output->writeSuffix(); } + + void flush() { row_output->flush(); } void setRowsBeforeLimit(size_t rows_before_limit); void setTotals(const Block & totals); diff --git a/dbms/include/DB/DataStreams/IBlockOutputStream.h b/dbms/include/DB/DataStreams/IBlockOutputStream.h index 72769711008..ad19e50d4ba 100644 --- a/dbms/include/DB/DataStreams/IBlockOutputStream.h +++ b/dbms/include/DB/DataStreams/IBlockOutputStream.h @@ -31,6 +31,10 @@ public: */ virtual void writePrefix() {} virtual void writeSuffix() {} + + /** Сбросить имеющиеся буферы для записи. + */ + virtual void flush() {} /** Методы для установки дополнительной информации для вывода в поддерживающих её форматах. */ diff --git a/dbms/include/DB/DataStreams/IRowOutputStream.h b/dbms/include/DB/DataStreams/IRowOutputStream.h index d58fb50ae64..eec6c7a9201 100644 --- a/dbms/include/DB/DataStreams/IRowOutputStream.h +++ b/dbms/include/DB/DataStreams/IRowOutputStream.h @@ -32,6 +32,9 @@ public: virtual void writePrefix() {}; /// разделитель перед началом результата virtual void writeSuffix() {}; /// разделитель после конца результата + /** Сбросить имеющиеся буферы для записи. */ + virtual void flush() {} + /** Методы для установки дополнительной информации для вывода в поддерживающих её форматах. 
*/ virtual void setRowsBeforeLimit(size_t rows_before_limit) {} diff --git a/dbms/include/DB/DataStreams/JSONRowOutputStream.h b/dbms/include/DB/DataStreams/JSONRowOutputStream.h index 224dbbadf76..b96dde4c3ae 100644 --- a/dbms/include/DB/DataStreams/JSONRowOutputStream.h +++ b/dbms/include/DB/DataStreams/JSONRowOutputStream.h @@ -25,6 +25,8 @@ public: void writeRowEndDelimiter(); void writePrefix(); void writeSuffix(); + + void flush() { ostr.next(); dst_ostr.next(); } void setRowsBeforeLimit(size_t rows_before_limit_) { @@ -41,7 +43,8 @@ protected: virtual void writeTotals(); virtual void writeExtremes(); - WriteBufferValidUTF8 ostr; + WriteBuffer & dst_ostr; + WriteBufferValidUTF8 ostr; /// Валидирует и пишет в dst_ostr. size_t field_number; size_t row_count; bool applied_limit; diff --git a/dbms/include/DB/DataStreams/NativeBlockOutputStream.h b/dbms/include/DB/DataStreams/NativeBlockOutputStream.h index eb47f2d88a5..ae11058f664 100644 --- a/dbms/include/DB/DataStreams/NativeBlockOutputStream.h +++ b/dbms/include/DB/DataStreams/NativeBlockOutputStream.h @@ -15,6 +15,8 @@ public: NativeBlockOutputStream(WriteBuffer & ostr_) : ostr(ostr_) {} void write(const Block & block); + void flush() { ostr.next(); } + private: WriteBuffer & ostr; }; diff --git a/dbms/include/DB/DataStreams/PrettyBlockOutputStream.h b/dbms/include/DB/DataStreams/PrettyBlockOutputStream.h index 65c8d03050f..eaa820dc170 100644 --- a/dbms/include/DB/DataStreams/PrettyBlockOutputStream.h +++ b/dbms/include/DB/DataStreams/PrettyBlockOutputStream.h @@ -18,6 +18,8 @@ public: void write(const Block & block); void writeSuffix(); + void flush() { ostr.next(); } + void setTotals(const Block & totals_) { totals = totals_; } void setExtremes(const Block & extremes_) { extremes = extremes_; } diff --git a/dbms/include/DB/DataStreams/PushingToViewsBlockOutputStream.h b/dbms/include/DB/DataStreams/PushingToViewsBlockOutputStream.h index c22a4c258f7..58e9264c91f 100644 --- 
a/dbms/include/DB/DataStreams/PushingToViewsBlockOutputStream.h +++ b/dbms/include/DB/DataStreams/PushingToViewsBlockOutputStream.h @@ -18,8 +18,8 @@ namespace DB class PushingToViewsBlockOutputStream : public IBlockOutputStream { public: - PushingToViewsBlockOutputStream(String database_, String table_, const Context &context_, ASTPtr query_ptr_) - :database(database_), table(table_), context(context_), query_ptr(query_ptr_) + PushingToViewsBlockOutputStream(String database_, String table_, const Context & context_, ASTPtr query_ptr_) + : database(database_), table(table_), context(context_), query_ptr(query_ptr_) { if (database.empty()) database = context.getCurrentDatabase(); diff --git a/dbms/include/DB/DataStreams/TabSeparatedBlockOutputStream.h b/dbms/include/DB/DataStreams/TabSeparatedBlockOutputStream.h index 9bba296b7d5..05d801362f9 100644 --- a/dbms/include/DB/DataStreams/TabSeparatedBlockOutputStream.h +++ b/dbms/include/DB/DataStreams/TabSeparatedBlockOutputStream.h @@ -16,6 +16,8 @@ public: TabSeparatedBlockOutputStream(WriteBuffer & ostr_) : ostr(ostr_) {} void write(const Block & block); + void flush() { ostr.next(); } + private: WriteBuffer & ostr; }; diff --git a/dbms/include/DB/DataStreams/TabSeparatedRowOutputStream.h b/dbms/include/DB/DataStreams/TabSeparatedRowOutputStream.h index 46056042d53..60681d56ed1 100644 --- a/dbms/include/DB/DataStreams/TabSeparatedRowOutputStream.h +++ b/dbms/include/DB/DataStreams/TabSeparatedRowOutputStream.h @@ -26,6 +26,8 @@ public: void writePrefix(); void writeSuffix(); + void flush() { ostr.next(); } + void setTotals(const Block & totals_) { totals = totals_; } void setExtremes(const Block & extremes_) { extremes = extremes_; } diff --git a/dbms/include/DB/DataStreams/ValuesRowOutputStream.h b/dbms/include/DB/DataStreams/ValuesRowOutputStream.h index ba632b62fdb..4cfd5d74c92 100644 --- a/dbms/include/DB/DataStreams/ValuesRowOutputStream.h +++ b/dbms/include/DB/DataStreams/ValuesRowOutputStream.h @@ -26,6 +26,8 
@@ public: void writeRowEndDelimiter(); void writeRowBetweenDelimiter(); + void flush() { ostr.next(); } + private: WriteBuffer & ostr; const Block sample; diff --git a/dbms/include/DB/DataStreams/VerticalRowOutputStream.h b/dbms/include/DB/DataStreams/VerticalRowOutputStream.h index dbd9cbb5672..08a9c83145f 100644 --- a/dbms/include/DB/DataStreams/VerticalRowOutputStream.h +++ b/dbms/include/DB/DataStreams/VerticalRowOutputStream.h @@ -25,6 +25,8 @@ public: void writeRowStartDelimiter(); void writeRowBetweenDelimiter(); + void flush() { ostr.next(); } + private: WriteBuffer & ostr; const Block sample; diff --git a/dbms/src/Client/Client.cpp b/dbms/src/Client/Client.cpp index 9ecb779b97e..f5fd7729980 100644 --- a/dbms/src/Client/Client.cpp +++ b/dbms/src/Client/Client.cpp @@ -788,12 +788,8 @@ private: written_first_block = true; } - /** Это обычно приводит к тому, что полученный блок данных выводится клиенту. - * Но не всегда. Например, JSONRowOutputStream пишет данные сначала в WriteBufferValidUTF8, - * которые ещё немного буферизует данные перед записью в std_out. - * Поэтому, вызов std_out.next() может записать не все данные. - */ - std_out.next(); + /// Полученный блок данных сразу выводится клиенту. 
+ block_std_out->flush(); } diff --git a/dbms/src/DataStreams/JSONRowOutputStream.cpp b/dbms/src/DataStreams/JSONRowOutputStream.cpp index 9c43a8fa546..5cdb6f82c8d 100644 --- a/dbms/src/DataStreams/JSONRowOutputStream.cpp +++ b/dbms/src/DataStreams/JSONRowOutputStream.cpp @@ -10,7 +10,7 @@ using Poco::SharedPtr; JSONRowOutputStream::JSONRowOutputStream(WriteBuffer & ostr_, const Block & sample_) - : ostr(ostr_), field_number(0), row_count(0), applied_limit(false), rows_before_limit(0) + : dst_ostr(ostr_), ostr(dst_ostr), field_number(0), row_count(0), applied_limit(false), rows_before_limit(0) { NamesAndTypesList columns(sample_.getColumnsList()); fields.assign(columns.begin(), columns.end()); From 047e5e30df799c43b98f1746b0ae0251293d8a99 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Aug 2014 04:08:15 +0400 Subject: [PATCH 076/127] dbms: Client: don't leave progress indicator in the middle of data; colors almost compatible with white backgrounds [#METR-11125]. --- dbms/src/Client/Client.cpp | 93 +++++++++++-------- .../DataStreams/PrettyBlockOutputStream.cpp | 2 +- .../PrettyCompactBlockOutputStream.cpp | 14 +-- .../PrettySpaceBlockOutputStream.cpp | 10 +- dbms/src/Parsers/formatAST.cpp | 2 +- 5 files changed, 66 insertions(+), 55 deletions(-) diff --git a/dbms/src/Client/Client.cpp b/dbms/src/Client/Client.cpp index f5fd7729980..70ccdd02f7f 100644 --- a/dbms/src/Client/Client.cpp +++ b/dbms/src/Client/Client.cpp @@ -49,6 +49,16 @@ #include + +/// http://en.wikipedia.org/wiki/ANSI_escape_code +#define SAVE_CURSOR_POSITION "\033[s" +#define RESTORE_CURSOR_POSITION "\033[u" +#define CLEAR_TO_END_OF_LINE "\033[K" +/// Эти коды, возможно, поддерживаются не везде. +#define DISABLE_LINE_WRAPPING "\033[?7l" +#define ENABLE_LINE_WRAPPING "\033[?7h" + + /** Клиент командной строки СУБД ClickHouse. 
*/ @@ -61,11 +71,7 @@ using Poco::SharedPtr; class Client : public Poco::Util::Application { public: - Client() : is_interactive(true), stdin_is_not_tty(false), - format_max_block_size(0), std_in(STDIN_FILENO), std_out(STDOUT_FILENO), processed_rows(0), - rows_read_on_server(0), bytes_read_on_server(0), written_progress_chars(0), written_first_block(false) - { - } + Client() {} private: typedef std::unordered_set StringSet; @@ -77,24 +83,24 @@ private: "q", "й", "\\q", "\\Q", "\\й", "\\Й", ":q", "Жй" }; - bool is_interactive; /// Использовать readline интерфейс или batch режим. - bool stdin_is_not_tty; /// stdin - не терминал. + bool is_interactive = true; /// Использовать readline интерфейс или batch режим. + bool stdin_is_not_tty = false; /// stdin - не терминал. SharedPtr connection; /// Соединение с БД. String query; /// Текущий запрос. String format; /// Формат вывода результата в консоль. - size_t format_max_block_size; /// Максимальный размер блока при выводе в консоль. + size_t format_max_block_size = 0; /// Максимальный размер блока при выводе в консоль. String insert_format; /// Формат данных для INSERT-а при чтении их из stdin в batch режиме - size_t insert_format_max_block_size; /// Максимальный размер блока при чтении данных INSERT-а. + size_t insert_format_max_block_size = 0; /// Максимальный размер блока при чтении данных INSERT-а. Context context; /// Чтение из stdin для batch режима - ReadBufferFromFileDescriptor std_in; + ReadBufferFromFileDescriptor std_in {STDIN_FILENO}; /// Вывод в консоль - WriteBufferFromFileDescriptor std_out; + WriteBufferFromFileDescriptor std_out {STDOUT_FILENO}; BlockOutputStreamPtr block_std_out; String home_path; @@ -105,7 +111,7 @@ private: String history_file; /// Строк прочитано или записано. - size_t processed_rows; + size_t processed_rows = 0; /// Распарсенный запрос. Оттуда берутся некоторые настройки (формат). 
ASTPtr parsed_query; @@ -115,10 +121,10 @@ private: Stopwatch watch; - size_t rows_read_on_server; - size_t bytes_read_on_server; - size_t written_progress_chars; - bool written_first_block; + size_t rows_read_on_server = 0; + size_t bytes_read_on_server = 0; + size_t written_progress_chars = 0; + bool written_first_block = false; /// Информация о внешних таблицах std::list external_tables; @@ -755,12 +761,7 @@ private: void onData(Block & block) { if (written_progress_chars) - { - for (size_t i = 0; i < written_progress_chars; ++i) - std::cerr << "\b \b"; - - written_progress_chars = 0; - } + clearProgress(); if (!block) return; @@ -813,8 +814,18 @@ private: } + void clearProgress() + { + std::cerr << RESTORE_CURSOR_POSITION CLEAR_TO_END_OF_LINE; + written_progress_chars = 0; + } + + void writeProgress() { + if (!is_interactive) + return; + static size_t increment = 0; static const char * indicators[8] = { @@ -825,30 +836,30 @@ private: "\033[1;34m←\033[0m", "\033[1;35m↖\033[0m", "\033[1;36m↑\033[0m", - "\033[1;37m↗\033[0m", + "\033[1m↗\033[0m", }; - if (is_interactive) - { - std::cerr << std::string(written_progress_chars, '\b'); + if (written_progress_chars) + clearProgress(); + else + std::cerr << SAVE_CURSOR_POSITION; - std::stringstream message; - message << indicators[increment % 8] - << std::fixed << std::setprecision(3) - << " Progress: " << rows_read_on_server << " rows, " << bytes_read_on_server / 1000000.0 << " MB"; + std::stringstream message; + message << indicators[increment % 8] + << std::fixed << std::setprecision(3) + << " Progress: " << rows_read_on_server << " rows, " << bytes_read_on_server / 1000000.0 << " MB"; - size_t elapsed_ns = watch.elapsed(); - if (elapsed_ns) - message << " (" - << rows_read_on_server * 1000000000.0 / elapsed_ns << " rows/s., " - << bytes_read_on_server * 1000.0 / elapsed_ns << " MB/s.) "; - else - message << ". 
"; + size_t elapsed_ns = watch.elapsed(); + if (elapsed_ns) + message << " (" + << rows_read_on_server * 1000000000.0 / elapsed_ns << " rows/s., " + << bytes_read_on_server * 1000.0 / elapsed_ns << " MB/s.) "; + else + message << ". "; - written_progress_chars = message.str().size() - 13; - std::cerr << message.rdbuf(); - ++increment; - } + written_progress_chars = message.str().size() - 13; + std::cerr << DISABLE_LINE_WRAPPING << message.rdbuf() << ENABLE_LINE_WRAPPING; + ++increment; } diff --git a/dbms/src/DataStreams/PrettyBlockOutputStream.cpp b/dbms/src/DataStreams/PrettyBlockOutputStream.cpp index 01f4f86ce72..bbcbac06065 100644 --- a/dbms/src/DataStreams/PrettyBlockOutputStream.cpp +++ b/dbms/src/DataStreams/PrettyBlockOutputStream.cpp @@ -140,7 +140,7 @@ void PrettyBlockOutputStream::write(const Block & block_) const ColumnWithNameAndType & col = block.getByPosition(i); if (!no_escapes) - writeCString("\033[1;37m", ostr); + writeCString("\033[1m", ostr); if (col.type->isNumeric()) { diff --git a/dbms/src/DataStreams/PrettyCompactBlockOutputStream.cpp b/dbms/src/DataStreams/PrettyCompactBlockOutputStream.cpp index 4ac18c596e8..530a1ef12f2 100644 --- a/dbms/src/DataStreams/PrettyCompactBlockOutputStream.cpp +++ b/dbms/src/DataStreams/PrettyCompactBlockOutputStream.cpp @@ -29,7 +29,7 @@ void PrettyCompactBlockOutputStream::writeHeader( writeCString("─", ostr); if (!no_escapes) - writeCString("\033[1;37m", ostr); + writeCString("\033[1m", ostr); writeEscapedString(col.name, ostr); if (!no_escapes) writeCString("\033[0m", ostr); @@ -37,7 +37,7 @@ void PrettyCompactBlockOutputStream::writeHeader( else { if (!no_escapes) - writeCString("\033[1;37m", ostr); + writeCString("\033[1m", ostr); writeEscapedString(col.name, ostr); if (!no_escapes) writeCString("\033[0m", ostr); @@ -75,7 +75,7 @@ void PrettyCompactBlockOutputStream::writeRow( const Widths_t & name_widths) { size_t columns = max_widths.size(); - + writeCString("│ ", ostr); for (size_t j = 0; j < columns; 
++j) @@ -90,7 +90,7 @@ void PrettyCompactBlockOutputStream::writeRow( size_t width = get((*block.getByPosition(columns + j).column)[row_id]); for (size_t k = 0; k < max_widths[j] - width; ++k) writeChar(' ', ostr); - + col.type->serializeTextEscaped((*col.column)[row_id], ostr); } else @@ -113,16 +113,16 @@ void PrettyCompactBlockOutputStream::write(const Block & block_) total_rows += block_.rows(); return; } - + /// Будем вставлять сюда столбцы с вычисленными значениями видимых длин. Block block = block_; - + size_t rows = block.rows(); Widths_t max_widths; Widths_t name_widths; calculateWidths(block, max_widths, name_widths); - + writeHeader(block, max_widths, name_widths); for (size_t i = 0; i < rows && total_rows + i < max_rows; ++i) diff --git a/dbms/src/DataStreams/PrettySpaceBlockOutputStream.cpp b/dbms/src/DataStreams/PrettySpaceBlockOutputStream.cpp index a26b0e95e39..8c92367eebd 100644 --- a/dbms/src/DataStreams/PrettySpaceBlockOutputStream.cpp +++ b/dbms/src/DataStreams/PrettySpaceBlockOutputStream.cpp @@ -17,10 +17,10 @@ void PrettySpaceBlockOutputStream::write(const Block & block_) total_rows += block_.rows(); return; } - + /// Будем вставлять суда столбцы с вычисленными значениями видимых длин. 
Block block = block_; - + size_t rows = block.rows(); size_t columns = block.columns(); @@ -48,7 +48,7 @@ void PrettySpaceBlockOutputStream::write(const Block & block_) writeChar(' ', ostr); if (!no_escapes) - writeCString("\033[1;37m", ostr); + writeCString("\033[1m", ostr); writeEscapedString(col.name, ostr); if (!no_escapes) writeCString("\033[0m", ostr); @@ -56,7 +56,7 @@ void PrettySpaceBlockOutputStream::write(const Block & block_) else { if (!no_escapes) - writeCString("\033[1;37m", ostr); + writeCString("\033[1m", ostr); writeEscapedString(col.name, ostr); if (!no_escapes) writeCString("\033[0m", ostr); @@ -81,7 +81,7 @@ void PrettySpaceBlockOutputStream::write(const Block & block_) size_t width = get((*block.getByPosition(columns + j).column)[i]); for (ssize_t k = 0; k < std::max(0L, static_cast(max_widths[j] - width)); ++k) writeChar(' ', ostr); - + col.type->serializeTextEscaped((*col.column)[i], ostr); } else diff --git a/dbms/src/Parsers/formatAST.cpp b/dbms/src/Parsers/formatAST.cpp index 050bff00caf..7b696b49510 100644 --- a/dbms/src/Parsers/formatAST.cpp +++ b/dbms/src/Parsers/formatAST.cpp @@ -19,7 +19,7 @@ namespace DB { -static const char * hilite_keyword = "\033[1;37m"; +static const char * hilite_keyword = "\033[1m"; static const char * hilite_identifier = "\033[0;36m"; static const char * hilite_function = "\033[0;33m"; static const char * hilite_operator = "\033[1;33m"; From f531960561646eeffa5682e9a1b50e7a900d76ca Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 16 Aug 2014 00:10:44 +0400 Subject: [PATCH 077/127] dbms: fixed visibility of unreplicated parts [#METR-12738]. 
--- dbms/src/Storages/StorageReplicatedMergeTree.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index a77059bcd7d..ec9d0410297 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -101,9 +101,13 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( if (Poco::File(unreplicated_path).exists()) { LOG_INFO(log, "Have unreplicated data"); + unreplicated_data.reset(new MergeTreeData(unreplicated_path, columns_, context_, primary_expr_ast_, date_column_name_, sampling_expression_, index_granularity_, mode_, sign_column_, settings_, database_name_ + "." + table_name + "[unreplicated]", false)); + + unreplicated_data->loadDataParts(skip_sanity_checks); + unreplicated_reader.reset(new MergeTreeDataSelectExecutor(*unreplicated_data)); unreplicated_merger.reset(new MergeTreeDataMerger(*unreplicated_data)); } From 35cdf281101aad389cc1a2f84950ca331927ed61 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 17 Aug 2014 03:18:37 +0400 Subject: [PATCH 078/127] dbms: improved performance of filtering [#METR-2944]. --- dbms/include/DB/Columns/ColumnConst.h | 17 ++--- dbms/include/DB/Columns/IColumn.h | 4 + dbms/src/Columns/IColumn.cpp | 45 +++++++++++ .../DataStreams/FilterBlockInputStream.cpp | 76 +++++++++++-------- 4 files changed, 99 insertions(+), 43 deletions(-) create mode 100644 dbms/src/Columns/IColumn.cpp diff --git a/dbms/include/DB/Columns/ColumnConst.h b/dbms/include/DB/Columns/ColumnConst.h index d6b642839ec..c44300537d6 100644 --- a/dbms/include/DB/Columns/ColumnConst.h +++ b/dbms/include/DB/Columns/ColumnConst.h @@ -32,11 +32,11 @@ class ColumnConst final : public IColumnConst public: typedef T Type; typedef typename NearestFieldType::Type FieldType; - + /// Для ColumnConst data_type_ должен быть ненулевым. 
/// Для ColumnConst data_type_ должен быть ненулевым, если тип данных FixedString. ColumnConst(size_t s_, const T & data_, DataTypePtr data_type_ = DataTypePtr()) : s(s_), data(data_), data_type(data_type_) {} - + std::string getName() const { return "ColumnConst<" + TypeName::get() + ">"; } bool isNumeric() const { return IsNumber::value; } bool isFixed() const { return IsNumber::value; } @@ -50,7 +50,7 @@ public: { return new ColumnConst(length, data, data_type); } - + void insert(const Field & x) { if (x.get() != FieldType(data)) @@ -71,20 +71,15 @@ public: ErrorCodes::CANNOT_INSERT_ELEMENT_INTO_CONSTANT_COLUMN); ++s; } - + void insertDefault() { ++s; } ColumnPtr filter(const Filter & filt) const { if (s != filt.size()) throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); - - size_t new_size = 0; - for (Filter::const_iterator it = filt.begin(); it != filt.end(); ++it) - if (*it) - ++new_size; - - return new ColumnConst(new_size, data, data_type); + + return new ColumnConst(countBytesInFilter(filt), data, data_type); } ColumnPtr replicate(const Offsets_t & offsets) const diff --git a/dbms/include/DB/Columns/IColumn.h b/dbms/include/DB/Columns/IColumn.h index cab7bf1a645..8a448e4bb01 100644 --- a/dbms/include/DB/Columns/IColumn.h +++ b/dbms/include/DB/Columns/IColumn.h @@ -198,4 +198,8 @@ public: }; +/// Считает, сколько байт в filt больше нуля. +size_t countBytesInFilter(const IColumn::Filter & filt); + + } diff --git a/dbms/src/Columns/IColumn.cpp b/dbms/src/Columns/IColumn.cpp new file mode 100644 index 00000000000..e348cf79a3b --- /dev/null +++ b/dbms/src/Columns/IColumn.cpp @@ -0,0 +1,45 @@ +#include + +#include + + +namespace DB +{ + +size_t countBytesInFilter(const IColumn::Filter & filt) +{ + size_t count = 0; + + /** NOTE: По идее, filt должен содержать только нолики и единички. + * Но, на всякий случай, здесь используется условие > 0 (на знаковые байты). 
+ * Лучше было бы использовать != 0, то это не позволяет SSE2. + */ + + const __m128i zero16 = _mm_set1_epi8(0); + + const Int8 * pos = reinterpret_cast(&filt[0]); + const Int8 * end = pos + filt.size(); + const Int8 * end64 = pos + filt.size() / 64 * 64; + + for (; pos < end64; pos += 64) + count += __builtin_popcount( + static_cast(_mm_movemask_epi8(_mm_cmpgt_epi8( + _mm_loadu_si128(reinterpret_cast(pos)), + zero16))) + | (static_cast(_mm_movemask_epi8(_mm_cmpgt_epi8( + _mm_loadu_si128(reinterpret_cast(pos + 16)), + zero16))) << 16) + | (static_cast(_mm_movemask_epi8(_mm_cmpgt_epi8( + _mm_loadu_si128(reinterpret_cast(pos + 32)), + zero16))) << 32) + | (static_cast(_mm_movemask_epi8(_mm_cmpgt_epi8( + _mm_loadu_si128(reinterpret_cast(pos + 48)), + zero16))) << 48)); + + for (; pos < end; ++pos) + count += *pos > 0; + + return count; +} + +} diff --git a/dbms/src/DataStreams/FilterBlockInputStream.cpp b/dbms/src/DataStreams/FilterBlockInputStream.cpp index fef2e4566e6..3a36d42b710 100644 --- a/dbms/src/DataStreams/FilterBlockInputStream.cpp +++ b/dbms/src/DataStreams/FilterBlockInputStream.cpp @@ -41,7 +41,7 @@ Block FilterBlockInputStream::readImpl() /** Если фильтр - константа (например, написано WHERE 1), * то либо вернём пустой блок, либо вернём блок без изменений. */ - ColumnConstUInt8 * column_const = typeid_cast(&*column); + const ColumnConstUInt8 * column_const = typeid_cast(&*column); if (column_const) { if (!column_const->getData()) @@ -50,52 +50,64 @@ Block FilterBlockInputStream::readImpl() return res; } - ColumnUInt8 * column_vec = typeid_cast(&*column); + const ColumnUInt8 * column_vec = typeid_cast(&*column); if (!column_vec) throw Exception("Illegal type " + column->getName() + " of column for filter. Must be ColumnUInt8 or ColumnConstUInt8.", ErrorCodes::ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER); - IColumn::Filter & filter = column_vec->getData(); + const IColumn::Filter & filter = column_vec->getData(); - /// Если кроме столбца с фильтром ничего нет. 
- if (columns == 1) - { - /// То посчитаем в нём количество единичек. - size_t filtered_rows = 0; - for (size_t i = 0, size = filter.size(); i < size; ++i) - if (filter[i]) - ++filtered_rows; - - /// Если текущий блок полностью отфильтровался - перейдём к следующему. - if (filtered_rows == 0) - continue; - - /// Заменяем этот столбец на столбец с константой 1, нужного размера. - res.getByPosition(filter_column).column = new ColumnConstUInt8(filtered_rows, 1); - - return res; - } - - /// Общий случай - фильтруем остальные столбцы. + /** Выясним, сколько строк будет в результате. + * Для этого отфильтруем первый попавшийся неконстантный столбец + * или же посчитаем количество выставленных байт в фильтре. + */ + size_t first_non_constant_column = 0; for (size_t i = 0; i < columns; ++i) { - if (i != static_cast(filter_column)) + if (!res.getByPosition(i).column->isConst()) { - ColumnWithNameAndType & current_column = res.getByPosition(i); - current_column.column = current_column.column->filter(filter); - if (current_column.column->empty()) + first_non_constant_column = i; + + if (first_non_constant_column != static_cast(filter_column)) break; } } - /// Любой столбец - не являющийся фильтром. - IColumn & any_not_filter_column = *res.getByPosition(filter_column == 0 ? 1 : 0).column; + size_t filtered_rows = 0; + if (first_non_constant_column != static_cast(filter_column)) + { + ColumnWithNameAndType & current_column = res.getByPosition(first_non_constant_column); + current_column.column = current_column.column->filter(filter); + filtered_rows = current_column.column->size(); + } + else + { + filtered_rows = countBytesInFilter(filter); + } /// Если текущий блок полностью отфильтровался - перейдём к следующему. - if (any_not_filter_column.empty()) + if (filtered_rows == 0) continue; - /// Сам столбец с фильтром заменяем на столбец с константой 1, так как после фильтрации в нём ничего другого не останется. 
- res.getByPosition(filter_column).column = new ColumnConstUInt8(any_not_filter_column.size(), 1); + /// Фильтруем остальные столбцы. + for (size_t i = 0; i < columns; ++i) + { + ColumnWithNameAndType & current_column = res.getByPosition(i); + + if (i == static_cast(filter_column)) + { + /// Сам столбец с фильтром заменяем на столбец с константой 1, так как после фильтрации в нём ничего другого не останется. + current_column.column = new ColumnConstUInt8(filtered_rows, 1); + continue; + } + + if (i == first_non_constant_column) + continue; + + if (current_column.column->isConst()) + current_column.column = current_column.column->cut(0, filtered_rows); + else + current_column.column = current_column.column->filter(filter); + } return res; } From cdf4e8d4156964fc19c8084893e7be1272aa0a97 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 17 Aug 2014 06:13:40 +0400 Subject: [PATCH 079/127] dbms: improved performance of integer division by constant [#METR-2944]. --- .../DB/Functions/FunctionsArithmetic.h | 143 +- libs/libdivide/libdivide.h | 1332 +++++++++++++++++ 2 files changed, 1474 insertions(+), 1 deletion(-) create mode 100644 libs/libdivide/libdivide.h diff --git a/dbms/include/DB/Functions/FunctionsArithmetic.h b/dbms/include/DB/Functions/FunctionsArithmetic.h index 45c6834f316..eed06e4135a 100644 --- a/dbms/include/DB/Functions/FunctionsArithmetic.h +++ b/dbms/include/DB/Functions/FunctionsArithmetic.h @@ -14,7 +14,7 @@ namespace DB */ template -struct BinaryOperationImpl +struct BinaryOperationImplBase { typedef typename Op::ResultType ResultType; @@ -45,6 +45,11 @@ struct BinaryOperationImpl } }; +template +struct BinaryOperationImpl : BinaryOperationImplBase +{ +}; + template struct UnaryOperationImpl { @@ -570,4 +575,140 @@ typedef FunctionBinaryArithmetic Functi +/// Оптимизации для целочисленного деления на константу. 
+ +#define LIBDIVIDE_USE_SSE2 1 +#include + + +template +struct DivideIntegralByConstantImpl + : BinaryOperationImplBase> +{ + typedef typename DivideIntegralImpl::ResultType ResultType; + + static void vector_constant(const PODArray & a, B b, PODArray & c) + { + if (unlikely(b == 0)) + throw Exception("Division by zero", ErrorCodes::ILLEGAL_DIVISION); + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsign-compare" + + if (unlikely(std::is_signed::value && b == -1)) + { + size_t size = a.size(); + for (size_t i = 0; i < size; ++i) + c[i] = -c[i]; + return; + } + +#pragma GCC diagnostic pop + + libdivide::divider divider(b); + + size_t size = a.size(); + const A * a_pos = &a[0]; + const A * a_end = a_pos + size; + ResultType * c_pos = &c[0]; + static constexpr size_t values_per_sse_register = 16 / sizeof(A); + const A * a_end_sse = a_pos + size / values_per_sse_register * values_per_sse_register; + + while (a_pos < a_end_sse) + { + _mm_storeu_si128(reinterpret_cast<__m128i *>(c_pos), + _mm_loadu_si128(reinterpret_cast(a_pos)) / divider); + + a_pos += values_per_sse_register; + c_pos += values_per_sse_register; + } + + while (a_pos < a_end) + { + *c_pos = *a_pos / divider; + ++a_pos; + ++c_pos; + } + } +}; + +template +struct ModuloByConstantImpl + : BinaryOperationImplBase> +{ + typedef typename ModuloImpl::ResultType ResultType; + + static void vector_constant(const PODArray & a, B b, PODArray & c) + { + if (unlikely(b == 0)) + throw Exception("Division by zero", ErrorCodes::ILLEGAL_DIVISION); + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsign-compare" + + if (unlikely((std::is_signed::value && b == -1) || b == 1)) + { + size_t size = a.size(); + for (size_t i = 0; i < size; ++i) + c[i] = 0; + return; + } + +#pragma GCC diagnostic pop + + libdivide::divider divider(b); + + /// Тут не удалось сделать так, чтобы SSE вариант из libdivide давал преимущество. 
+ size_t size = a.size(); + for (size_t i = 0; i < size; ++i) + c[i] = a[i] - (a[i] / divider) * b; /// NOTE: возможно, не сохраняется семантика деления с остатком отрицательных чисел. + } +}; + + +/** Прописаны специализации для деления чисел типа UInt64 и UInt32 на числа той же знаковости. + * Можно дополнить до всех возможных комбинаций, но потребуется больше кода. + */ + +template <> struct BinaryOperationImpl> : DivideIntegralByConstantImpl {}; +template <> struct BinaryOperationImpl> : DivideIntegralByConstantImpl {}; +template <> struct BinaryOperationImpl> : DivideIntegralByConstantImpl {}; +template <> struct BinaryOperationImpl> : DivideIntegralByConstantImpl {}; + +template <> struct BinaryOperationImpl> : DivideIntegralByConstantImpl {}; +template <> struct BinaryOperationImpl> : DivideIntegralByConstantImpl {}; +template <> struct BinaryOperationImpl> : DivideIntegralByConstantImpl {}; +template <> struct BinaryOperationImpl> : DivideIntegralByConstantImpl {}; + +template <> struct BinaryOperationImpl> : DivideIntegralByConstantImpl {}; +template <> struct BinaryOperationImpl> : DivideIntegralByConstantImpl {}; +template <> struct BinaryOperationImpl> : DivideIntegralByConstantImpl {}; +template <> struct BinaryOperationImpl> : DivideIntegralByConstantImpl {}; + +template <> struct BinaryOperationImpl> : DivideIntegralByConstantImpl {}; +template <> struct BinaryOperationImpl> : DivideIntegralByConstantImpl {}; +template <> struct BinaryOperationImpl> : DivideIntegralByConstantImpl {}; +template <> struct BinaryOperationImpl> : DivideIntegralByConstantImpl {}; + + +template <> struct BinaryOperationImpl> : ModuloByConstantImpl {}; +template <> struct BinaryOperationImpl> : ModuloByConstantImpl {}; +template <> struct BinaryOperationImpl> : ModuloByConstantImpl {}; +template <> struct BinaryOperationImpl> : ModuloByConstantImpl {}; + +template <> struct BinaryOperationImpl> : ModuloByConstantImpl {}; +template <> struct BinaryOperationImpl> : 
ModuloByConstantImpl {}; +template <> struct BinaryOperationImpl> : ModuloByConstantImpl {}; +template <> struct BinaryOperationImpl> : ModuloByConstantImpl {}; + +template <> struct BinaryOperationImpl> : ModuloByConstantImpl {}; +template <> struct BinaryOperationImpl> : ModuloByConstantImpl {}; +template <> struct BinaryOperationImpl> : ModuloByConstantImpl {}; +template <> struct BinaryOperationImpl> : ModuloByConstantImpl {}; + +template <> struct BinaryOperationImpl> : ModuloByConstantImpl {}; +template <> struct BinaryOperationImpl> : ModuloByConstantImpl {}; +template <> struct BinaryOperationImpl> : ModuloByConstantImpl {}; +template <> struct BinaryOperationImpl> : ModuloByConstantImpl {}; + } diff --git a/libs/libdivide/libdivide.h b/libs/libdivide/libdivide.h new file mode 100644 index 00000000000..52b7d5638d3 --- /dev/null +++ b/libs/libdivide/libdivide.h @@ -0,0 +1,1332 @@ +/* libdivide.h + Copyright 2010 ridiculous_fish +*/ + +#if defined(_WIN32) || defined(WIN32) +#define LIBDIVIDE_WINDOWS 1 +#endif + +#if defined(_MSC_VER) +#define LIBDIVIDE_VC 1 +#endif + +#ifdef __cplusplus +#include +#include +#include +#else +#include +#include +#include +#endif + +#if ! LIBDIVIDE_HAS_STDINT_TYPES && (! LIBDIVIDE_VC || _MSC_VER >= 1600) +/* Only Visual C++ 2010 and later include stdint.h */ +#include +#define LIBDIVIDE_HAS_STDINT_TYPES 1 +#endif + +#if ! LIBDIVIDE_HAS_STDINT_TYPES +typedef __int32 int32_t; +typedef unsigned __int32 uint32_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +typedef __int8 int8_t; +typedef unsigned __int8 uint8_t; +#endif + +#if LIBDIVIDE_USE_SSE2 + #include +#endif + +#if LIBDIVIDE_VC + #include +#endif + +#ifndef __has_builtin +#define __has_builtin(x) 0 // Compatibility with non-clang compilers. 
+#endif + +#ifdef __ICC +#define HAS_INT128_T 0 +#else +#define HAS_INT128_T __LP64__ +#endif + +#if defined(__x86_64__) || defined(_WIN64) || defined(_M_64) +#define LIBDIVIDE_IS_X86_64 1 +#endif + +#if defined(__i386__) +#define LIBDIVIDE_IS_i386 1 +#endif + +#if __GNUC__ || __clang__ +#define LIBDIVIDE_GCC_STYLE_ASM 1 +#endif + + +/* libdivide may use the pmuldq (vector signed 32x32->64 mult instruction) which is in SSE 4.1. However, signed multiplication can be emulated efficiently with unsigned multiplication, and SSE 4.1 is currently rare, so it is OK to not turn this on */ +#ifdef LIBDIVIDE_USE_SSE4_1 +#include +#endif + +#ifdef __cplusplus +/* We place libdivide within the libdivide namespace, and that goes in an anonymous namespace so that the functions are only visible to files that #include this header and don't get external linkage. At least that's the theory. */ +namespace { +namespace libdivide { +#endif + +/* Explanation of "more" field: bit 6 is whether to use shift path. If we are using the shift path, bit 7 is whether the divisor is negative in the signed case; in the unsigned case it is 0. Bits 0-4 is shift value (for shift path or mult path). In 32 bit case, bit 5 is always 0. We use bit 7 as the "negative divisor indicator" so that we can use sign extension to efficiently go to a full-width -1. + + +u32: [0-4] shift value + [5] ignored + [6] add indicator + [7] shift path + +s32: [0-4] shift value + [5] shift path + [6] add indicator + [7] indicates negative divisor + +u64: [0-5] shift value + [6] add indicator + [7] shift path + +s64: [0-5] shift value + [6] add indicator + [7] indicates negative divisor + magic number of 0 indicates shift path (we ran out of bits!) 
+*/ + +enum { + LIBDIVIDE_32_SHIFT_MASK = 0x1F, + LIBDIVIDE_64_SHIFT_MASK = 0x3F, + LIBDIVIDE_ADD_MARKER = 0x40, + LIBDIVIDE_U32_SHIFT_PATH = 0x80, + LIBDIVIDE_U64_SHIFT_PATH = 0x80, + LIBDIVIDE_S32_SHIFT_PATH = 0x20, + LIBDIVIDE_NEGATIVE_DIVISOR = 0x80 +}; + + +struct libdivide_u32_t { + uint32_t magic; + uint8_t more; +}; + +struct libdivide_s32_t { + int32_t magic; + uint8_t more; +}; + +struct libdivide_u64_t { + uint64_t magic; + uint8_t more; +}; + +struct libdivide_s64_t { + int64_t magic; + uint8_t more; +}; + + + +#ifndef LIBDIVIDE_API + #ifdef __cplusplus + /* In C++, we don't want our public functions to be static, because they are arguments to templates and static functions can't do that. They get internal linkage through virtue of the anonymous namespace. In C, they should be static. */ + #define LIBDIVIDE_API + #else + #define LIBDIVIDE_API static + #endif +#endif + + +LIBDIVIDE_API struct libdivide_s32_t libdivide_s32_gen(int32_t y); +LIBDIVIDE_API struct libdivide_u32_t libdivide_u32_gen(uint32_t y); +LIBDIVIDE_API struct libdivide_s64_t libdivide_s64_gen(int64_t y); +LIBDIVIDE_API struct libdivide_u64_t libdivide_u64_gen(uint64_t y); + +LIBDIVIDE_API int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom); +LIBDIVIDE_API uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom); +LIBDIVIDE_API int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom); +LIBDIVIDE_API uint64_t libdivide_u64_do(uint64_t y, const struct libdivide_u64_t *denom); + +LIBDIVIDE_API int libdivide_u32_get_algorithm(const struct libdivide_u32_t *denom); +LIBDIVIDE_API uint32_t libdivide_u32_do_alg0(uint32_t numer, const struct libdivide_u32_t *denom); +LIBDIVIDE_API uint32_t libdivide_u32_do_alg1(uint32_t numer, const struct libdivide_u32_t *denom); +LIBDIVIDE_API uint32_t libdivide_u32_do_alg2(uint32_t numer, const struct libdivide_u32_t *denom); + +LIBDIVIDE_API int libdivide_u64_get_algorithm(const struct 
libdivide_u64_t *denom); +LIBDIVIDE_API uint64_t libdivide_u64_do_alg0(uint64_t numer, const struct libdivide_u64_t *denom); +LIBDIVIDE_API uint64_t libdivide_u64_do_alg1(uint64_t numer, const struct libdivide_u64_t *denom); +LIBDIVIDE_API uint64_t libdivide_u64_do_alg2(uint64_t numer, const struct libdivide_u64_t *denom); + +LIBDIVIDE_API int libdivide_s32_get_algorithm(const struct libdivide_s32_t *denom); +LIBDIVIDE_API int32_t libdivide_s32_do_alg0(int32_t numer, const struct libdivide_s32_t *denom); +LIBDIVIDE_API int32_t libdivide_s32_do_alg1(int32_t numer, const struct libdivide_s32_t *denom); +LIBDIVIDE_API int32_t libdivide_s32_do_alg2(int32_t numer, const struct libdivide_s32_t *denom); +LIBDIVIDE_API int32_t libdivide_s32_do_alg3(int32_t numer, const struct libdivide_s32_t *denom); +LIBDIVIDE_API int32_t libdivide_s32_do_alg4(int32_t numer, const struct libdivide_s32_t *denom); + +LIBDIVIDE_API int libdivide_s64_get_algorithm(const struct libdivide_s64_t *denom); +LIBDIVIDE_API int64_t libdivide_s64_do_alg0(int64_t numer, const struct libdivide_s64_t *denom); +LIBDIVIDE_API int64_t libdivide_s64_do_alg1(int64_t numer, const struct libdivide_s64_t *denom); +LIBDIVIDE_API int64_t libdivide_s64_do_alg2(int64_t numer, const struct libdivide_s64_t *denom); +LIBDIVIDE_API int64_t libdivide_s64_do_alg3(int64_t numer, const struct libdivide_s64_t *denom); +LIBDIVIDE_API int64_t libdivide_s64_do_alg4(int64_t numer, const struct libdivide_s64_t *denom); + +#if LIBDIVIDE_USE_SSE2 +LIBDIVIDE_API __m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t * denom); +LIBDIVIDE_API __m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t * denom); +LIBDIVIDE_API __m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t * denom); +LIBDIVIDE_API __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t * denom); + +LIBDIVIDE_API __m128i libdivide_u32_do_vector_alg0(__m128i numers, const struct 
//////// Internal Utility Functions

/* Returns the upper 32 bits of the 64 bit product x * y. */
static inline uint32_t libdivide__mullhi_u32(uint32_t x, uint32_t y) {
    const uint64_t product = (uint64_t)x * (uint64_t)y;
    return (uint32_t)(product >> 32);
}

/* Returns the upper 64 bits of the unsigned 128 bit product x * y. */
static uint64_t libdivide__mullhi_u64(uint64_t x, uint64_t y) {
#if HAS_INT128_T
    const __uint128_t product = (__uint128_t)x * (__uint128_t)y;
    return (uint64_t)(product >> 64);
#else
    /* Schoolbook multiplication on 32 bit halves:
       x * y = x_lo*y_lo + ((x_lo*y_hi + x_hi*y_lo) << 32) + (x_hi*y_hi << 64) */
    const uint64_t low_mask = 0xFFFFFFFF;
    const uint64_t x_lo = x & low_mask, x_hi = x >> 32;
    const uint64_t y_lo = y & low_mask, y_hi = y >> 32;

    /* Only the carry out of the low*low partial product matters. */
    const uint64_t carry_from_lo = libdivide__mullhi_u32((uint32_t)x_lo, (uint32_t)y_lo);

    /* Accumulate the two cross terms one at a time so that no intermediate sum exceeds 64 bits. */
    const uint64_t cross1 = x_hi * y_lo + carry_from_lo;
    const uint64_t cross2 = x_lo * y_hi + (cross1 & low_mask);

    return x_hi * y_hi + (cross1 >> 32) + (cross2 >> 32);
#endif
}

/* Returns the upper 64 bits of the signed 128 bit product x * y. */
static inline int64_t libdivide__mullhi_s64(int64_t x, int64_t y) {
#if HAS_INT128_T
    const __int128_t product = (__int128_t)x * (__int128_t)y;
    return (int64_t)(product >> 64);
#else
    /* Same decomposition as the unsigned version, except that the high halves are signed
       and the right shifts below rely on being arithmetic. */
    const uint32_t low_mask = 0xFFFFFFFF;
    const uint32_t x_lo = (uint32_t)(x & low_mask);
    const uint32_t y_lo = (uint32_t)(y & low_mask);
    const int32_t x_hi = (int32_t)(x >> 32);
    const int32_t y_hi = (int32_t)(y >> 32);

    const int64_t cross1 = x_hi * (int64_t)y_lo + libdivide__mullhi_u32(x_lo, y_lo);
    const int64_t cross2 = x_lo * (int64_t)y_hi + (cross1 & low_mask);

    return x_hi * (int64_t)y_hi + (cross1 >> 32) + (cross2 >> 32);
#endif
}
/* Returns a vector whose four 32 bit lanes each hold 0x0000FFFF, i.e. the same as
   _mm_set1_epi32(0x0000FFFF), without touching memory. */
static inline __m128i libdivide_get_0000FFFF(void) {
    // Start from a well-defined all-ones value, exactly like the two helpers above.
    // Comparing an uninitialized register with itself reads an indeterminate value,
    // which is undefined behavior and triggers -Wuninitialized.
    __m128i result = _mm_set1_epi8(-1);
    result = _mm_srli_epi32(result, 16); // clear the upper 16 bits of every 32 bit lane
    return result;
}
Another option would be to use _mm_srli_epi64(v, 63) and then subtract that from 0, but that approach appears to be substantially slower for unknown reasons + __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); + __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31); + return signBits; +} + +/* Returns an __m128i whose low 32 bits are equal to amt and has zero elsewhere. */ +static inline __m128i libdivide_u32_to_m128i(uint32_t amt) { + return _mm_set_epi32(0, 0, 0, amt); +} + +static inline __m128i libdivide_s64_shift_right_vector(__m128i v, int amt) { + //implementation of _mm_sra_epi64. Here we have two 64 bit values which are shifted right to logically become (64 - amt) values, and are then sign extended from a (64 - amt) bit number. + const int b = 64 - amt; + __m128i m = libdivide__u64_to_m128(1ULL << (b - 1)); + __m128i x = _mm_srl_epi64(v, libdivide_u32_to_m128i(amt)); + __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m); //result = x^m - m + return result; +} + +/* Here, b is assumed to contain one 32 bit value repeated four times. If it did not, the function would not work. */ +static inline __m128i libdivide__mullhi_u32_flat_vector(__m128i a, __m128i b) { + __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32); + __m128i a1X3X = _mm_srli_epi64(a, 32); + __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epu32(a1X3X, b), libdivide_get_FFFFFFFF00000000()); + return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3); // = hi_product_0123 +} + + +/* Here, y is assumed to contain one 64 bit value repeated twice. 
*/ +static inline __m128i libdivide_mullhi_u64_flat_vector(__m128i x, __m128i y) { + //full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64) + const __m128i mask = libdivide_get_00000000FFFFFFFF(); + const __m128i x0 = _mm_and_si128(x, mask), x1 = _mm_srli_epi64(x, 32); //x0 is low half of 2 64 bit values, x1 is high half in low slots + const __m128i y0 = _mm_and_si128(y, mask), y1 = _mm_srli_epi64(y, 32); + const __m128i x0y0_hi = _mm_srli_epi64(_mm_mul_epu32(x0, y0), 32); //x0 happens to have the low half of the two 64 bit values in 32 bit slots 0 and 2, so _mm_mul_epu32 computes their full product, and then we shift right by 32 to get just the high values + const __m128i x0y1 = _mm_mul_epu32(x0, y1); + const __m128i x1y0 = _mm_mul_epu32(x1, y0); + const __m128i x1y1 = _mm_mul_epu32(x1, y1); + + const __m128i temp = _mm_add_epi64(x1y0, x0y0_hi); + __m128i temp_lo = _mm_and_si128(temp, mask), temp_hi = _mm_srli_epi64(temp, 32); + temp_lo = _mm_srli_epi64(_mm_add_epi64(temp_lo, x0y1), 32); + temp_hi = _mm_add_epi64(x1y1, temp_hi); + + return _mm_add_epi64(temp_lo, temp_hi); +} + +/* y is one 64 bit value repeated twice */ +static inline __m128i libdivide_mullhi_s64_flat_vector(__m128i x, __m128i y) { + __m128i p = libdivide_mullhi_u64_flat_vector(x, y); + __m128i t1 = _mm_and_si128(libdivide_s64_signbits(x), y); + p = _mm_sub_epi64(p, t1); + __m128i t2 = _mm_and_si128(libdivide_s64_signbits(y), x); + p = _mm_sub_epi64(p, t2); + return p; +} + +#ifdef LIBDIVIDE_USE_SSE4_1 + +/* b is one 32 bit value repeated four times. 
/* Keeps the feature tests below valid preprocessor expressions on compilers that do not
   provide __has_builtin. Harmless if it is already defined. */
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif

/* Returns the number of trailing zero bits in val. val must be nonzero: the builtin is
   undefined for 0, the Visual C++ path returns 0 and the portable path returns 31. */
static inline int32_t libdivide__count_trailing_zeros32(uint32_t val) {
#if __GNUC__ || __has_builtin(__builtin_ctz)
    /* Fast way to count trailing zeros */
    return __builtin_ctz(val);
#elif LIBDIVIDE_VC
    unsigned long result;
    if (_BitScanForward(&result, val)) {
        return result;
    }
    return 0;
#else
    /* Portable way to count trailing zeros. */
    int32_t result = 0;
    val = (val ^ (val - 1)) >> 1;  // Set val's trailing 0s to 1s and zero the rest
    while (val) {
        val >>= 1;
        result++;
    }
    return result;
#endif
}

/* Returns the number of trailing zero bits in val. val must be nonzero. */
static inline int32_t libdivide__count_trailing_zeros64(uint64_t val) {
#if __LP64__ && (__GNUC__ || __has_builtin(__builtin_ctzll))
    /* Fast way to count trailing zeros. Note that we disable this in 32 bit because gcc does something horrible - it calls through to a dynamically bound function. */
    return __builtin_ctzll(val);
#elif LIBDIVIDE_VC && _WIN64
    unsigned long result;
    if (_BitScanForward64(&result, val)) {
        return result;
    }
    return 0;
#else
    /* Split into halves and reuse the 32 bit version. */
    uint32_t lo = val & 0xFFFFFFFF;
    if (lo != 0) return libdivide__count_trailing_zeros32(lo);
    return 32 + libdivide__count_trailing_zeros32(val >> 32);
#endif
}

/* Returns the number of leading zero bits in val. val must be nonzero. */
static inline int32_t libdivide__count_leading_zeros32(uint32_t val) {
#if __GNUC__ || __has_builtin(__builtin_clz)
    /* Fast way to count leading zeros. The feature test names the builtin that is actually called. */
    return __builtin_clz(val);
#elif LIBDIVIDE_VC
    unsigned long result;
    if (_BitScanReverse(&result, val)) {
        return 31 - result;
    }
    return 0;
#else
    /* Dorky way to count leading zeros. Note that this hangs for val = 0! */
    int32_t result = 0;
    while (! (val & (1U << 31))) {
        val <<= 1;
        result++;
    }
    return result;
#endif
}

/* Returns the number of leading zero bits in val. val must be nonzero. */
static inline int32_t libdivide__count_leading_zeros64(uint64_t val) {
#if __GNUC__ || __has_builtin(__builtin_clzll)
    /* Fast way to count leading zeros */
    return __builtin_clzll(val);
#elif LIBDIVIDE_VC && _WIN64
    unsigned long result;
    if (_BitScanReverse64(&result, val)) {
        return 63 - result;
    }
    return 0;
#else
    /* Dorky way to count leading zeros. Note that this hangs for val = 0! */
    int32_t result = 0;
    while (! (val & (1ULL << 63))) {
        val <<= 1;
        result++;
    }
    return result;
#endif
}

//libdivide_64_div_32_to_32: divides a 64 bit uint {u1, u0} by a 32 bit uint {v}. The result must fit in 32 bits. Returns the quotient directly and the remainder in *r
#if (LIBDIVIDE_IS_i386 || LIBDIVIDE_IS_X86_64) && LIBDIVIDE_GCC_STYLE_ASM
static uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) {
    uint32_t result;
    __asm__("divl %[v]"
            : "=a"(result), "=d"(*r)
            : [v] "r"(v), "a"(u0), "d"(u1)
            );
    return result;
}
#else
static uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) {
    uint64_t n = (((uint64_t)u1) << 32) | u0;
    uint32_t result = (uint32_t)(n / v);
    *r = (uint32_t)(n - result * (uint64_t)v);
    return result;
}
#endif

//libdivide_128_div_64_to_64: divides a 128 bit uint {u1, u0} by a 64 bit uint {v}. The result must fit in 64 bits. Returns the quotient directly and the remainder in *r
#if LIBDIVIDE_IS_X86_64 && LIBDIVIDE_GCC_STYLE_ASM
static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) {
    //u0 -> rax
    //u1 -> rdx
    //divq
    uint64_t result;
    __asm__("divq %[v]"
            : "=a"(result), "=d"(*r)
            : [v] "r"(v), "a"(u0), "d"(u1)
            );
    return result;

}
#else

/* Code taken from Hacker's Delight, http://www.hackersdelight.org/HDcode/divlu.c . License permits inclusion here per http://www.hackersdelight.org/permissions.htm
 */
static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) {
    const uint64_t b = (1ULL << 32); // Number base (32 bits).
    uint64_t un1, un0,        // Norm. dividend LSD's.
             vn1, vn0,        // Norm. divisor digits.
             q1, q0,          // Quotient digits.
             un64, un21, un10,// Dividend digit pairs.
             rhat;            // A remainder.
    int s;                    // Shift amount for norm.

    if (u1 >= v) {            // If overflow (this also covers v == 0), set rem.
        if (r != NULL)        // to an impossible value,
            *r = (uint64_t)(-1);  // and return the largest
        return (uint64_t)(-1);}   // possible quotient.

    /* count leading zeros */
    s = libdivide__count_leading_zeros64(v); // 0 <= s <= 63.
    if (s > 0) {
        v = v << s;           // Normalize divisor.
        un64 = (u1 << s) | (u0 >> (64 - s)); // s > 0 here, so the shift count is in 1..63
        un10 = u0 << s;       // Shift dividend left.
    } else {
        // s == 0: the divisor is already normalized and (u0 >> 64) would be undefined.
        // Nothing is shifted in from u0, so the high digit pair is exactly u1.
        un64 = u1;
        un10 = u0;
    }

    vn1 = v >> 32;            // Break divisor up into
    vn0 = v & 0xFFFFFFFF;     // two 32-bit digits.

    un1 = un10 >> 32;         // Break right half of
    un0 = un10 & 0xFFFFFFFF;  // dividend into two digits.

    q1 = un64/vn1;            // Compute the first
    rhat = un64 - q1*vn1;     // quotient digit, q1.
again1:
    if (q1 >= b || q1*vn0 > b*rhat + un1) {
        q1 = q1 - 1;
        rhat = rhat + vn1;
        if (rhat < b) goto again1;}

    un21 = un64*b + un1 - q1*v;  // Multiply and subtract.

    q0 = un21/vn1;            // Compute the second
    rhat = un21 - q0*vn1;     // quotient digit, q0.
again2:
    if (q0 >= b || q0*vn0 > b*rhat + un0) {
        q0 = q0 - 1;
        rhat = rhat + vn1;
        if (rhat < b) goto again2;}

    if (r != NULL)            // If remainder is wanted,
        *r = (un21*b + un0 - q0*v) >> s;     // return it.
    return q1*b + q0;
}
#endif
*/ + proposed_m += proposed_m; //don't care about overflow here - in fact, we expect it + const uint32_t twice_rem = rem + rem; + if (twice_rem >= d || twice_rem < rem) proposed_m += 1; + more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + } + result.magic = 1 + proposed_m; + result.more = more; + //result.more's shift should in general be ceil_log_2_d. But if we used the smaller power, we subtract one from the shift because we're using the smaller power. If we're using the larger power, we subtract one from the shift because it's taken care of by the add indicator. So floor_log_2_d happens to be correct in both cases. + + } + return result; +} + +uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) { + uint8_t more = denom->more; + if (more & LIBDIVIDE_U32_SHIFT_PATH) { + return numer >> (more & LIBDIVIDE_32_SHIFT_MASK); + } + else { + uint32_t q = libdivide__mullhi_u32(denom->magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + uint32_t t = ((numer - q) >> 1) + q; + return t >> (more & LIBDIVIDE_32_SHIFT_MASK); + } + else { + return q >> more; //all upper bits are 0 - don't need to mask them off + } + } +} + + +int libdivide_u32_get_algorithm(const struct libdivide_u32_t *denom) { + uint8_t more = denom->more; + if (more & LIBDIVIDE_U32_SHIFT_PATH) return 0; + else if (! 
(more & LIBDIVIDE_ADD_MARKER)) return 1; + else return 2; +} + +uint32_t libdivide_u32_do_alg0(uint32_t numer, const struct libdivide_u32_t *denom) { + return numer >> (denom->more & LIBDIVIDE_32_SHIFT_MASK); +} + +uint32_t libdivide_u32_do_alg1(uint32_t numer, const struct libdivide_u32_t *denom) { + uint32_t q = libdivide__mullhi_u32(denom->magic, numer); + return q >> denom->more; +} + +uint32_t libdivide_u32_do_alg2(uint32_t numer, const struct libdivide_u32_t *denom) { + // denom->add != 0 + uint32_t q = libdivide__mullhi_u32(denom->magic, numer); + uint32_t t = ((numer - q) >> 1) + q; + return t >> (denom->more & LIBDIVIDE_32_SHIFT_MASK); +} + + + + +#if LIBDIVIDE_USE_SSE2 +__m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom) { + uint8_t more = denom->more; + if (more & LIBDIVIDE_U32_SHIFT_PATH) { + return _mm_srl_epi32(numers, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK)); + } + else { + __m128i q = libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + //uint32_t t = ((numer - q) >> 1) + q; + //return t >> denom->shift; + __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); + return _mm_srl_epi32(t, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK)); + + } + else { + //q >> denom->shift + return _mm_srl_epi32(q, libdivide_u32_to_m128i(more)); + } + } +} + +__m128i libdivide_u32_do_vector_alg0(__m128i numers, const struct libdivide_u32_t *denom) { + return _mm_srl_epi32(numers, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK)); +} + +__m128i libdivide_u32_do_vector_alg1(__m128i numers, const struct libdivide_u32_t *denom) { + __m128i q = libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + return _mm_srl_epi32(q, libdivide_u32_to_m128i(denom->more)); +} + +__m128i libdivide_u32_do_vector_alg2(__m128i numers, const struct libdivide_u32_t *denom) { + __m128i q = 
libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); + return _mm_srl_epi32(t, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK)); +} + +#endif + +/////////// UINT64 + +struct libdivide_u64_t libdivide_u64_gen(uint64_t d) { + struct libdivide_u64_t result; + if ((d & (d - 1)) == 0) { + result.more = libdivide__count_trailing_zeros64(d) | LIBDIVIDE_U64_SHIFT_PATH; + result.magic = 0; + } + else { + const uint32_t floor_log_2_d = 63 - libdivide__count_leading_zeros64(d); + + uint64_t proposed_m, rem; + uint8_t more; + proposed_m = libdivide_128_div_64_to_64(1ULL << floor_log_2_d, 0, d, &rem); //== (1 << (64 + floor_log_2_d)) / d + + LIBDIVIDE_ASSERT(rem > 0 && rem < d); + const uint64_t e = d - rem; + + /* This power works if e < 2**floor_log_2_d. */ + if (e < (1ULL << floor_log_2_d)) { + /* This power works */ + more = floor_log_2_d; + } + else { + /* We have to use the general 65-bit algorithm. We need to compute (2**power) / d. However, we already have (2**(power-1))/d and its remainder. By doubling both, and then correcting the remainder, we can compute the larger division. */ + proposed_m += proposed_m; //don't care about overflow here - in fact, we expect it + const uint64_t twice_rem = rem + rem; + if (twice_rem >= d || twice_rem < rem) proposed_m += 1; + more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + } + result.magic = 1 + proposed_m; + result.more = more; + //result.more's shift should in general be ceil_log_2_d. But if we used the smaller power, we subtract one from the shift because we're using the smaller power. If we're using the larger power, we subtract one from the shift because it's taken care of by the add indicator. So floor_log_2_d happens to be correct in both cases, which is why we do it outside of the if statement. 
+ } + return result; +} + +uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) { + uint8_t more = denom->more; + if (more & LIBDIVIDE_U64_SHIFT_PATH) { + return numer >> (more & LIBDIVIDE_64_SHIFT_MASK); + } + else { + uint64_t q = libdivide__mullhi_u64(denom->magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + uint64_t t = ((numer - q) >> 1) + q; + return t >> (more & LIBDIVIDE_64_SHIFT_MASK); + } + else { + return q >> more; //all upper bits are 0 - don't need to mask them off + } + } +} + + +int libdivide_u64_get_algorithm(const struct libdivide_u64_t *denom) { + uint8_t more = denom->more; + if (more & LIBDIVIDE_U64_SHIFT_PATH) return 0; + else if (! (more & LIBDIVIDE_ADD_MARKER)) return 1; + else return 2; +} + +uint64_t libdivide_u64_do_alg0(uint64_t numer, const struct libdivide_u64_t *denom) { + return numer >> (denom->more & LIBDIVIDE_64_SHIFT_MASK); +} + +uint64_t libdivide_u64_do_alg1(uint64_t numer, const struct libdivide_u64_t *denom) { + uint64_t q = libdivide__mullhi_u64(denom->magic, numer); + return q >> denom->more; +} + +uint64_t libdivide_u64_do_alg2(uint64_t numer, const struct libdivide_u64_t *denom) { + uint64_t q = libdivide__mullhi_u64(denom->magic, numer); + uint64_t t = ((numer - q) >> 1) + q; + return t >> (denom->more & LIBDIVIDE_64_SHIFT_MASK); +} + +#if LIBDIVIDE_USE_SSE2 +__m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t * denom) { + uint8_t more = denom->more; + if (more & LIBDIVIDE_U64_SHIFT_PATH) { + return _mm_srl_epi64(numers, libdivide_u32_to_m128i(more & LIBDIVIDE_64_SHIFT_MASK)); + } + else { + __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + //uint32_t t = ((numer - q) >> 1) + q; + //return t >> denom->shift; + __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); + return _mm_srl_epi64(t, libdivide_u32_to_m128i(more & LIBDIVIDE_64_SHIFT_MASK)); + } + else { + //q >> 
denom->shift + return _mm_srl_epi64(q, libdivide_u32_to_m128i(more)); + } + } +} + +__m128i libdivide_u64_do_vector_alg0(__m128i numers, const struct libdivide_u64_t *denom) { + return _mm_srl_epi64(numers, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_64_SHIFT_MASK)); +} + +__m128i libdivide_u64_do_vector_alg1(__m128i numers, const struct libdivide_u64_t *denom) { + __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic)); + return _mm_srl_epi64(q, libdivide_u32_to_m128i(denom->more)); +} + +__m128i libdivide_u64_do_vector_alg2(__m128i numers, const struct libdivide_u64_t *denom) { + __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic)); + __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); + return _mm_srl_epi64(t, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_64_SHIFT_MASK)); +} + + +#endif + +/////////// SINT32 + + +static inline int32_t libdivide__mullhi_s32(int32_t x, int32_t y) { + int64_t xl = x, yl = y; + int64_t rl = xl * yl; + return (int32_t)(rl >> 32); //needs to be arithmetic shift +} + +struct libdivide_s32_t libdivide_s32_gen(int32_t d) { + struct libdivide_s32_t result; + + /* If d is a power of 2, or negative a power of 2, we have to use a shift. This is especially important because the magic algorithm fails for -1. To check if d is a power of 2 or its inverse, it suffices to check whether its absolute value has exactly one bit set. This works even for INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set and is a power of 2. */ + uint32_t absD = (uint32_t)(d < 0 ? -d : d); //gcc optimizes this to the fast abs trick + if ((absD & (absD - 1)) == 0) { //check if exactly one bit is set, don't care if absD is 0 since that's divide by zero + result.magic = 0; + result.more = libdivide__count_trailing_zeros32(absD) | (d < 0 ? 
LIBDIVIDE_NEGATIVE_DIVISOR : 0) | LIBDIVIDE_S32_SHIFT_PATH; + } + else { + const uint32_t floor_log_2_d = 31 - libdivide__count_leading_zeros32(absD); + LIBDIVIDE_ASSERT(floor_log_2_d >= 1); + + uint8_t more; + //the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word is 0 and the high word is floor_log_2_d - 1 + uint32_t rem, proposed_m; + proposed_m = libdivide_64_div_32_to_32(1U << (floor_log_2_d - 1), 0, absD, &rem); + const uint32_t e = absD - rem; + + /* We are going to start with a power of floor_log_2_d - 1. This works if works if e < 2**floor_log_2_d. */ + if (e < (1U << floor_log_2_d)) { + /* This power works */ + more = floor_log_2_d - 1; + } + else { + /* We need to go one higher. This should not make proposed_m overflow, but it will make it negative when interpreted as an int32_t. */ + proposed_m += proposed_m; + const uint32_t twice_rem = rem + rem; + if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; + more = floor_log_2_d | LIBDIVIDE_ADD_MARKER | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0); //use the general algorithm + } + proposed_m += 1; + result.magic = (d < 0 ? 
-(int32_t)proposed_m : (int32_t)proposed_m); + result.more = more; + + } + return result; +} + +int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) { + uint8_t more = denom->more; + if (more & LIBDIVIDE_S32_SHIFT_PATH) { + uint8_t shifter = more & LIBDIVIDE_32_SHIFT_MASK; + int32_t q = numer + ((numer >> 31) & ((1 << shifter) - 1)); + q = q >> shifter; + int32_t shiftMask = (int8_t)more >> 7; //must be arithmetic shift and then sign-extend + q = (q ^ shiftMask) - shiftMask; + return q; + } + else { + int32_t q = libdivide__mullhi_s32(denom->magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + int32_t sign = (int8_t)more >> 7; //must be arithmetic shift and then sign extend + q += ((numer ^ sign) - sign); + } + q >>= more & LIBDIVIDE_32_SHIFT_MASK; + q += (q < 0); + return q; + } +} + +int libdivide_s32_get_algorithm(const struct libdivide_s32_t *denom) { + uint8_t more = denom->more; + int positiveDivisor = ! (more & LIBDIVIDE_NEGATIVE_DIVISOR); + if (more & LIBDIVIDE_S32_SHIFT_PATH) return (positiveDivisor ? 0 : 1); + else if (more & LIBDIVIDE_ADD_MARKER) return (positiveDivisor ? 
2 : 3); + else return 4; +} + +int32_t libdivide_s32_do_alg0(int32_t numer, const struct libdivide_s32_t *denom) { + uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK; + int32_t q = numer + ((numer >> 31) & ((1 << shifter) - 1)); + return q >> shifter; +} + +int32_t libdivide_s32_do_alg1(int32_t numer, const struct libdivide_s32_t *denom) { + uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK; + int32_t q = numer + ((numer >> 31) & ((1 << shifter) - 1)); + return - (q >> shifter); +} + +int32_t libdivide_s32_do_alg2(int32_t numer, const struct libdivide_s32_t *denom) { + int32_t q = libdivide__mullhi_s32(denom->magic, numer); + q += numer; + q >>= denom->more & LIBDIVIDE_32_SHIFT_MASK; + q += (q < 0); + return q; +} + +int32_t libdivide_s32_do_alg3(int32_t numer, const struct libdivide_s32_t *denom) { + int32_t q = libdivide__mullhi_s32(denom->magic, numer); + q -= numer; + q >>= denom->more & LIBDIVIDE_32_SHIFT_MASK; + q += (q < 0); + return q; +} + +int32_t libdivide_s32_do_alg4(int32_t numer, const struct libdivide_s32_t *denom) { + int32_t q = libdivide__mullhi_s32(denom->magic, numer); + q >>= denom->more & LIBDIVIDE_32_SHIFT_MASK; + q += (q < 0); + return q; +} + +#if LIBDIVIDE_USE_SSE2 +__m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t * denom) { + uint8_t more = denom->more; + if (more & LIBDIVIDE_S32_SHIFT_PATH) { + uint32_t shifter = more & LIBDIVIDE_32_SHIFT_MASK; + __m128i roundToZeroTweak = _mm_set1_epi32((1 << shifter) - 1); //could use _mm_srli_epi32 with an all -1 register + __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); //q = numer + ((numer >> 31) & roundToZeroTweak); + q = _mm_sra_epi32(q, libdivide_u32_to_m128i(shifter)); // q = q >> shifter + __m128i shiftMask = _mm_set1_epi32((int32_t)((int8_t)more >> 7)); //set all bits of shift mask = to the sign bit of more + q = _mm_sub_epi32(_mm_xor_si128(q, shiftMask), shiftMask); //q = (q ^ shiftMask) - shiftMask; + 
return q; + } + else { + __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + __m128i sign = _mm_set1_epi32((int32_t)(int8_t)more >> 7); //must be arithmetic shift + q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign)); // q += ((numer ^ sign) - sign); + } + q = _mm_sra_epi32(q, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK)); //q >>= shift + q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0) + return q; + } +} + +__m128i libdivide_s32_do_vector_alg0(__m128i numers, const struct libdivide_s32_t *denom) { + uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK; + __m128i roundToZeroTweak = _mm_set1_epi32((1 << shifter) - 1); + __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); + return _mm_sra_epi32(q, libdivide_u32_to_m128i(shifter)); +} + +__m128i libdivide_s32_do_vector_alg1(__m128i numers, const struct libdivide_s32_t *denom) { + uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK; + __m128i roundToZeroTweak = _mm_set1_epi32((1 << shifter) - 1); + __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); + return _mm_sub_epi32(_mm_setzero_si128(), _mm_sra_epi32(q, libdivide_u32_to_m128i(shifter))); +} + +__m128i libdivide_s32_do_vector_alg2(__m128i numers, const struct libdivide_s32_t *denom) { + __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + q = _mm_add_epi32(q, numers); + q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK)); + q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); + return q; +} + +__m128i libdivide_s32_do_vector_alg3(__m128i numers, const struct libdivide_s32_t *denom) { + __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + q = _mm_sub_epi32(q, numers); + q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK)); + q = 
_mm_add_epi32(q, _mm_srli_epi32(q, 31)); + return q; +} + +__m128i libdivide_s32_do_vector_alg4(__m128i numers, const struct libdivide_s32_t *denom) { + __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more)); //q >>= shift + q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0) + return q; +} +#endif + +///////////// SINT64 + + +struct libdivide_s64_t libdivide_s64_gen(int64_t d) { + struct libdivide_s64_t result; + + /* If d is a power of 2, or negative a power of 2, we have to use a shift. This is especially important because the magic algorithm fails for -1. To check if d is a power of 2 or its inverse, it suffices to check whether its absolute value has exactly one bit set. This works even for INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set and is a power of 2. */ + const uint64_t absD = (uint64_t)(d < 0 ? -d : d); //gcc optimizes this to the fast abs trick + if ((absD & (absD - 1)) == 0) { //check if exactly one bit is set, don't care if absD is 0 since that's divide by zero + result.more = libdivide__count_trailing_zeros64(absD) | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0); + result.magic = 0; + } + else { + const uint32_t floor_log_2_d = 63 - libdivide__count_leading_zeros64(absD); + + //the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word is 0 and the high word is floor_log_2_d - 1 + uint8_t more; + uint64_t rem, proposed_m; + proposed_m = libdivide_128_div_64_to_64(1ULL << (floor_log_2_d - 1), 0, absD, &rem); + const uint64_t e = absD - rem; + + /* We are going to start with a power of floor_log_2_d - 1. This works if works if e < 2**floor_log_2_d. */ + if (e < (1ULL << floor_log_2_d)) { + /* This power works */ + more = floor_log_2_d - 1; + } + else { + /* We need to go one higher. This should not make proposed_m overflow, but it will make it negative when interpreted as an int32_t. 
*/ + proposed_m += proposed_m; + const uint64_t twice_rem = rem + rem; + if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; + more = floor_log_2_d | LIBDIVIDE_ADD_MARKER | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0); + } + proposed_m += 1; + result.more = more; + result.magic = (d < 0 ? -(int64_t)proposed_m : (int64_t)proposed_m); + } + return result; +} + +int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) { + uint8_t more = denom->more; + int64_t magic = denom->magic; + if (magic == 0) { //shift path + uint32_t shifter = more & LIBDIVIDE_64_SHIFT_MASK; + int64_t q = numer + ((numer >> 63) & ((1LL << shifter) - 1)); + q = q >> shifter; + int64_t shiftMask = (int8_t)more >> 7; //must be arithmetic shift and then sign-extend + q = (q ^ shiftMask) - shiftMask; + return q; + } + else { + int64_t q = libdivide__mullhi_s64(magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + int64_t sign = (int8_t)more >> 7; //must be arithmetic shift and then sign extend + q += ((numer ^ sign) - sign); + } + q >>= more & LIBDIVIDE_64_SHIFT_MASK; + q += (q < 0); + return q; + } +} + + +int libdivide_s64_get_algorithm(const struct libdivide_s64_t *denom) { + uint8_t more = denom->more; + int positiveDivisor = ! (more & LIBDIVIDE_NEGATIVE_DIVISOR); + if (denom->magic == 0) return (positiveDivisor ? 0 : 1); //shift path + else if (more & LIBDIVIDE_ADD_MARKER) return (positiveDivisor ? 
2 : 3); + else return 4; +} + +int64_t libdivide_s64_do_alg0(int64_t numer, const struct libdivide_s64_t *denom) { + uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK; + int64_t q = numer + ((numer >> 63) & ((1LL << shifter) - 1)); + return q >> shifter; +} + +int64_t libdivide_s64_do_alg1(int64_t numer, const struct libdivide_s64_t *denom) { + //denom->shifter != -1 && demo->shiftMask != 0 + uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK; + int64_t q = numer + ((numer >> 63) & ((1LL << shifter) - 1)); + return - (q >> shifter); +} + +int64_t libdivide_s64_do_alg2(int64_t numer, const struct libdivide_s64_t *denom) { + int64_t q = libdivide__mullhi_s64(denom->magic, numer); + q += numer; + q >>= denom->more & LIBDIVIDE_64_SHIFT_MASK; + q += (q < 0); + return q; +} + +int64_t libdivide_s64_do_alg3(int64_t numer, const struct libdivide_s64_t *denom) { + int64_t q = libdivide__mullhi_s64(denom->magic, numer); + q -= numer; + q >>= denom->more & LIBDIVIDE_64_SHIFT_MASK; + q += (q < 0); + return q; +} + +int64_t libdivide_s64_do_alg4(int64_t numer, const struct libdivide_s64_t *denom) { + int64_t q = libdivide__mullhi_s64(denom->magic, numer); + q >>= denom->more; + q += (q < 0); + return q; +} + + +#if LIBDIVIDE_USE_SSE2 +__m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t * denom) { + uint8_t more = denom->more; + int64_t magic = denom->magic; + if (magic == 0) { //shift path + uint32_t shifter = more & LIBDIVIDE_64_SHIFT_MASK; + __m128i roundToZeroTweak = libdivide__u64_to_m128((1LL << shifter) - 1); + __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak)); //q = numer + ((numer >> 63) & roundToZeroTweak); + q = libdivide_s64_shift_right_vector(q, shifter); // q = q >> shifter + __m128i shiftMask = _mm_set1_epi32((int32_t)((int8_t)more >> 7)); + q = _mm_sub_epi64(_mm_xor_si128(q, shiftMask), shiftMask); //q = (q ^ shiftMask) - shiftMask; + return q; + } + else { + __m128i q = 
libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + __m128i sign = _mm_set1_epi32((int32_t)((int8_t)more >> 7)); //must be arithmetic shift + q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign)); // q += ((numer ^ sign) - sign); + } + q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK); //q >>= denom->mult_path.shift + q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0) + return q; + } +} + +__m128i libdivide_s64_do_vector_alg0(__m128i numers, const struct libdivide_s64_t *denom) { + uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK; + __m128i roundToZeroTweak = libdivide__u64_to_m128((1LL << shifter) - 1); + __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak)); + q = libdivide_s64_shift_right_vector(q, shifter); + return q; +} + +__m128i libdivide_s64_do_vector_alg1(__m128i numers, const struct libdivide_s64_t *denom) { + uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK; + __m128i roundToZeroTweak = libdivide__u64_to_m128((1LL << shifter) - 1); + __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak)); + q = libdivide_s64_shift_right_vector(q, shifter); + return _mm_sub_epi64(_mm_setzero_si128(), q); +} + +__m128i libdivide_s64_do_vector_alg2(__m128i numers, const struct libdivide_s64_t *denom) { + __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(denom->magic)); + q = _mm_add_epi64(q, numers); + q = libdivide_s64_shift_right_vector(q, denom->more & LIBDIVIDE_64_SHIFT_MASK); + q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0) + return q; +} + +__m128i libdivide_s64_do_vector_alg3(__m128i numers, const struct libdivide_s64_t *denom) { + __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(denom->magic)); + q = _mm_sub_epi64(q, numers); + q = libdivide_s64_shift_right_vector(q, denom->more & 
LIBDIVIDE_64_SHIFT_MASK); + q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0) + return q; +} + +__m128i libdivide_s64_do_vector_alg4(__m128i numers, const struct libdivide_s64_t *denom) { + __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(denom->magic)); + q = libdivide_s64_shift_right_vector(q, denom->more); + q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); + return q; +} + +#endif + +/////////// C++ stuff + +#ifdef __cplusplus + +/* The C++ template design here is a total mess. This needs to be fixed by someone better at templates than I. The current design is: + +- The base is a template divider_base that takes the integer type, the libdivide struct, a generating function, a get algorithm function, a do function, and either a do vector function or a dummy int. +- The base has storage for the libdivide struct. This is the only storage (so the C++ class should be no larger than the libdivide struct). + +- Above that, there's divider_mid. This is an empty struct by default, but it is specialized against our four int types. divider_mid contains a template struct algo, that contains a typedef for a specialization of divider_base. struct algo is specialized to take an "algorithm number," where -1 means to use the general algorithm. + +- Publicly we have class divider, which inherits from divider_mid::algo. This also take an algorithm number, which defaults to -1 (the general algorithm). +- divider has a operator / which allows you to use a divider as the divisor in a quotient expression. + +*/ + +namespace libdivide_internal { + +#if LIBDIVIDE_USE_SSE2 +#define MAYBE_VECTOR(x) x +#define MAYBE_VECTOR_PARAM __m128i vector_func(__m128i, const DenomType *) +#else +#define MAYBE_VECTOR(x) 0 +#define MAYBE_VECTOR_PARAM int vector_func +#endif + + /* Some bogus unswitch functions for unsigned types so the same (presumably templated) code can work for both signed and unsigned. 
*/ + uint32_t crash_u32(uint32_t, const libdivide_u32_t *) { abort(); return *(uint32_t *)NULL; } + uint64_t crash_u64(uint64_t, const libdivide_u64_t *) { abort(); return *(uint64_t *)NULL; } +#if LIBDIVIDE_USE_SSE2 + __m128i crash_u32_vector(__m128i, const libdivide_u32_t *) { abort(); return *(__m128i *)NULL; } + __m128i crash_u64_vector(__m128i, const libdivide_u64_t *) { abort(); return *(__m128i *)NULL; } +#endif + + template + class divider_base { + public: + DenomType denom; + divider_base(IntType d) : denom(gen_func(d)) { } + divider_base(const DenomType & d) : denom(d) { } + + IntType perform_divide(IntType val) const { return do_func(val, &denom); } +#if LIBDIVIDE_USE_SSE2 + __m128i perform_divide_vector(__m128i val) const { return vector_func(val, &denom); } +#endif + + int get_algorithm() const { return get_algo(&denom); } + }; + + + template struct divider_mid { }; + + template<> struct divider_mid { + typedef uint32_t IntType; + typedef struct libdivide_u32_t DenomType; + template struct denom { + typedef divider_base divider; + }; + + template struct algo { }; + template struct algo<-1, J> { typedef denom::divider divider; }; + template struct algo<0, J> { typedef denom::divider divider; }; + template struct algo<1, J> { typedef denom::divider divider; }; + template struct algo<2, J> { typedef denom::divider divider; }; + + /* Define two more bogus ones so that the same (templated, presumably) code can handle both signed and unsigned */ + template struct algo<3, J> { typedef denom::divider divider; }; + template struct algo<4, J> { typedef denom::divider divider; }; + + }; + + template<> struct divider_mid { + typedef int32_t IntType; + typedef struct libdivide_s32_t DenomType; + template struct denom { + typedef divider_base divider; + }; + + + template struct algo { }; + template struct algo<-1, J> { typedef denom::divider divider; }; + template struct algo<0, J> { typedef denom::divider divider; }; + template struct algo<1, J> { typedef 
denom::divider divider; }; + template struct algo<2, J> { typedef denom::divider divider; }; + template struct algo<3, J> { typedef denom::divider divider; }; + template struct algo<4, J> { typedef denom::divider divider; }; + + }; + + template<> struct divider_mid { + typedef uint64_t IntType; + typedef struct libdivide_u64_t DenomType; + template struct denom { + typedef divider_base divider; + }; + + template struct algo { }; + template struct algo<-1, J> { typedef denom::divider divider; }; + template struct algo<0, J> { typedef denom::divider divider; }; + template struct algo<1, J> { typedef denom::divider divider; }; + template struct algo<2, J> { typedef denom::divider divider; }; + + /* Define two more bogus ones so that the same (templated, presumably) code can handle both signed and unsigned */ + template struct algo<3, J> { typedef denom::divider divider; }; + template struct algo<4, J> { typedef denom::divider divider; }; + + + }; + + template<> struct divider_mid { + typedef int64_t IntType; + typedef struct libdivide_s64_t DenomType; + template struct denom { + typedef divider_base divider; + }; + + template struct algo { }; + template struct algo<-1, J> { typedef denom::divider divider; }; + template struct algo<0, J> { typedef denom::divider divider; }; + template struct algo<1, J> { typedef denom::divider divider; }; + template struct algo<2, J> { typedef denom::divider divider; }; + template struct algo<3, J> { typedef denom::divider divider; }; + template struct algo<4, J> { typedef denom::divider divider; }; + }; + +} + +template +class divider +{ + private: + typename libdivide_internal::divider_mid::template algo::divider sub; + template friend divider unswitch(const divider & d); + divider(const typename libdivide_internal::divider_mid::DenomType & denom) : sub(denom) { } + + public: + + /* Ordinary constructor, that takes the divisor as a parameter. 
*/ + divider(T n) : sub(n) { } + + /* Default constructor, that divides by 1 */ + divider() : sub(1) { } + + /* Divides the parameter by the divisor, returning the quotient */ + T perform_divide(T val) const { return sub.perform_divide(val); } + +#if LIBDIVIDE_USE_SSE2 + /* Treats the vector as either two or four packed values (depending on the size), and divides each of them by the divisor, returning the packed quotients. */ + __m128i perform_divide_vector(__m128i val) const { return sub.perform_divide_vector(val); } +#endif + + /* Returns the index of algorithm, for use in the unswitch function */ + int get_algorithm() const { return sub.get_algorithm(); } // returns the algorithm for unswitching + + /* operator== */ + bool operator==(const divider & him) const { return sub.denom.magic == him.sub.denom.magic && sub.denom.more == him.sub.denom.more; } + + bool operator!=(const divider & him) const { return ! (*this == him); } +}; + +/* Returns a divider specialized for the given algorithm. */ +template +divider unswitch(const divider & d) { return divider(d.sub.denom); } + +/* Overload of the / operator for scalar division. */ +template +int_type operator/(int_type numer, const divider & denom) { + return denom.perform_divide(numer); +} + +#if LIBDIVIDE_USE_SSE2 +/* Overload of the / operator for vector division. */ +template +__m128i operator/(__m128i numer, const divider & denom) { + return denom.perform_divide_vector(numer); +} +#endif + + +#endif //__cplusplus + +#endif //LIBDIVIDE_HEADER_ONLY +#ifdef __cplusplus +} //close namespace libdivide +} //close anonymous namespace +#endif From 435394242ca5b08d179be16a6bfe966a8d243764 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 17 Aug 2014 06:17:23 +0400 Subject: [PATCH 080/127] dbms: improved performance of filtering [#METR-2944]. 
--- dbms/src/DataStreams/FilterBlockInputStream.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/dbms/src/DataStreams/FilterBlockInputStream.cpp b/dbms/src/DataStreams/FilterBlockInputStream.cpp index 3a36d42b710..75de7654b96 100644 --- a/dbms/src/DataStreams/FilterBlockInputStream.cpp +++ b/dbms/src/DataStreams/FilterBlockInputStream.cpp @@ -88,6 +88,15 @@ Block FilterBlockInputStream::readImpl() if (filtered_rows == 0) continue; + /// Если через фильтр проходят все строчки. + if (filtered_rows == filter.size()) + { + /// Заменим столбец с фильтром на константу. + res.getByPosition(filter_column).column = new ColumnConstUInt8(filtered_rows, 1); + /// Остальные столбцы трогать не нужно. + return res; + } + /// Фильтруем остальные столбцы. for (size_t i = 0; i < columns; ++i) { From 47a810e04fc654527e1456565163c5e1d4347bd6 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 17 Aug 2014 07:29:56 +0400 Subject: [PATCH 081/127] dbms: FunctionsComparison: improvement [#METR-2944]. 
--- .../DB/Functions/FunctionsComparison.h | 1014 ++--------------- 1 file changed, 123 insertions(+), 891 deletions(-) diff --git a/dbms/include/DB/Functions/FunctionsComparison.h b/dbms/include/DB/Functions/FunctionsComparison.h index 52fac72f64c..f91e1e966eb 100644 --- a/dbms/include/DB/Functions/FunctionsComparison.h +++ b/dbms/include/DB/Functions/FunctionsComparison.h @@ -34,37 +34,50 @@ namespace DB #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wsign-compare" -template -struct EqualsNumImpl +template struct EqualsOp { static UInt8 apply(A a, B b) { return a == b; } }; +template struct NotEqualsOp { static UInt8 apply(A a, B b) { return a != b; } }; +template struct LessOp { static UInt8 apply(A a, B b) { return a < b; } }; +template struct GreaterOp { static UInt8 apply(A a, B b) { return a > b; } }; +template struct LessOrEqualsOp { static UInt8 apply(A a, B b) { return a <= b; } }; +template struct GreaterOrEqualsOp { static UInt8 apply(A a, B b) { return a >= b; } }; + +#pragma GCC diagnostic pop + + + +template +struct NumComparisonImpl { static void vector_vector(const PODArray & a, const PODArray & b, PODArray & c) { size_t size = a.size(); for (size_t i = 0; i < size; ++i) - c[i] = a[i] == b[i]; + c[i] = Op::apply(a[i], b[i]); } static void vector_constant(const PODArray & a, B b, PODArray & c) { size_t size = a.size(); for (size_t i = 0; i < size; ++i) - c[i] = a[i] == b; + c[i] = Op::apply(a[i], b); } static void constant_vector(A a, const PODArray & b, PODArray & c) { size_t size = b.size(); for (size_t i = 0; i < size; ++i) - c[i] = a == b[i]; + c[i] = Op::apply(a, b[i]); } static void constant_constant(A a, B b, UInt8 & c) { - c = a == b; + c = Op::apply(a, b); } }; -struct EqualsStringImpl + +template +struct StringComparisonImpl { static void string_vector_string_vector( const ColumnString::Chars_t & a_data, const ColumnString::Offsets_t & a_offsets, @@ -73,10 +86,18 @@ struct EqualsStringImpl { size_t size = a_offsets.size(); 
for (size_t i = 0; i < size; ++i) - c[i] = (i == 0) - ? (a_offsets[0] == b_offsets[0] && !memcmp(&a_data[0], &b_data[0], a_offsets[0] - 1)) - : (a_offsets[i] - a_offsets[i - 1] == b_offsets[i] - b_offsets[i - 1] - && !memcmp(&a_data[a_offsets[i - 1]], &b_data[b_offsets[i - 1]], a_offsets[i] - a_offsets[i - 1] - 1)); + { + if (i == 0) + { + /// Завершающий ноль в меньшей по длине строке входит в сравнение. + c[i] = Op::apply(memcmp(&a_data[0], &b_data[0], std::min(a_offsets[0], b_offsets[0])), 0); + } + else + { + c[i] = Op::apply(memcmp(&a_data[a_offsets[i - 1]], &b_data[b_offsets[i - 1]], + std::min(a_offsets[i] - a_offsets[i - 1], b_offsets[i] - b_offsets[i - 1])), 0); + } + } } static void string_vector_fixed_string_vector( @@ -86,10 +107,19 @@ struct EqualsStringImpl { size_t size = a_offsets.size(); for (size_t i = 0; i < size; ++i) - c[i] = (i == 0) - ? (a_offsets[0] == b_n + 1 && !memcmp(&a_data[0], &b_data[0], b_n)) - : (a_offsets[i] - a_offsets[i - 1] == b_n + 1 - && !memcmp(&a_data[a_offsets[i - 1]], &b_data[b_n * i], b_n)); + { + if (i == 0) + { + int res = memcmp(&a_data[0], &b_data[0], std::min(a_offsets[0] - 1, b_n)); + c[i] = Op::apply(res, 0) || (res == 0 && Op::apply(a_offsets[0], b_n + 1)); + } + else + { + int res = memcmp(&a_data[a_offsets[i - 1]], &b_data[i * b_n], + std::min(a_offsets[i] - a_offsets[i - 1] - 1, b_n)); + c[i] = Op::apply(res, 0) || (res == 0 && Op::apply(a_offsets[i] - a_offsets[i - 1], b_n + 1)); + } + } } static void string_vector_constant( @@ -101,10 +131,17 @@ struct EqualsStringImpl ColumnString::Offset_t b_n = b.size(); const UInt8 * b_data = reinterpret_cast(b.data()); for (size_t i = 0; i < size; ++i) - c[i] = (i == 0) - ? 
(a_offsets[0] == b_n + 1 && !memcmp(&a_data[0], b_data, b_n)) - : (a_offsets[i] - a_offsets[i - 1] == b_n + 1 - && !memcmp(&a_data[a_offsets[i - 1]], b_data, b_n)); + { + if (i == 0) + { + c[i] = Op::apply(memcmp(&a_data[0], b_data, std::min(a_offsets[0], b_n + 1)), 0); + } + else + { + c[i] = Op::apply(memcmp(&a_data[a_offsets[i - 1]], b_data, + std::min(a_offsets[i] - a_offsets[i - 1], b_n + 1)), 0); + } + } } static void fixed_string_vector_string_vector( @@ -114,10 +151,19 @@ struct EqualsStringImpl { size_t size = b_offsets.size(); for (size_t i = 0; i < size; ++i) - c[i] = (i == 0) - ? (b_offsets[0] == a_n + 1 && !memcmp(&b_data[0], &a_data[0], a_n)) - : (b_offsets[i] - b_offsets[i - 1] == a_n + 1 - && !memcmp(&b_data[b_offsets[i - 1]], &a_data[a_n * i], a_n)); + { + if (i == 0) + { + int res = memcmp(&a_data[0], &b_data[0], std::min(b_offsets[0] - 1, a_n)); + c[i] = Op::apply(res, 0) || (res == 0 && Op::apply(a_n + 1, b_offsets[0])); + } + else + { + int res = memcmp(&a_data[i * a_n], &b_data[b_offsets[i - 1]], + std::min(b_offsets[i] - b_offsets[i - 1] - 1, a_n)); + c[i] = Op::apply(res, 0) || (res == 0 && Op::apply(a_n + 1, b_offsets[i] - b_offsets[i - 1])); + } + } } static void fixed_string_vector_fixed_string_vector( @@ -127,7 +173,10 @@ struct EqualsStringImpl { size_t size = a_data.size(); for (size_t i = 0, j = 0; i < size; i += a_n, ++j) - c[j] = a_n == b_n && !memcmp(&a_data[i], &b_data[i], a_n); + { + int res = memcmp(&a_data[i], &b_data[i], std::min(a_n, b_n)); + c[j] = Op::apply(res, 0) || (res == 0 && Op::apply(a_n, b_n)); + } } static void fixed_string_vector_constant( @@ -139,7 +188,10 @@ struct EqualsStringImpl const UInt8 * b_data = reinterpret_cast(b.data()); ColumnString::Offset_t b_n = b.size(); for (size_t i = 0, j = 0; i < size; i += a_n, ++j) - c[j] = a_n == b_n && !memcmp(&a_data[i], b_data, a_n); + { + int res = memcmp(&a_data[i], b_data, std::min(a_n, b_n)); + c[j] = Op::apply(res, 0) || (res == 0 && Op::apply(a_n, b_n)); + } } 
static void constant_string_vector( @@ -151,10 +203,17 @@ struct EqualsStringImpl ColumnString::Offset_t a_n = a.size(); const UInt8 * a_data = reinterpret_cast(a.data()); for (size_t i = 0; i < size; ++i) - c[i] = (i == 0) - ? (b_offsets[0] == a_n + 1 && !memcmp(&b_data[0], a_data, a_n)) - : (b_offsets[i] - b_offsets[i - 1] == a_n + 1 - && !memcmp(&b_data[b_offsets[i - 1]], a_data, a_n)); + { + if (i == 0) + { + c[i] = Op::apply(memcmp(a_data, &b_data[0], std::min(b_offsets[0], a_n + 1)), 0); + } + else + { + c[i] = Op::apply(memcmp(a_data, &b_data[b_offsets[i - 1]], + std::min(b_offsets[i] - b_offsets[i - 1], a_n + 1)), 0); + } + } } static void constant_fixed_string_vector( @@ -166,7 +225,10 @@ struct EqualsStringImpl const UInt8 * a_data = reinterpret_cast(a.data()); ColumnString::Offset_t a_n = a.size(); for (size_t i = 0, j = 0; i < size; i += b_n, ++j) - c[j] = a_n == b_n && !memcmp(&b_data[i], a_data, b_n); + { + int res = memcmp(a_data, &b_data[i], std::min(a_n, b_n)); + c[j] = Op::apply(res, 0) || (res == 0 && Op::apply(b_n, a_n)); + } } static void constant_constant( @@ -174,41 +236,14 @@ struct EqualsStringImpl const std::string & b, UInt8 & c) { - c = a == b; + c = Op::apply(memcmp(a.data(), b.data(), std::min(a.size(), b.size()) + 1), 0); } }; -template -struct NotEqualsNumImpl -{ - static void vector_vector(const PODArray & a, const PODArray & b, PODArray & c) - { - size_t size = a.size(); - for (size_t i = 0; i < size; ++i) - c[i] = a[i] != b[i]; - } - static void vector_constant(const PODArray & a, B b, PODArray & c) - { - size_t size = a.size(); - for (size_t i = 0; i < size; ++i) - c[i] = a[i] != b; - } - - static void constant_vector(A a, const PODArray & b, PODArray & c) - { - size_t size = b.size(); - for (size_t i = 0; i < size; ++i) - c[i] = a != b[i]; - } - - static void constant_constant(A a, B b, UInt8 & c) - { - c = a != b; - } -}; - -struct NotEqualsStringImpl +/// Сравнения на равенство/неравенство реализованы несколько более 
эффективно. +template +struct StringEqualsImpl { static void string_vector_string_vector( const ColumnString::Chars_t & a_data, const ColumnString::Offsets_t & a_offsets, @@ -217,7 +252,7 @@ struct NotEqualsStringImpl { size_t size = a_offsets.size(); for (size_t i = 0; i < size; ++i) - c[i] = !((i == 0) + c[i] = positive == ((i == 0) ? (a_offsets[0] == b_offsets[0] && !memcmp(&a_data[0], &b_data[0], a_offsets[0] - 1)) : (a_offsets[i] - a_offsets[i - 1] == b_offsets[i] - b_offsets[i - 1] && !memcmp(&a_data[a_offsets[i - 1]], &b_data[b_offsets[i - 1]], a_offsets[i] - a_offsets[i - 1] - 1))); @@ -230,7 +265,7 @@ struct NotEqualsStringImpl { size_t size = a_offsets.size(); for (size_t i = 0; i < size; ++i) - c[i] = !((i == 0) + c[i] = positive == ((i == 0) ? (a_offsets[0] == b_n + 1 && !memcmp(&a_data[0], &b_data[0], b_n)) : (a_offsets[i] - a_offsets[i - 1] == b_n + 1 && !memcmp(&a_data[a_offsets[i - 1]], &b_data[b_n * i], b_n))); @@ -245,25 +280,12 @@ struct NotEqualsStringImpl ColumnString::Offset_t b_n = b.size(); const UInt8 * b_data = reinterpret_cast(b.data()); for (size_t i = 0; i < size; ++i) - c[i] = !((i == 0) + c[i] = positive == ((i == 0) ? (a_offsets[0] == b_n + 1 && !memcmp(&a_data[0], b_data, b_n)) : (a_offsets[i] - a_offsets[i - 1] == b_n + 1 && !memcmp(&a_data[a_offsets[i - 1]], b_data, b_n))); } - static void fixed_string_vector_string_vector( - const ColumnString::Chars_t & a_data, ColumnString::Offset_t a_n, - const ColumnString::Chars_t & b_data, const ColumnString::Offsets_t & b_offsets, - PODArray & c) - { - size_t size = b_offsets.size(); - for (size_t i = 0; i < size; ++i) - c[i] = !((i == 0) - ? 
(b_offsets[0] == a_n + 1 && !memcmp(&b_data[0], &a_data[0], a_n)) - : (b_offsets[i] - b_offsets[i - 1] == a_n + 1 - && !memcmp(&b_data[b_offsets[i - 1]], &a_data[a_n * i], a_n))); - } - static void fixed_string_vector_fixed_string_vector( const ColumnString::Chars_t & a_data, ColumnString::Offset_t a_n, const ColumnString::Chars_t & b_data, ColumnString::Offset_t b_n, @@ -271,7 +293,7 @@ struct NotEqualsStringImpl { size_t size = a_data.size(); for (size_t i = 0, j = 0; i < size; i += a_n, ++j) - c[j] = !(a_n == b_n && !memcmp(&a_data[i], &b_data[i], a_n)); + c[j] = positive == (a_n == b_n && !memcmp(&a_data[i], &b_data[i], a_n)); } static void fixed_string_vector_constant( @@ -283,34 +305,7 @@ struct NotEqualsStringImpl const UInt8 * b_data = reinterpret_cast(b.data()); ColumnString::Offset_t b_n = b.size(); for (size_t i = 0, j = 0; i < size; i += a_n, ++j) - c[j] = !(a_n == b_n && !memcmp(&a_data[i], b_data, a_n)); - } - - static void constant_string_vector( - const std::string & a, - const ColumnString::Chars_t & b_data, const ColumnString::Offsets_t & b_offsets, - PODArray & c) - { - size_t size = b_offsets.size(); - ColumnString::Offset_t a_n = a.size(); - const UInt8 * a_data = reinterpret_cast(a.data()); - for (size_t i = 0; i < size; ++i) - c[i] = !((i == 0) - ? 
(b_offsets[0] == a_n + 1 && !memcmp(&b_data[0], a_data, a_n)) - : (b_offsets[i] - b_offsets[i - 1] == a_n + 1 - && !memcmp(&b_data[b_offsets[i - 1]], a_data, a_n))); - } - - static void constant_fixed_string_vector( - const std::string & a, - const ColumnString::Chars_t & b_data, ColumnString::Offset_t b_n, - PODArray & c) - { - size_t size = b_data.size(); - const UInt8 * a_data = reinterpret_cast(a.data()); - ColumnString::Offset_t a_n = a.size(); - for (size_t i = 0, j = 0; i < size; i += b_n, ++j) - c[j] = !(a_n == b_n && !memcmp(&b_data[i], a_data, b_n)); + c[j] = positive == (a_n == b_n && !memcmp(&a_data[i], b_data, a_n)); } static void constant_constant( @@ -318,108 +313,7 @@ struct NotEqualsStringImpl const std::string & b, UInt8 & c) { - c = !(a == b); - } -}; - -template -struct LessNumImpl -{ - static void vector_vector(const PODArray & a, const PODArray & b, PODArray & c) - { - size_t size = a.size(); - for (size_t i = 0; i < size; ++i) - c[i] = a[i] < b[i]; - } - - static void vector_constant(const PODArray & a, B b, PODArray & c) - { - size_t size = a.size(); - for (size_t i = 0; i < size; ++i) - c[i] = a[i] < b; - } - - static void constant_vector(A a, const PODArray & b, PODArray & c) - { - size_t size = b.size(); - for (size_t i = 0; i < size; ++i) - c[i] = a < b[i]; - } - - static void constant_constant(A a, B b, UInt8 & c) - { - c = a < b; - } -}; - -struct LessStringImpl -{ - static void string_vector_string_vector( - const ColumnString::Chars_t & a_data, const ColumnString::Offsets_t & a_offsets, - const ColumnString::Chars_t & b_data, const ColumnString::Offsets_t & b_offsets, - PODArray & c) - { - size_t size = a_offsets.size(); - for (size_t i = 0; i < size; ++i) - { - if (i == 0) - { - int res = memcmp(&a_data[0], &b_data[0], std::min(a_offsets[0], b_offsets[0]) - 1); - c[i] = res < 0 || (res == 0 && a_offsets[0] < b_offsets[0]); - } - else - { - int res = memcmp(&a_data[a_offsets[i - 1]], &b_data[b_offsets[i - 1]], - std::min(a_offsets[i] 
- a_offsets[i - 1], b_offsets[i] - b_offsets[i - 1]) - 1); - c[i] = res < 0 || (res == 0 && a_offsets[i] - a_offsets[i - 1] < b_offsets[i] - b_offsets[i - 1]); - } - } - } - - static void string_vector_fixed_string_vector( - const ColumnString::Chars_t & a_data, const ColumnString::Offsets_t & a_offsets, - const ColumnString::Chars_t & b_data, ColumnString::Offset_t b_n, - PODArray & c) - { - size_t size = a_offsets.size(); - for (size_t i = 0; i < size; ++i) - { - if (i == 0) - { - int res = memcmp(&a_data[0], &b_data[0], std::min(a_offsets[0] - 1, b_n)); - c[i] = res < 0 || (res == 0 && a_offsets[0] < b_n + 1); - } - else - { - int res = memcmp(&a_data[a_offsets[i - 1]], &b_data[i * b_n], - std::min(a_offsets[i] - a_offsets[i - 1] - 1, b_n)); - c[i] = res < 0 || (res == 0 && a_offsets[i] - a_offsets[i - 1] < b_n + 1); - } - } - } - - static void string_vector_constant( - const ColumnString::Chars_t & a_data, const ColumnString::Offsets_t & a_offsets, - const std::string & b, - PODArray & c) - { - size_t size = a_offsets.size(); - ColumnString::Offset_t b_n = b.size(); - const UInt8 * b_data = reinterpret_cast(b.data()); - for (size_t i = 0; i < size; ++i) - { - if (i == 0) - { - int res = memcmp(&a_data[0], b_data, std::min(a_offsets[0] - 1, b_n)); - c[i] = res < 0 || (res == 0 && a_offsets[0] < b_n + 1); - } - else - { - int res = memcmp(&a_data[a_offsets[i - 1]], b_data, - std::min(a_offsets[i] - a_offsets[i - 1] - 1, b_n)); - c[i] = res < 0 || (res == 0 && a_offsets[i] - a_offsets[i - 1] < b_n + 1); - } - } + c = positive == (a == b); } static void fixed_string_vector_string_vector( @@ -427,49 +321,7 @@ struct LessStringImpl const ColumnString::Chars_t & b_data, const ColumnString::Offsets_t & b_offsets, PODArray & c) { - size_t size = b_offsets.size(); - for (size_t i = 0; i < size; ++i) - { - if (i == 0) - { - int res = memcmp(&a_data[0], &b_data[0], std::min(b_offsets[0] - 1, a_n)); - c[i] = res < 0 || (res == 0 && a_n + 1 < b_offsets[0]); - } - else - { - 
int res = memcmp(&a_data[i * a_n], &b_data[b_offsets[i - 1]], - std::min(b_offsets[i] - b_offsets[i - 1] - 1, a_n)); - c[i] = res < 0 || (res == 0 && a_n + 1 < b_offsets[i] - b_offsets[i - 1]); - } - } - } - - static void fixed_string_vector_fixed_string_vector( - const ColumnString::Chars_t & a_data, ColumnString::Offset_t a_n, - const ColumnString::Chars_t & b_data, ColumnString::Offset_t b_n, - PODArray & c) - { - size_t size = a_data.size(); - for (size_t i = 0, j = 0; i < size; i += a_n, ++j) - { - int res = memcmp(&a_data[i], &b_data[i], std::min(a_n, b_n)); - c[j] = res < 0 || (res == 0 && a_n < b_n); - } - } - - static void fixed_string_vector_constant( - const ColumnString::Chars_t & a_data, ColumnString::Offset_t a_n, - const std::string & b, - PODArray & c) - { - size_t size = a_data.size(); - const UInt8 * b_data = reinterpret_cast(b.data()); - ColumnString::Offset_t b_n = b.size(); - for (size_t i = 0, j = 0; i < size; i += a_n, ++j) - { - int res = memcmp(&a_data[i], b_data, std::min(a_n, b_n)); - c[j] = res < 0 || (res == 0 && a_n < b_n); - } + string_vector_fixed_string_vector(b_data, b_offsets, a_data, a_n, c); } static void constant_string_vector( @@ -477,23 +329,7 @@ struct LessStringImpl const ColumnString::Chars_t & b_data, const ColumnString::Offsets_t & b_offsets, PODArray & c) { - size_t size = b_offsets.size(); - ColumnString::Offset_t a_n = a.size(); - const UInt8 * a_data = reinterpret_cast(a.data()); - for (size_t i = 0; i < size; ++i) - { - if (i == 0) - { - int res = memcmp(a_data, &b_data[0], std::min(b_offsets[0] - 1, a_n)); - c[i] = res < 0 || (res == 0 && a_n + 1 < b_offsets[0]); - } - else - { - int res = memcmp(a_data, &b_data[b_offsets[i - 1]], - std::min(b_offsets[i] - b_offsets[i - 1] - 1, a_n)); - c[i] = res < 0 || (res == 0 && a_n + 1 < b_offsets[i] - b_offsets[i - 1]); - } - } + string_vector_constant(b_data, b_offsets, a, c); } static void constant_fixed_string_vector( @@ -501,626 +337,20 @@ struct LessStringImpl const 
ColumnString::Chars_t & b_data, ColumnString::Offset_t b_n, PODArray & c) { - size_t size = b_data.size(); - const UInt8 * a_data = reinterpret_cast(a.data()); - ColumnString::Offset_t a_n = a.size(); - for (size_t i = 0, j = 0; i < size; i += b_n, ++j) - { - int res = memcmp(a_data, &b_data[i], std::min(a_n, b_n)); - c[j] = res < 0 || (res == 0 && b_n < a_n); - } - } - - static void constant_constant( - const std::string & a, - const std::string & b, - UInt8 & c) - { - c = a < b; - } -}; - -template -struct GreaterNumImpl -{ - static void vector_vector(const PODArray & a, const PODArray & b, PODArray & c) - { - size_t size = a.size(); - for (size_t i = 0; i < size; ++i) - c[i] = a[i] > b[i]; - } - - static void vector_constant(const PODArray & a, B b, PODArray & c) - { - size_t size = a.size(); - for (size_t i = 0; i < size; ++i) - c[i] = a[i] > b; - } - - static void constant_vector(A a, const PODArray & b, PODArray & c) - { - size_t size = b.size(); - for (size_t i = 0; i < size; ++i) - c[i] = a > b[i]; - } - - static void constant_constant(A a, B b, UInt8 & c) - { - c = a > b; - } -}; - -struct GreaterStringImpl -{ - static void string_vector_string_vector( - const ColumnString::Chars_t & a_data, const ColumnString::Offsets_t & a_offsets, - const ColumnString::Chars_t & b_data, const ColumnString::Offsets_t & b_offsets, - PODArray & c) - { - size_t size = a_offsets.size(); - for (size_t i = 0; i < size; ++i) - { - if (i == 0) - { - int res = memcmp(&a_data[0], &b_data[0], std::min(a_offsets[0], b_offsets[0]) - 1); - c[i] = res > 0 || (res == 0 && a_offsets[0] > b_offsets[0]); - } - else - { - int res = memcmp(&a_data[a_offsets[i - 1]], &b_data[b_offsets[i - 1]], - std::min(a_offsets[i] - a_offsets[i - 1], b_offsets[i] - b_offsets[i - 1]) - 1); - c[i] = res > 0 || (res == 0 && a_offsets[i] - a_offsets[i - 1] > b_offsets[i] - b_offsets[i - 1]); - } - } - } - - static void string_vector_fixed_string_vector( - const ColumnString::Chars_t & a_data, const 
ColumnString::Offsets_t & a_offsets, - const ColumnString::Chars_t & b_data, ColumnString::Offset_t b_n, - PODArray & c) - { - size_t size = a_offsets.size(); - for (size_t i = 0; i < size; ++i) - { - if (i == 0) - { - int res = memcmp(&a_data[0], &b_data[0], std::min(a_offsets[0] - 1, b_n)); - c[i] = res > 0 || (res == 0 && a_offsets[0] > b_n + 1); - } - else - { - int res = memcmp(&a_data[a_offsets[i - 1]], &b_data[i * b_n], - std::min(a_offsets[i] - a_offsets[i - 1] - 1, b_n)); - c[i] = res > 0 || (res == 0 && a_offsets[i] - a_offsets[i - 1] > b_n + 1); - } - } - } - - static void string_vector_constant( - const ColumnString::Chars_t & a_data, const ColumnString::Offsets_t & a_offsets, - const std::string & b, - PODArray & c) - { - size_t size = a_offsets.size(); - ColumnString::Offset_t b_n = b.size(); - const UInt8 * b_data = reinterpret_cast(b.data()); - for (size_t i = 0; i < size; ++i) - { - if (i == 0) - { - int res = memcmp(&a_data[0], b_data, std::min(a_offsets[0] - 1, b_n)); - c[i] = res > 0 || (res == 0 && a_offsets[0] > b_n + 1); - } - else - { - int res = memcmp(&a_data[a_offsets[i - 1]], b_data, - std::min(a_offsets[i] - a_offsets[i - 1] - 1, b_n)); - c[i] = res > 0 || (res == 0 && a_offsets[i] - a_offsets[i - 1] > b_n + 1); - } - } - } - - static void fixed_string_vector_string_vector( - const ColumnString::Chars_t & a_data, ColumnString::Offset_t a_n, - const ColumnString::Chars_t & b_data, const ColumnString::Offsets_t & b_offsets, - PODArray & c) - { - size_t size = b_offsets.size(); - for (size_t i = 0; i < size; ++i) - { - if (i == 0) - { - int res = memcmp(&a_data[0], &b_data[0], std::min(b_offsets[0] - 1, a_n)); - c[i] = res > 0 || (res == 0 && a_n + 1 > b_offsets[0]); - } - else - { - int res = memcmp(&a_data[i * a_n], &b_data[b_offsets[i - 1]], - std::min(b_offsets[i] - b_offsets[i - 1] - 1, a_n)); - c[i] = res > 0 || (res == 0 && a_n + 1 > b_offsets[i] - b_offsets[i - 1]); - } - } - } - - static void 
fixed_string_vector_fixed_string_vector( - const ColumnString::Chars_t & a_data, ColumnString::Offset_t a_n, - const ColumnString::Chars_t & b_data, ColumnString::Offset_t b_n, - PODArray & c) - { - size_t size = a_data.size(); - for (size_t i = 0, j = 0; i < size; i += a_n, ++j) - { - int res = memcmp(&a_data[i], &b_data[i], std::min(a_n, b_n)); - c[j] = res > 0 || (res == 0 && a_n > b_n); - } - } - - static void fixed_string_vector_constant( - const ColumnString::Chars_t & a_data, ColumnString::Offset_t a_n, - const std::string & b, - PODArray & c) - { - size_t size = a_data.size(); - const UInt8 * b_data = reinterpret_cast(b.data()); - ColumnString::Offset_t b_n = b.size(); - for (size_t i = 0, j = 0; i < size; i += a_n, ++j) - { - int res = memcmp(&a_data[i], b_data, std::min(a_n, b_n)); - c[j] = res > 0 || (res == 0 && a_n > b_n); - } - } - - static void constant_string_vector( - const std::string & a, - const ColumnString::Chars_t & b_data, const ColumnString::Offsets_t & b_offsets, - PODArray & c) - { - size_t size = b_offsets.size(); - ColumnString::Offset_t a_n = a.size(); - const UInt8 * a_data = reinterpret_cast(a.data()); - for (size_t i = 0; i < size; ++i) - { - if (i == 0) - { - int res = memcmp(a_data, &b_data[0], std::min(b_offsets[0] - 1, a_n)); - c[i] = res > 0 || (res == 0 && a_n + 1 > b_offsets[0]); - } - else - { - int res = memcmp(a_data, &b_data[b_offsets[i - 1]], - std::min(b_offsets[i] - b_offsets[i - 1] - 1, a_n)); - c[i] = res > 0 || (res == 0 && a_n + 1 > b_offsets[i] - b_offsets[i - 1]); - } - } - } - - static void constant_fixed_string_vector( - const std::string & a, - const ColumnString::Chars_t & b_data, ColumnString::Offset_t b_n, - PODArray & c) - { - size_t size = b_data.size(); - const UInt8 * a_data = reinterpret_cast(a.data()); - ColumnString::Offset_t a_n = a.size(); - for (size_t i = 0, j = 0; i < size; i += b_n, ++j) - { - int res = memcmp(a_data, &b_data[i], std::min(a_n, b_n)); - c[j] = res > 0 || (res == 0 && b_n > a_n); 
- } - } - - static void constant_constant( - const std::string & a, - const std::string & b, - UInt8 & c) - { - c = a > b; - } -}; - -template -struct LessOrEqualsNumImpl -{ - static void vector_vector(const PODArray & a, const PODArray & b, PODArray & c) - { - size_t size = a.size(); - for (size_t i = 0; i < size; ++i) - c[i] = a[i] <= b[i]; - } - - static void vector_constant(const PODArray & a, B b, PODArray & c) - { - size_t size = a.size(); - for (size_t i = 0; i < size; ++i) - c[i] = a[i] <= b; - } - - static void constant_vector(A a, const PODArray & b, PODArray & c) - { - size_t size = b.size(); - for (size_t i = 0; i < size; ++i) - c[i] = a <= b[i]; - } - - static void constant_constant(A a, B b, UInt8 & c) - { - c = a <= b; - } -}; - -struct LessOrEqualsStringImpl -{ - static void string_vector_string_vector( - const ColumnString::Chars_t & a_data, const ColumnString::Offsets_t & a_offsets, - const ColumnString::Chars_t & b_data, const ColumnString::Offsets_t & b_offsets, - PODArray & c) - { - size_t size = a_offsets.size(); - for (size_t i = 0; i < size; ++i) - { - if (i == 0) - { - int res = memcmp(&a_data[0], &b_data[0], std::min(a_offsets[0], b_offsets[0]) - 1); - c[i] = !(res > 0 || (res == 0 && a_offsets[0] > b_offsets[0])); - } - else - { - int res = memcmp(&a_data[a_offsets[i - 1]], &b_data[b_offsets[i - 1]], - std::min(a_offsets[i] - a_offsets[i - 1], b_offsets[i] - b_offsets[i - 1]) - 1); - c[i] = !(res > 0 || (res == 0 && a_offsets[i] - a_offsets[i - 1] > b_offsets[i] - b_offsets[i - 1])); - } - } - } - - static void string_vector_fixed_string_vector( - const ColumnString::Chars_t & a_data, const ColumnString::Offsets_t & a_offsets, - const ColumnString::Chars_t & b_data, ColumnString::Offset_t b_n, - PODArray & c) - { - size_t size = a_offsets.size(); - for (size_t i = 0; i < size; ++i) - { - if (i == 0) - { - int res = memcmp(&a_data[0], &b_data[0], std::min(a_offsets[0] - 1, b_n)); - c[i] = !(res > 0 || (res == 0 && a_offsets[0] > b_n + 1)); 
- } - else - { - int res = memcmp(&a_data[a_offsets[i - 1]], &b_data[i * b_n], - std::min(a_offsets[i] - a_offsets[i - 1] - 1, b_n)); - c[i] = !(res > 0 || (res == 0 && a_offsets[i] - a_offsets[i - 1] > b_n + 1)); - } - } - } - - static void string_vector_constant( - const ColumnString::Chars_t & a_data, const ColumnString::Offsets_t & a_offsets, - const std::string & b, - PODArray & c) - { - size_t size = a_offsets.size(); - ColumnString::Offset_t b_n = b.size(); - const UInt8 * b_data = reinterpret_cast(b.data()); - for (size_t i = 0; i < size; ++i) - { - if (i == 0) - { - int res = memcmp(&a_data[0], b_data, std::min(a_offsets[0] - 1, b_n)); - c[i] = !(res > 0 || (res == 0 && a_offsets[0] > b_n + 1)); - } - else - { - int res = memcmp(&a_data[a_offsets[i - 1]], b_data, - std::min(a_offsets[i] - a_offsets[i - 1] - 1, b_n)); - c[i] = !(res > 0 || (res == 0 && a_offsets[i] - a_offsets[i - 1] > b_n + 1)); - } - } - } - - static void fixed_string_vector_string_vector( - const ColumnString::Chars_t & a_data, ColumnString::Offset_t a_n, - const ColumnString::Chars_t & b_data, const ColumnString::Offsets_t & b_offsets, - PODArray & c) - { - size_t size = b_offsets.size(); - for (size_t i = 0; i < size; ++i) - { - if (i == 0) - { - int res = memcmp(&a_data[0], &b_data[0], std::min(b_offsets[0] - 1, a_n)); - c[i] = !(res > 0 || (res == 0 && a_n + 1 > b_offsets[0])); - } - else - { - int res = memcmp(&a_data[i * a_n], &b_data[b_offsets[i - 1]], - std::min(b_offsets[i] - b_offsets[i - 1] - 1, a_n)); - c[i] = !(res > 0 || (res == 0 && a_n + 1 > b_offsets[i] - b_offsets[i - 1])); - } - } - } - - static void fixed_string_vector_fixed_string_vector( - const ColumnString::Chars_t & a_data, ColumnString::Offset_t a_n, - const ColumnString::Chars_t & b_data, ColumnString::Offset_t b_n, - PODArray & c) - { - size_t size = a_data.size(); - for (size_t i = 0, j = 0; i < size; i += a_n, ++j) - { - int res = memcmp(&a_data[i], &b_data[i], std::min(a_n, b_n)); - c[j] = !(res > 0 || (res 
== 0 && a_n > b_n)); - } - } - - static void fixed_string_vector_constant( - const ColumnString::Chars_t & a_data, ColumnString::Offset_t a_n, - const std::string & b, - PODArray & c) - { - size_t size = a_data.size(); - const UInt8 * b_data = reinterpret_cast(b.data()); - ColumnString::Offset_t b_n = b.size(); - for (size_t i = 0, j = 0; i < size; i += a_n, ++j) - { - int res = memcmp(&a_data[i], b_data, std::min(a_n, b_n)); - c[j] = !(res > 0 || (res == 0 && a_n > b_n)); - } - } - - static void constant_string_vector( - const std::string & a, - const ColumnString::Chars_t & b_data, const ColumnString::Offsets_t & b_offsets, - PODArray & c) - { - size_t size = b_offsets.size(); - ColumnString::Offset_t a_n = a.size(); - const UInt8 * a_data = reinterpret_cast(a.data()); - for (size_t i = 0; i < size; ++i) - { - if (i == 0) - { - int res = memcmp(a_data, &b_data[0], std::min(b_offsets[0] - 1, a_n)); - c[i] = !(res > 0 || (res == 0 && a_n + 1 > b_offsets[0])); - } - else - { - int res = memcmp(a_data, &b_data[b_offsets[i - 1]], - std::min(b_offsets[i] - b_offsets[i - 1] - 1, a_n)); - c[i] = !(res > 0 || (res == 0 && a_n + 1 > b_offsets[i] - b_offsets[i - 1])); - } - } - } - - static void constant_fixed_string_vector( - const std::string & a, - const ColumnString::Chars_t & b_data, ColumnString::Offset_t b_n, - PODArray & c) - { - size_t size = b_data.size(); - const UInt8 * a_data = reinterpret_cast(a.data()); - ColumnString::Offset_t a_n = a.size(); - for (size_t i = 0, j = 0; i < size; i += b_n, ++j) - { - int res = memcmp(a_data, &b_data[i], std::min(a_n, b_n)); - c[j] = !(res > 0 || (res == 0 && b_n > a_n)); - } - } - - static void constant_constant( - const std::string & a, - const std::string & b, - UInt8 & c) - { - c = a <= b; - } -}; - -template -struct GreaterOrEqualsNumImpl -{ - static void vector_vector(const PODArray & a, const PODArray & b, PODArray & c) - { - size_t size = a.size(); - for (size_t i = 0; i < size; ++i) - c[i] = a[i] >= b[i]; - } - - 
static void vector_constant(const PODArray & a, B b, PODArray & c) - { - size_t size = a.size(); - for (size_t i = 0; i < size; ++i) - c[i] = a[i] >= b; - } - - static void constant_vector(A a, const PODArray & b, PODArray & c) - { - size_t size = b.size(); - for (size_t i = 0; i < size; ++i) - c[i] = a >= b[i]; - } - - static void constant_constant(A a, B b, UInt8 & c) - { - c = a >= b; - } -}; - -struct GreaterOrEqualsStringImpl -{ - static void string_vector_string_vector( - const ColumnString::Chars_t & a_data, const ColumnString::Offsets_t & a_offsets, - const ColumnString::Chars_t & b_data, const ColumnString::Offsets_t & b_offsets, - PODArray & c) - { - size_t size = a_offsets.size(); - for (size_t i = 0; i < size; ++i) - { - if (i == 0) - { - int res = memcmp(&a_data[0], &b_data[0], std::min(a_offsets[0], b_offsets[0]) - 1); - c[i] = !(res < 0 || (res == 0 && a_offsets[0] < b_offsets[0])); - } - else - { - int res = memcmp(&a_data[a_offsets[i - 1]], &b_data[b_offsets[i - 1]], - std::min(a_offsets[i] - a_offsets[i - 1], b_offsets[i] - b_offsets[i - 1]) - 1); - c[i] = !(res < 0 || (res == 0 && a_offsets[i] - a_offsets[i - 1] < b_offsets[i] - b_offsets[i - 1])); - } - } - } - - static void string_vector_fixed_string_vector( - const ColumnString::Chars_t & a_data, const ColumnString::Offsets_t & a_offsets, - const ColumnString::Chars_t & b_data, ColumnString::Offset_t b_n, - PODArray & c) - { - size_t size = a_offsets.size(); - for (size_t i = 0; i < size; ++i) - { - if (i == 0) - { - int res = memcmp(&a_data[0], &b_data[0], std::min(a_offsets[0] - 1, b_n)); - c[i] = !(res < 0 || (res == 0 && a_offsets[0] < b_n + 1)); - } - else - { - int res = memcmp(&a_data[a_offsets[i - 1]], &b_data[i * b_n], - std::min(a_offsets[i] - a_offsets[i - 1] - 1, b_n)); - c[i] = !(res < 0 || (res == 0 && a_offsets[i] - a_offsets[i - 1] < b_n + 1)); - } - } - } - - static void string_vector_constant( - const ColumnString::Chars_t & a_data, const ColumnString::Offsets_t & a_offsets, 
- const std::string & b, - PODArray & c) - { - size_t size = a_offsets.size(); - ColumnString::Offset_t b_n = b.size(); - const UInt8 * b_data = reinterpret_cast(b.data()); - for (size_t i = 0; i < size; ++i) - { - if (i == 0) - { - int res = memcmp(&a_data[0], b_data, std::min(a_offsets[0] - 1, b_n)); - c[i] = !(res < 0 || (res == 0 && a_offsets[0] < b_n + 1)); - } - else - { - int res = memcmp(&a_data[a_offsets[i - 1]], b_data, - std::min(a_offsets[i] - a_offsets[i - 1] - 1, b_n)); - c[i] = !(res < 0 || (res == 0 && a_offsets[i] - a_offsets[i - 1] < b_n + 1)); - } - } - } - - static void fixed_string_vector_string_vector( - const ColumnString::Chars_t & a_data, ColumnString::Offset_t a_n, - const ColumnString::Chars_t & b_data, const ColumnString::Offsets_t & b_offsets, - PODArray & c) - { - size_t size = b_offsets.size(); - for (size_t i = 0; i < size; ++i) - { - if (i == 0) - { - int res = memcmp(&a_data[0], &b_data[0], std::min(b_offsets[0] - 1, a_n)); - c[i] = !(res < 0 || (res == 0 && a_n + 1 < b_offsets[0])); - } - else - { - int res = memcmp(&a_data[i * a_n], &b_data[b_offsets[i - 1]], - std::min(b_offsets[i] - b_offsets[i - 1] - 1, a_n)); - c[i] = !(res < 0 || (res == 0 && a_n + 1 < b_offsets[i] - b_offsets[i - 1])); - } - } - } - - static void fixed_string_vector_fixed_string_vector( - const ColumnString::Chars_t & a_data, ColumnString::Offset_t a_n, - const ColumnString::Chars_t & b_data, ColumnString::Offset_t b_n, - PODArray & c) - { - size_t size = a_data.size(); - for (size_t i = 0, j = 0; i < size; i += a_n, ++j) - { - int res = memcmp(&a_data[i], &b_data[i], std::min(a_n, b_n)); - c[j] = !(res < 0 || (res == 0 && a_n < b_n)); - } - } - - static void fixed_string_vector_constant( - const ColumnString::Chars_t & a_data, ColumnString::Offset_t a_n, - const std::string & b, - PODArray & c) - { - size_t size = a_data.size(); - const UInt8 * b_data = reinterpret_cast(b.data()); - ColumnString::Offset_t b_n = b.size(); - for (size_t i = 0, j = 0; i < 
size; i += a_n, ++j) - { - int res = memcmp(&a_data[i], b_data, std::min(a_n, b_n)); - c[j] = !(res < 0 || (res == 0 && a_n < b_n)); - } - } - - static void constant_string_vector( - const std::string & a, - const ColumnString::Chars_t & b_data, const ColumnString::Offsets_t & b_offsets, - PODArray & c) - { - size_t size = b_offsets.size(); - ColumnString::Offset_t a_n = a.size(); - const UInt8 * a_data = reinterpret_cast(a.data()); - for (size_t i = 0; i < size; ++i) - { - if (i == 0) - { - int res = memcmp(a_data, &b_data[0], std::min(b_offsets[0] - 1, a_n)); - c[i] = !(res < 0 || (res == 0 && a_n + 1 < b_offsets[0])); - } - else - { - int res = memcmp(a_data, &b_data[b_offsets[i - 1]], - std::min(b_offsets[i] - b_offsets[i - 1] - 1, a_n)); - c[i] = !(res < 0 || (res == 0 && a_n + 1 < b_offsets[i] - b_offsets[i - 1])); - } - } - } - - static void constant_fixed_string_vector( - const std::string & a, - const ColumnString::Chars_t & b_data, ColumnString::Offset_t b_n, - PODArray & c) - { - size_t size = b_data.size(); - const UInt8 * a_data = reinterpret_cast(a.data()); - ColumnString::Offset_t a_n = a.size(); - for (size_t i = 0, j = 0; i < size; i += b_n, ++j) - { - int res = memcmp(a_data, &b_data[i], std::min(a_n, b_n)); - c[j] = !(res < 0 || (res == 0 && b_n < a_n)); - } - } - - static void constant_constant( - const std::string & a, - const std::string & b, - UInt8 & c) - { - c = a >= b; + fixed_string_vector_constant(b_data, b_n, a, c); } }; -#pragma GCC diagnostic pop +template +struct StringComparisonImpl> : StringEqualsImpl {}; + +template +struct StringComparisonImpl> : StringEqualsImpl {}; template < - template class NumImpl, - typename StringImpl, + template class Op, typename Name> class FunctionComparison : public IFunction { @@ -1136,7 +366,7 @@ private: ColumnUInt8::Container_t & vec_res = col_res->getData(); vec_res.resize(col_left->getData().size()); - NumImpl::vector_vector(col_left->getData(), col_right->getData(), vec_res); + 
NumComparisonImpl>::vector_vector(col_left->getData(), col_right->getData(), vec_res); return true; } @@ -1147,7 +377,7 @@ private: ColumnUInt8::Container_t & vec_res = col_res->getData(); vec_res.resize(col_left->getData().size()); - NumImpl::vector_constant(col_left->getData(), col_right->getData(), vec_res); + NumComparisonImpl>::vector_constant(col_left->getData(), col_right->getData(), vec_res); return true; } @@ -1165,14 +395,14 @@ private: ColumnUInt8::Container_t & vec_res = col_res->getData(); vec_res.resize(col_left->size()); - NumImpl::constant_vector(col_left->getData(), col_right->getData(), vec_res); + NumComparisonImpl>::constant_vector(col_left->getData(), col_right->getData(), vec_res); return true; } else if (ColumnConst * col_right = typeid_cast *>(&*block.getByPosition(arguments[1]).column)) { UInt8 res = 0; - NumImpl::constant_constant(col_left->getData(), col_right->getData(), res); + NumComparisonImpl>::constant_constant(col_left->getData(), col_right->getData(), res); ColumnConstUInt8 * col_res = new ColumnConstUInt8(col_left->size(), res); block.getByPosition(result).column = col_res; @@ -1238,6 +468,8 @@ private: ColumnConstString * c0_const = typeid_cast(c0); ColumnConstString * c1_const = typeid_cast(c1); + using StringImpl = StringComparisonImpl>; + if (c0_const && c1_const) { ColumnConstUInt8 * c_res = new ColumnConstUInt8(c0_const->size(), 0); @@ -1359,11 +591,11 @@ struct NameGreater { static const char * get() { return "greater"; } }; struct NameLessOrEquals { static const char * get() { return "lessOrEquals"; } }; struct NameGreaterOrEquals { static const char * get() { return "greaterOrEquals"; } }; -typedef FunctionComparison FunctionEquals; -typedef FunctionComparison FunctionNotEquals; -typedef FunctionComparison FunctionLess; -typedef FunctionComparison FunctionGreater; -typedef FunctionComparison FunctionLessOrEquals; -typedef FunctionComparison FunctionGreaterOrEquals; +typedef FunctionComparison FunctionEquals; +typedef 
FunctionComparison FunctionNotEquals; +typedef FunctionComparison FunctionLess; +typedef FunctionComparison FunctionGreater; +typedef FunctionComparison FunctionLessOrEquals; +typedef FunctionComparison FunctionGreaterOrEquals; } From aade03d427ed9b6bc4e61ad16854586598137f17 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 17 Aug 2014 09:12:46 +0400 Subject: [PATCH 082/127] dbms: fixed error [#METR-2944]. --- dbms/src/Columns/IColumn.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Columns/IColumn.cpp b/dbms/src/Columns/IColumn.cpp index e348cf79a3b..c7a5322f0f4 100644 --- a/dbms/src/Columns/IColumn.cpp +++ b/dbms/src/Columns/IColumn.cpp @@ -22,7 +22,7 @@ size_t countBytesInFilter(const IColumn::Filter & filt) const Int8 * end64 = pos + filt.size() / 64 * 64; for (; pos < end64; pos += 64) - count += __builtin_popcount( + count += __builtin_popcountll( static_cast(_mm_movemask_epi8(_mm_cmpgt_epi8( _mm_loadu_si128(reinterpret_cast(pos)), zero16))) From bbd6064a1939e0279b291f50cdb947315b46e9e6 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 17 Aug 2014 09:41:28 +0400 Subject: [PATCH 083/127] dbms: improved performance of comparisons [#METR-2944]. --- .../DB/Functions/FunctionsComparison.h | 43 ++++++++++++++++--- 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/dbms/include/DB/Functions/FunctionsComparison.h b/dbms/include/DB/Functions/FunctionsComparison.h index f91e1e966eb..adeee4a6d17 100644 --- a/dbms/include/DB/Functions/FunctionsComparison.h +++ b/dbms/include/DB/Functions/FunctionsComparison.h @@ -50,23 +50,54 @@ struct NumComparisonImpl { static void vector_vector(const PODArray & a, const PODArray & b, PODArray & c) { + /** GCC 4.8.2 векторизует цикл только если его записать в такой форме. + * В данном случае, если сделать цикл по индексу массива (код будет выглядеть проще), + * цикл не будет векторизовываться. 
+ */ + size_t size = a.size(); - for (size_t i = 0; i < size; ++i) - c[i] = Op::apply(a[i], b[i]); + const A * a_pos = &a[0]; + const B * b_pos = &b[0]; + UInt8 * c_pos = &c[0]; + const A * a_end = a_pos + size; + + while (a_pos < a_end) + { + *c_pos = Op::apply(*a_pos, *b_pos); + ++a_pos; + ++b_pos; + ++c_pos; + } } static void vector_constant(const PODArray & a, B b, PODArray & c) { size_t size = a.size(); - for (size_t i = 0; i < size; ++i) - c[i] = Op::apply(a[i], b); + const A * a_pos = &a[0]; + UInt8 * c_pos = &c[0]; + const A * a_end = a_pos + size; + + while (a_pos < a_end) + { + *c_pos = Op::apply(*a_pos, b); + ++a_pos; + ++c_pos; + } } static void constant_vector(A a, const PODArray & b, PODArray & c) { size_t size = b.size(); - for (size_t i = 0; i < size; ++i) - c[i] = Op::apply(a, b[i]); + const B * b_pos = &b[0]; + UInt8 * c_pos = &c[0]; + const B * b_end = b_pos + size; + + while (b_pos < b_end) + { + *c_pos = Op::apply(a, *b_pos); + ++b_pos; + ++c_pos; + } } static void constant_constant(A a, B b, UInt8 & c) From 2d2c4c932e1cb3f0eab735234e8ad7c3779c0d55 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 17 Aug 2014 11:39:28 +0400 Subject: [PATCH 084/127] dbms: improved performance of filtering [#METR-2944]. --- dbms/include/DB/Columns/ColumnVector.h | 46 +++++++++++++++++++++++--- dbms/include/DB/Common/PODArray.h | 6 ++++ 2 files changed, 48 insertions(+), 4 deletions(-) diff --git a/dbms/include/DB/Columns/ColumnVector.h b/dbms/include/DB/Columns/ColumnVector.h index 1faa5eaf662..e7da039aaf7 100644 --- a/dbms/include/DB/Columns/ColumnVector.h +++ b/dbms/include/DB/Columns/ColumnVector.h @@ -222,11 +222,49 @@ public: typename Self::Container_t & res_data = res_->getData(); res_data.reserve(size); - for (size_t i = 0; i < size; ++i) - if (filt[i]) - res_data.push_back(data[i]); + /** Чуть более оптимизированная версия. 
+ * Исходит из допущения, что часто куски последовательно идущих значений + * полностью проходят или полностью не проходят фильтр. + * Поэтому, будем оптимистично проверять куски по 16 значений. + */ + const UInt8 * filt_pos = &filt[0]; + const UInt8 * filt_end = filt_pos + size; + const UInt8 * filt_end_sse = filt_pos + size / 16 * 16; + const T * data_pos = &data[0]; - return res; + while (filt_pos < filt_end_sse) + { + int mask = _mm_movemask_epi8(_mm_loadu_si128(reinterpret_cast(filt_pos))); + + if (0 == mask) + { + /// Ничего не вставляем. + } + else if (0xFFFF == mask) + { + res_data.insert_assume_reserved(data_pos, data_pos + 16); + } + else + { + for (size_t i = 0; i < 16; ++i) + if (filt_pos[i]) + res_data.push_back(data_pos[i]); + } + + filt_pos += 16; + data_pos += 16; + } + + while (filt_pos < filt_end) + { + if (*filt_pos) + res_data.push_back(*data_pos); + + ++filt_pos; + ++data_pos; + } + + return res; } ColumnPtr permute(const IColumn::Permutation & perm, size_t limit) const diff --git a/dbms/include/DB/Common/PODArray.h b/dbms/include/DB/Common/PODArray.h index 93fe1ad839f..37fabed0a0d 100644 --- a/dbms/include/DB/Common/PODArray.h +++ b/dbms/include/DB/Common/PODArray.h @@ -248,6 +248,12 @@ public: if (required_capacity > capacity()) reserve(round_up_to_power_of_two(required_capacity)); + insert_assume_reserved(from_begin, from_end); + } + + template + void insert_assume_reserved(It1 from_begin, It2 from_end) + { size_t bytes_to_copy = byte_size(from_end - from_begin); memcpy(c_end, reinterpret_cast(&*from_begin), bytes_to_copy); c_end += bytes_to_copy; From dbe7ce299023b456be7a4802d14977e649285998 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 17 Aug 2014 12:28:03 +0400 Subject: [PATCH 085/127] dbms: faster rand and rand64 functions [#METR-2944]. 
--- dbms/include/DB/Functions/FunctionsRandom.h | 107 +++++++++++++++----- 1 file changed, 81 insertions(+), 26 deletions(-) diff --git a/dbms/include/DB/Functions/FunctionsRandom.h b/dbms/include/DB/Functions/FunctionsRandom.h index d9ce0121664..48080058a24 100644 --- a/dbms/include/DB/Functions/FunctionsRandom.h +++ b/dbms/include/DB/Functions/FunctionsRandom.h @@ -4,6 +4,7 @@ #include #include +#include #include @@ -13,14 +14,14 @@ namespace DB /** Функции генерации псевдослучайных чисел. * Функция может быть вызвана без аргументов или с одним аргументом. * Аргумент игнорируется и служит лишь для того, чтобы несколько вызовов одной функции считались разными и не склеивались. - * + * * Пример: * SELECT rand(), rand() - выдаст два одинаковых столбца. * SELECT rand(1), rand(2) - выдаст два разных столбца. * * Некриптографические генераторы: - * - * rand - linear congruental generator 0 .. 2^31 - 1. + * + * rand - linear congruental generator 0 .. 2^32 - 1. * rand64 - комбинирует несколько значений rand, чтобы получить значения из диапазона 0 .. 2^64 - 1. * * В качестве затравки используют время. @@ -30,31 +31,74 @@ namespace DB namespace detail { - void seed(drand48_data & rand_state, intptr_t additional_seed) + struct LinearCongruentialGenerator + { + /// Константы из man lrand48_r. 
+ static constexpr UInt64 a = 0x5DEECE66D; + static constexpr UInt64 c = 0xB; + + /// А эта - из head -c8 /dev/urandom | xxd -p + UInt64 current = 0x09826f4a081cee35ULL; + + LinearCongruentialGenerator() {} + LinearCongruentialGenerator(UInt64 value) : current(value) {} + + void seed(UInt64 value) + { + current = value; + } + + UInt32 next() + { + current = current * a + c; + return current >> 16; + } + }; + + void seed(LinearCongruentialGenerator & generator, intptr_t additional_seed) { struct timespec times; if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, ×)) throwFromErrno("Cannot clock_gettime.", ErrorCodes::CANNOT_CLOCK_GETTIME); - srand48_r(intHash32<0>(times.tv_nsec ^ intHash32<0>(additional_seed)), &rand_state); + generator.seed(intHash64(times.tv_nsec ^ intHash64(additional_seed))); } } struct RandImpl { typedef UInt32 ReturnType; - + static void execute(PODArray & res) { - drand48_data rand_state; - detail::seed(rand_state, reinterpret_cast(&res[0])); - + detail::LinearCongruentialGenerator generator0; + detail::LinearCongruentialGenerator generator1; + detail::LinearCongruentialGenerator generator2; + detail::LinearCongruentialGenerator generator3; + + detail::seed(generator0, 0xfb4121280b2ab902ULL + reinterpret_cast(&res[0])); + detail::seed(generator1, 0x0121cf76df39c673ULL + reinterpret_cast(&res[0])); + detail::seed(generator2, 0x17ae86e3a19a602fULL + reinterpret_cast(&res[0])); + detail::seed(generator3, 0x8b6e16da7e06d622ULL + reinterpret_cast(&res[0])); + size_t size = res.size(); - for (size_t i = 0; i < size; ++i) + ReturnType * pos = &res[0]; + ReturnType * end = pos + size; + ReturnType * end4 = pos + size / 4 * 4; + + while (pos < end4) { - long rand_res; - lrand48_r(&rand_state, &rand_res); - res[i] = rand_res; + pos[0] = generator0.next(); + pos[1] = generator1.next(); + pos[2] = generator2.next(); + pos[3] = generator3.next(); + pos += 4; + } + + while (pos < end) + { + pos[0] = generator0.next(); + ++pos; } } }; @@ -65,21 +109,32 @@ struct 
Rand64Impl static void execute(PODArray & res) { - drand48_data rand_state; - detail::seed(rand_state, reinterpret_cast(&res[0])); + detail::LinearCongruentialGenerator generator0; + detail::LinearCongruentialGenerator generator1; + detail::LinearCongruentialGenerator generator2; + detail::LinearCongruentialGenerator generator3; + + detail::seed(generator0, 0xfb4121280b2ab902ULL + reinterpret_cast(&res[0])); + detail::seed(generator1, 0x0121cf76df39c673ULL + reinterpret_cast(&res[0])); + detail::seed(generator2, 0x17ae86e3a19a602fULL + reinterpret_cast(&res[0])); + detail::seed(generator3, 0x8b6e16da7e06d622ULL + reinterpret_cast(&res[0])); size_t size = res.size(); - for (size_t i = 0; i < size; ++i) + ReturnType * pos = &res[0]; + ReturnType * end = pos + size; + ReturnType * end2 = pos + size / 2 * 2; + + while (pos < end2) { - long rand_res1; - long rand_res2; - long rand_res3; - - lrand48_r(&rand_state, &rand_res1); - lrand48_r(&rand_state, &rand_res2); - lrand48_r(&rand_state, &rand_res3); - - res[i] = rand_res1 ^ (rand_res2 << 18) ^ (rand_res3 << 33); + pos[0] = (static_cast(generator0.next()) << 32) | generator1.next(); + pos[1] = (static_cast(generator2.next()) << 32) | generator3.next(); + pos += 2; + } + + while (pos < end) + { + pos[0] = (static_cast(generator0.next()) << 32) | generator1.next(); + ++pos; } } }; @@ -90,7 +145,7 @@ class FunctionRandom : public IFunction { private: typedef typename Impl::ReturnType ToType; - + public: /// Получить имя функции. String getName() const From 2fe534aade30e000519e45ea6386a82f878bb743 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 17 Aug 2014 13:18:47 +0400 Subject: [PATCH 086/127] dbms: fixed error; suddenly faster [#METR-2944]. 
--- dbms/include/DB/Columns/ColumnVector.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dbms/include/DB/Columns/ColumnVector.h b/dbms/include/DB/Columns/ColumnVector.h index e7da039aaf7..02d6fd96e89 100644 --- a/dbms/include/DB/Columns/ColumnVector.h +++ b/dbms/include/DB/Columns/ColumnVector.h @@ -232,9 +232,11 @@ public: const UInt8 * filt_end_sse = filt_pos + size / 16 * 16; const T * data_pos = &data[0]; + const __m128i zero16 = _mm_set1_epi8(0); + while (filt_pos < filt_end_sse) { - int mask = _mm_movemask_epi8(_mm_loadu_si128(reinterpret_cast(filt_pos))); + int mask = _mm_movemask_epi8(_mm_cmpgt_epi8(_mm_loadu_si128(reinterpret_cast(filt_pos)), zero16)); if (0 == mask) { From 6a3f2047059b00b99b6db8097a47d2b7ebf1122c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 18 Aug 2014 04:07:05 +0400 Subject: [PATCH 087/127] dbms: little better [#METR-2944]. --- dbms/src/Functions/FunctionFactory.cpp | 389 +++++++++++++------------ 1 file changed, 198 insertions(+), 191 deletions(-) diff --git a/dbms/src/Functions/FunctionFactory.cpp b/dbms/src/Functions/FunctionFactory.cpp index e5dcceec489..ceb4b237c7e 100644 --- a/dbms/src/Functions/FunctionFactory.cpp +++ b/dbms/src/Functions/FunctionFactory.cpp @@ -31,218 +31,225 @@ FunctionPtr FunctionFactory::get( const String & name, const Context & context) const { - /// Немного неоптимально. 
+ static const std::unordered_map< + std::string, + std::function> functions = + { +#define F [](const Context & context) + {"plus", F { return new FunctionPlus; } }, + {"minus", F { return new FunctionMinus; } }, + {"multiply", F { return new FunctionMultiply; } }, + {"divide", F { return new FunctionDivideFloating; } }, + {"intDiv", F { return new FunctionDivideIntegral; } }, + {"modulo", F { return new FunctionModulo; } }, + {"negate", F { return new FunctionNegate; } }, + {"bitAnd", F { return new FunctionBitAnd; } }, + {"bitOr", F { return new FunctionBitOr; } }, + {"bitXor", F { return new FunctionBitXor; } }, + {"bitNot", F { return new FunctionBitNot; } }, + {"bitShiftLeft", F { return new FunctionBitShiftLeft; } }, + {"bitShiftRight", F { return new FunctionBitShiftRight; } }, - if (name == "plus") return new FunctionPlus; - else if (name == "minus") return new FunctionMinus; - else if (name == "multiply") return new FunctionMultiply; - else if (name == "divide") return new FunctionDivideFloating; - else if (name == "intDiv") return new FunctionDivideIntegral; - else if (name == "modulo") return new FunctionModulo; - else if (name == "negate") return new FunctionNegate; - else if (name == "bitAnd") return new FunctionBitAnd; - else if (name == "bitOr") return new FunctionBitOr; - else if (name == "bitXor") return new FunctionBitXor; - else if (name == "bitNot") return new FunctionBitNot; - else if (name == "bitShiftLeft") return new FunctionBitShiftLeft; - else if (name == "bitShiftRight") return new FunctionBitShiftRight; + {"equals", F { return new FunctionEquals; } }, + {"notEquals", F { return new FunctionNotEquals; } }, + {"less", F { return new FunctionLess; } }, + {"greater", F { return new FunctionGreater; } }, + {"lessOrEquals", F { return new FunctionLessOrEquals; } }, + {"greaterOrEquals", F { return new FunctionGreaterOrEquals; } }, - else if (name == "equals") return new FunctionEquals; - else if (name == "notEquals") return new 
FunctionNotEquals; - else if (name == "less") return new FunctionLess; - else if (name == "greater") return new FunctionGreater; - else if (name == "lessOrEquals") return new FunctionLessOrEquals; - else if (name == "greaterOrEquals") return new FunctionGreaterOrEquals; + {"and", F { return new FunctionAnd; } }, + {"or", F { return new FunctionOr; } }, + {"xor", F { return new FunctionXor; } }, + {"not", F { return new FunctionNot; } }, - else if (name == "and") return new FunctionAnd; - else if (name == "or") return new FunctionOr; - else if (name == "xor") return new FunctionXor; - else if (name == "not") return new FunctionNot; + {"roundToExp2", F { return new FunctionRoundToExp2; } }, + {"roundDuration", F { return new FunctionRoundDuration; } }, + {"roundAge", F { return new FunctionRoundAge; } }, - else if (name == "roundToExp2") return new FunctionRoundToExp2; - else if (name == "roundDuration") return new FunctionRoundDuration; - else if (name == "roundAge") return new FunctionRoundAge; + {"empty", F { return new FunctionEmpty; } }, + {"notEmpty", F { return new FunctionNotEmpty; } }, + {"length", F { return new FunctionLength; } }, + {"lengthUTF8", F { return new FunctionLengthUTF8; } }, + {"lower", F { return new FunctionLower; } }, + {"upper", F { return new FunctionUpper; } }, + {"lowerUTF8", F { return new FunctionLowerUTF8; } }, + {"upperUTF8", F { return new FunctionUpperUTF8; } }, + {"reverse", F { return new FunctionReverse; } }, + {"reverseUTF8", F { return new FunctionReverseUTF8; } }, + {"concat", F { return new FunctionConcat; } }, + {"substring", F { return new FunctionSubstring; } }, + {"replaceOne", F { return new FunctionReplaceOne; } }, + {"replaceAll", F { return new FunctionReplaceAll; } }, + {"replaceRegexpOne", F { return new FunctionReplaceRegexpOne; } }, + {"replaceRegexpAll", F { return new FunctionReplaceRegexpAll; } }, + {"substringUTF8", F { return new FunctionSubstringUTF8; } }, - else if (name == "empty") return new 
FunctionEmpty; - else if (name == "notEmpty") return new FunctionNotEmpty; - else if (name == "length") return new FunctionLength; - else if (name == "lengthUTF8") return new FunctionLengthUTF8; - else if (name == "lower") return new FunctionLower; - else if (name == "upper") return new FunctionUpper; - else if (name == "lowerUTF8") return new FunctionLowerUTF8; - else if (name == "upperUTF8") return new FunctionUpperUTF8; - else if (name == "reverse") return new FunctionReverse; - else if (name == "reverseUTF8") return new FunctionReverseUTF8; - else if (name == "concat") return new FunctionConcat; - else if (name == "substring") return new FunctionSubstring; - else if (name == "replaceOne") return new FunctionReplaceOne; - else if (name == "replaceAll") return new FunctionReplaceAll; - else if (name == "replaceRegexpOne") return new FunctionReplaceRegexpOne; - else if (name == "replaceRegexpAll") return new FunctionReplaceRegexpAll; - else if (name == "substringUTF8") return new FunctionSubstringUTF8; + {"toUInt8", F { return new FunctionToUInt8; } }, + {"toUInt16", F { return new FunctionToUInt16; } }, + {"toUInt32", F { return new FunctionToUInt32; } }, + {"toUInt64", F { return new FunctionToUInt64; } }, + {"toInt8", F { return new FunctionToInt8; } }, + {"toInt16", F { return new FunctionToInt16; } }, + {"toInt32", F { return new FunctionToInt32; } }, + {"toInt64", F { return new FunctionToInt64; } }, + {"toFloat32", F { return new FunctionToFloat32; } }, + {"toFloat64", F { return new FunctionToFloat64; } }, + {"toDate", F { return new FunctionToDate; } }, + {"toDateTime", F { return new FunctionToDateTime; } }, + {"toString", F { return new FunctionToString; } }, + {"toFixedString", F { return new FunctionToFixedString; } }, + {"toStringCutToZero", F { return new FunctionToStringCutToZero; } }, - else if (name == "toUInt8") return new FunctionToUInt8; - else if (name == "toUInt16") return new FunctionToUInt16; - else if (name == "toUInt32") return new 
FunctionToUInt32; - else if (name == "toUInt64") return new FunctionToUInt64; - else if (name == "toInt8") return new FunctionToInt8; - else if (name == "toInt16") return new FunctionToInt16; - else if (name == "toInt32") return new FunctionToInt32; - else if (name == "toInt64") return new FunctionToInt64; - else if (name == "toFloat32") return new FunctionToFloat32; - else if (name == "toFloat64") return new FunctionToFloat64; - else if (name == "toDate") return new FunctionToDate; - else if (name == "toDateTime") return new FunctionToDateTime; - else if (name == "toString") return new FunctionToString; - else if (name == "toFixedString") return new FunctionToFixedString; - else if (name == "toStringCutToZero") return new FunctionToStringCutToZero; + {"reinterpretAsUInt8", F { return new FunctionReinterpretAsUInt8; } }, + {"reinterpretAsUInt16", F { return new FunctionReinterpretAsUInt16; } }, + {"reinterpretAsUInt32", F { return new FunctionReinterpretAsUInt32; } }, + {"reinterpretAsUInt64", F { return new FunctionReinterpretAsUInt64; } }, + {"reinterpretAsInt8", F { return new FunctionReinterpretAsInt8; } }, + {"reinterpretAsInt16", F { return new FunctionReinterpretAsInt16; } }, + {"reinterpretAsInt32", F { return new FunctionReinterpretAsInt32; } }, + {"reinterpretAsInt64", F { return new FunctionReinterpretAsInt64; } }, + {"reinterpretAsFloat32", F { return new FunctionReinterpretAsFloat32; } }, + {"reinterpretAsFloat64", F { return new FunctionReinterpretAsFloat64; } }, + {"reinterpretAsDate", F { return new FunctionReinterpretAsDate; } }, + {"reinterpretAsDateTime", F { return new FunctionReinterpretAsDateTime; } }, + {"reinterpretAsString", F { return new FunctionReinterpretAsString; } }, - else if (name == "reinterpretAsUInt8") return new FunctionReinterpretAsUInt8; - else if (name == "reinterpretAsUInt16") return new FunctionReinterpretAsUInt16; - else if (name == "reinterpretAsUInt32") return new FunctionReinterpretAsUInt32; - else if (name == 
"reinterpretAsUInt64") return new FunctionReinterpretAsUInt64; - else if (name == "reinterpretAsInt8") return new FunctionReinterpretAsInt8; - else if (name == "reinterpretAsInt16") return new FunctionReinterpretAsInt16; - else if (name == "reinterpretAsInt32") return new FunctionReinterpretAsInt32; - else if (name == "reinterpretAsInt64") return new FunctionReinterpretAsInt64; - else if (name == "reinterpretAsFloat32") return new FunctionReinterpretAsFloat32; - else if (name == "reinterpretAsFloat64") return new FunctionReinterpretAsFloat64; - else if (name == "reinterpretAsDate") return new FunctionReinterpretAsDate; - else if (name == "reinterpretAsDateTime") return new FunctionReinterpretAsDateTime; - else if (name == "reinterpretAsString") return new FunctionReinterpretAsString; + {"toYear", F { return new FunctionToYear; } }, + {"toMonth", F { return new FunctionToMonth; } }, + {"toDayOfMonth", F { return new FunctionToDayOfMonth; } }, + {"toDayOfWeek", F { return new FunctionToDayOfWeek; } }, + {"toHour", F { return new FunctionToHour; } }, + {"toMinute", F { return new FunctionToMinute; } }, + {"toSecond", F { return new FunctionToSecond; } }, + {"toMonday", F { return new FunctionToMonday; } }, + {"toStartOfMonth", F { return new FunctionToStartOfMonth; } }, + {"toStartOfQuarter", F { return new FunctionToStartOfQuarter; } }, + {"toStartOfYear", F { return new FunctionToStartOfYear; } }, + {"toStartOfMinute", F { return new FunctionToStartOfMinute; } }, + {"toStartOfHour", F { return new FunctionToStartOfHour; } }, + {"toRelativeYearNum", F { return new FunctionToRelativeYearNum; } }, + {"toRelativeMonthNum", F { return new FunctionToRelativeMonthNum; } }, + {"toRelativeWeekNum", F { return new FunctionToRelativeWeekNum; } }, + {"toRelativeDayNum", F { return new FunctionToRelativeDayNum; } }, + {"toRelativeHourNum", F { return new FunctionToRelativeHourNum; } }, + {"toRelativeMinuteNum", F { return new FunctionToRelativeMinuteNum; } }, + 
{"toRelativeSecondNum", F { return new FunctionToRelativeSecondNum; } }, + {"toTime", F { return new FunctionToTime; } }, + {"now", F { return new FunctionNow; } }, + {"timeSlot", F { return new FunctionTimeSlot; } }, + {"timeSlots", F { return new FunctionTimeSlots; } }, - else if (name == "toYear") return new FunctionToYear; - else if (name == "toMonth") return new FunctionToMonth; - else if (name == "toDayOfMonth") return new FunctionToDayOfMonth; - else if (name == "toDayOfWeek") return new FunctionToDayOfWeek; - else if (name == "toHour") return new FunctionToHour; - else if (name == "toMinute") return new FunctionToMinute; - else if (name == "toSecond") return new FunctionToSecond; - else if (name == "toMonday") return new FunctionToMonday; - else if (name == "toStartOfMonth") return new FunctionToStartOfMonth; - else if (name == "toStartOfQuarter") return new FunctionToStartOfQuarter; - else if (name == "toStartOfYear") return new FunctionToStartOfYear; - else if (name == "toStartOfMinute") return new FunctionToStartOfMinute; - else if (name == "toStartOfHour") return new FunctionToStartOfHour; - else if (name == "toRelativeYearNum") return new FunctionToRelativeYearNum; - else if (name == "toRelativeMonthNum") return new FunctionToRelativeMonthNum; - else if (name == "toRelativeWeekNum") return new FunctionToRelativeWeekNum; - else if (name == "toRelativeDayNum") return new FunctionToRelativeDayNum; - else if (name == "toRelativeHourNum") return new FunctionToRelativeHourNum; - else if (name == "toRelativeMinuteNum") return new FunctionToRelativeMinuteNum; - else if (name == "toRelativeSecondNum") return new FunctionToRelativeSecondNum; - else if (name == "toTime") return new FunctionToTime; - else if (name == "now") return new FunctionNow; - else if (name == "timeSlot") return new FunctionTimeSlot; - else if (name == "timeSlots") return new FunctionTimeSlots; + {"position", F { return new FunctionPosition; } }, + {"positionUTF8", F { return new 
FunctionPositionUTF8; } }, + {"match", F { return new FunctionMatch; } }, + {"like", F { return new FunctionLike; } }, + {"notLike", F { return new FunctionNotLike; } }, + {"extract", F { return new FunctionExtract; } }, + {"extractAll", F { return new FunctionExtractAll; } }, - else if (name == "position") return new FunctionPosition; - else if (name == "positionUTF8") return new FunctionPositionUTF8; - else if (name == "match") return new FunctionMatch; - else if (name == "like") return new FunctionLike; - else if (name == "notLike") return new FunctionNotLike; - else if (name == "extract") return new FunctionExtract; - else if (name == "extractAll") return new FunctionExtractAll; + {"halfMD5", F { return new FunctionHalfMD5; } }, + {"sipHash64", F { return new FunctionSipHash64; } }, + {"cityHash64", F { return new FunctionCityHash64; } }, + {"intHash32", F { return new FunctionIntHash32; } }, + {"intHash64", F { return new FunctionIntHash64; } }, - else if (name == "halfMD5") return new FunctionHalfMD5; - else if (name == "sipHash64") return new FunctionSipHash64; - else if (name == "cityHash64") return new FunctionCityHash64; - else if (name == "intHash32") return new FunctionIntHash32; - else if (name == "intHash64") return new FunctionIntHash64; + {"IPv4NumToString", F { return new FunctionIPv4NumToString; } }, + {"IPv4StringToNum", F { return new FunctionIPv4StringToNum; } }, + {"hex", F { return new FunctionHex; } }, + {"unhex", F { return new FunctionUnhex; } }, + {"bitmaskToList", F { return new FunctionBitmaskToList; } }, + {"bitmaskToArray", F { return new FunctionBitmaskToArray; } }, - else if (name == "IPv4NumToString") return new FunctionIPv4NumToString; - else if (name == "IPv4StringToNum") return new FunctionIPv4StringToNum; - else if (name == "hex") return new FunctionHex; - else if (name == "unhex") return new FunctionUnhex; - else if (name == "bitmaskToList") return new FunctionBitmaskToList; - else if (name == "bitmaskToArray") return new 
FunctionBitmaskToArray; + {"rand", F { return new FunctionRand; } }, + {"rand64", F { return new FunctionRand64; } }, - else if (name == "rand") return new FunctionRand; - else if (name == "rand64") return new FunctionRand64; + {"protocol", F { return new FunctionProtocol; } }, + {"domain", F { return new FunctionDomain; } }, + {"domainWithoutWWW", F { return new FunctionDomainWithoutWWW; } }, + {"topLevelDomain", F { return new FunctionTopLevelDomain; } }, + {"path", F { return new FunctionPath; } }, + {"queryString", F { return new FunctionQueryString; } }, + {"fragment", F { return new FunctionFragment; } }, + {"queryStringAndFragment", F { return new FunctionQueryStringAndFragment; } }, + {"extractURLParameter", F { return new FunctionExtractURLParameter; } }, + {"extractURLParameters", F { return new FunctionExtractURLParameters; } }, + {"extractURLParameterNames", F { return new FunctionExtractURLParameterNames; } }, + {"URLHierarchy", F { return new FunctionURLHierarchy; } }, + {"URLPathHierarchy", F { return new FunctionURLPathHierarchy; } }, + {"cutWWW", F { return new FunctionCutWWW; } }, + {"cutQueryString", F { return new FunctionCutQueryString; } }, + {"cutFragment", F { return new FunctionCutFragment; } }, + {"cutQueryStringAndFragment", F { return new FunctionCutQueryStringAndFragment; } }, + {"cutURLParameter", F { return new FunctionCutURLParameter; } }, - else if (name == "protocol") return new FunctionProtocol; - else if (name == "domain") return new FunctionDomain; - else if (name == "domainWithoutWWW") return new FunctionDomainWithoutWWW; - else if (name == "topLevelDomain") return new FunctionTopLevelDomain; - else if (name == "path") return new FunctionPath; - else if (name == "queryString") return new FunctionQueryString; - else if (name == "fragment") return new FunctionFragment; - else if (name == "queryStringAndFragment") return new FunctionQueryStringAndFragment; - else if (name == "extractURLParameter") return new 
FunctionExtractURLParameter; - else if (name == "extractURLParameters") return new FunctionExtractURLParameters; - else if (name == "extractURLParameterNames") return new FunctionExtractURLParameterNames; - else if (name == "URLHierarchy") return new FunctionURLHierarchy; - else if (name == "URLPathHierarchy") return new FunctionURLPathHierarchy; - else if (name == "cutWWW") return new FunctionCutWWW; - else if (name == "cutQueryString") return new FunctionCutQueryString; - else if (name == "cutFragment") return new FunctionCutFragment; - else if (name == "cutQueryStringAndFragment") return new FunctionCutQueryStringAndFragment; - else if (name == "cutURLParameter") return new FunctionCutURLParameter; + {"hostName", F { return new FunctionHostName; } }, + {"visibleWidth", F { return new FunctionVisibleWidth; } }, + {"bar", F { return new FunctionBar; } }, + {"toTypeName", F { return new FunctionToTypeName; } }, + {"blockSize", F { return new FunctionBlockSize; } }, + {"sleep", F { return new FunctionSleep; } }, + {"materialize", F { return new FunctionMaterialize; } }, + {"ignore", F { return new FunctionIgnore; } }, + {"arrayJoin", F { return new FunctionArrayJoin; } }, - else if (name == "hostName") return new FunctionHostName; - else if (name == "visibleWidth") return new FunctionVisibleWidth; - else if (name == "bar") return new FunctionBar; - else if (name == "toTypeName") return new FunctionToTypeName; - else if (name == "blockSize") return new FunctionBlockSize; - else if (name == "sleep") return new FunctionSleep; - else if (name == "materialize") return new FunctionMaterialize; - else if (name == "ignore") return new FunctionIgnore; - else if (name == "arrayJoin") return new FunctionArrayJoin; + {"tuple", F { return new FunctionTuple; } }, + {"tupleElement", F { return new FunctionTupleElement; } }, + {"in", F { return new FunctionIn(false, false); } }, + {"notIn", F { return new FunctionIn(true, false); } }, + {"globalIn", F { return new FunctionIn(false, 
true); } }, + {"globalNotIn", F { return new FunctionIn(true, true); } }, - else if (name == "tuple") return new FunctionTuple; - else if (name == "tupleElement") return new FunctionTupleElement; - else if (name == "in") return new FunctionIn(false, false); - else if (name == "notIn") return new FunctionIn(true, false); - else if (name == "globalIn") return new FunctionIn(false, true); - else if (name == "globalNotIn") return new FunctionIn(true, true); + {"array", F { return new FunctionArray; } }, + {"arrayElement", F { return new FunctionArrayElement; } }, + {"has", F { return new FunctionHas; } }, + {"indexOf", F { return new FunctionIndexOf; } }, + {"countEqual", F { return new FunctionCountEqual; } }, + {"arrayEnumerate", F { return new FunctionArrayEnumerate; } }, + {"arrayEnumerateUniq", F { return new FunctionArrayEnumerateUniq; } }, - else if (name == "array") return new FunctionArray; - else if (name == "arrayElement") return new FunctionArrayElement; - else if (name == "has") return new FunctionHas; - else if (name == "indexOf") return new FunctionIndexOf; - else if (name == "countEqual") return new FunctionCountEqual; - else if (name == "arrayEnumerate") return new FunctionArrayEnumerate; - else if (name == "arrayEnumerateUniq") return new FunctionArrayEnumerateUniq; + {"arrayMap", F { return new FunctionArrayMap; } }, + {"arrayFilter", F { return new FunctionArrayFilter; } }, + {"arrayCount", F { return new FunctionArrayCount; } }, + {"arrayExists", F { return new FunctionArrayExists; } }, + {"arrayAll", F { return new FunctionArrayAll; } }, + {"arraySum", F { return new FunctionArraySum; } }, - else if (name == "arrayMap") return new FunctionArrayMap; - else if (name == "arrayFilter") return new FunctionArrayFilter; - else if (name == "arrayCount") return new FunctionArrayCount; - else if (name == "arrayExists") return new FunctionArrayExists; - else if (name == "arrayAll") return new FunctionArrayAll; - else if (name == "arraySum") return new 
FunctionArraySum; + {"alphaTokens", F { return new FunctionAlphaTokens; } }, + {"splitByChar", F { return new FunctionSplitByChar; } }, + {"splitByString", F { return new FunctionSplitByString; } }, - else if (name == "alphaTokens") return new FunctionAlphaTokens; - else if (name == "splitByChar") return new FunctionSplitByChar; - else if (name == "splitByString") return new FunctionSplitByString; + {"if", F { return new FunctionIf; } }, - else if (name == "if") return new FunctionIf; + {"regionToCity", F { return new FunctionRegionToCity(context.getDictionaries().getRegionsHierarchies()); } }, + {"regionToArea", F { return new FunctionRegionToArea(context.getDictionaries().getRegionsHierarchies()); } }, + {"regionToCountry", F { return new FunctionRegionToCountry(context.getDictionaries().getRegionsHierarchies()); } }, + {"regionToContinent", F { return new FunctionRegionToContinent(context.getDictionaries().getRegionsHierarchies()); } }, + {"OSToRoot", F { return new FunctionOSToRoot(context.getDictionaries().getTechDataHierarchy()); } }, + {"SEToRoot", F { return new FunctionSEToRoot(context.getDictionaries().getTechDataHierarchy()); } }, + {"categoryToRoot", F { return new FunctionCategoryToRoot(context.getDictionaries().getCategoriesHierarchy()); } }, + {"categoryToSecondLevel", F { return new FunctionCategoryToSecondLevel(context.getDictionaries().getCategoriesHierarchy()); } }, + {"regionIn", F { return new FunctionRegionIn(context.getDictionaries().getRegionsHierarchies()); } }, + {"OSIn", F { return new FunctionOSIn(context.getDictionaries().getTechDataHierarchy()); } }, + {"SEIn", F { return new FunctionSEIn(context.getDictionaries().getTechDataHierarchy()); } }, + {"categoryIn", F { return new FunctionCategoryIn(context.getDictionaries().getCategoriesHierarchy()); } }, + {"regionHierarchy", F { return new FunctionRegionHierarchy(context.getDictionaries().getRegionsHierarchies()); } }, + {"OSHierarchy", F { return new 
FunctionOSHierarchy(context.getDictionaries().getTechDataHierarchy()); } }, + {"SEHierarchy", F { return new FunctionSEHierarchy(context.getDictionaries().getTechDataHierarchy()); } }, + {"categoryHierarchy", F { return new FunctionCategoryHierarchy(context.getDictionaries().getCategoriesHierarchy()); } }, + {"regionToName", F { return new FunctionRegionToName(context.getDictionaries().getRegionsNames()); } }, - else if (name == "regionToCity") return new FunctionRegionToCity(context.getDictionaries().getRegionsHierarchies()); - else if (name == "regionToArea") return new FunctionRegionToArea(context.getDictionaries().getRegionsHierarchies()); - else if (name == "regionToCountry") return new FunctionRegionToCountry(context.getDictionaries().getRegionsHierarchies()); - else if (name == "regionToContinent") return new FunctionRegionToContinent(context.getDictionaries().getRegionsHierarchies()); - else if (name == "OSToRoot") return new FunctionOSToRoot(context.getDictionaries().getTechDataHierarchy()); - else if (name == "SEToRoot") return new FunctionSEToRoot(context.getDictionaries().getTechDataHierarchy()); - else if (name == "categoryToRoot") return new FunctionCategoryToRoot(context.getDictionaries().getCategoriesHierarchy()); - else if (name == "categoryToSecondLevel") return new FunctionCategoryToSecondLevel(context.getDictionaries().getCategoriesHierarchy()); - else if (name == "regionIn") return new FunctionRegionIn(context.getDictionaries().getRegionsHierarchies()); - else if (name == "OSIn") return new FunctionOSIn(context.getDictionaries().getTechDataHierarchy()); - else if (name == "SEIn") return new FunctionSEIn(context.getDictionaries().getTechDataHierarchy()); - else if (name == "categoryIn") return new FunctionCategoryIn(context.getDictionaries().getCategoriesHierarchy()); - else if (name == "regionHierarchy") return new FunctionRegionHierarchy(context.getDictionaries().getRegionsHierarchies()); - else if (name == "OSHierarchy") return new 
FunctionOSHierarchy(context.getDictionaries().getTechDataHierarchy()); - else if (name == "SEHierarchy") return new FunctionSEHierarchy(context.getDictionaries().getTechDataHierarchy()); - else if (name == "categoryHierarchy") return new FunctionCategoryHierarchy(context.getDictionaries().getCategoriesHierarchy()); - else if (name == "regionToName") return new FunctionRegionToName(context.getDictionaries().getRegionsNames()); - - else if (name == "visitParamHas") return new FunctionVisitParamHas; - else if (name == "visitParamExtractUInt") return new FunctionVisitParamExtractUInt; - else if (name == "visitParamExtractInt") return new FunctionVisitParamExtractInt; - else if (name == "visitParamExtractFloat") return new FunctionVisitParamExtractFloat; - else if (name == "visitParamExtractBool") return new FunctionVisitParamExtractBool; - else if (name == "visitParamExtractRaw") return new FunctionVisitParamExtractRaw; - else if (name == "visitParamExtractString") return new FunctionVisitParamExtractString; + {"visitParamHas", F { return new FunctionVisitParamHas; } }, + {"visitParamExtractUInt", F { return new FunctionVisitParamExtractUInt; } }, + {"visitParamExtractInt", F { return new FunctionVisitParamExtractInt; } }, + {"visitParamExtractFloat", F { return new FunctionVisitParamExtractFloat; } }, + {"visitParamExtractBool", F { return new FunctionVisitParamExtractBool; } }, + {"visitParamExtractRaw", F { return new FunctionVisitParamExtractRaw; } }, + {"visitParamExtractString", F { return new FunctionVisitParamExtractString; } }, + }; + auto it = functions.find(name); + if (functions.end() != it) + return it->second(context); else throw Exception("Unknown function " + name, ErrorCodes::UNKNOWN_FUNCTION); } From 4a3d9082f727f21befd3e9cfc2dd269d6f1be1a5 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 18 Aug 2014 09:45:41 +0400 Subject: [PATCH 088/127] dbms: improved performance of aggregate functions min, max, any, anyLast [#METR-2944]. 
--- .../AggregateFunctions/AggregateFunctionAny.h | 97 ---- .../AggregateFunctionAnyLast.h | 83 --- .../AggregateFunctionsMinMax.h | 132 ----- .../AggregateFunctionsMinMaxAny.h | 476 ++++++++++++++++++ dbms/include/DB/Core/StringRef.h | 17 +- .../AggregateFunctionFactory.cpp | 43 +- 6 files changed, 522 insertions(+), 326 deletions(-) delete mode 100644 dbms/include/DB/AggregateFunctions/AggregateFunctionAny.h delete mode 100644 dbms/include/DB/AggregateFunctions/AggregateFunctionAnyLast.h delete mode 100644 dbms/include/DB/AggregateFunctions/AggregateFunctionsMinMax.h create mode 100644 dbms/include/DB/AggregateFunctions/AggregateFunctionsMinMaxAny.h diff --git a/dbms/include/DB/AggregateFunctions/AggregateFunctionAny.h b/dbms/include/DB/AggregateFunctions/AggregateFunctionAny.h deleted file mode 100644 index 894ba99d39c..00000000000 --- a/dbms/include/DB/AggregateFunctions/AggregateFunctionAny.h +++ /dev/null @@ -1,97 +0,0 @@ -#pragma once - -#include -#include - -#include - - -namespace DB -{ - - -struct AggregateFunctionAnyData -{ - Field value; -}; - - -/// Берёт первое попавшееся значение -class AggregateFunctionAny final : public IUnaryAggregateFunction -{ -private: - DataTypePtr type; - -public: - String getName() const { return "any"; } - - DataTypePtr getReturnType() const - { - return type; - } - - void setArgument(const DataTypePtr & argument) - { - type = argument; - } - - - void addOne(AggregateDataPtr place, const IColumn & column, size_t row_num) const - { - Data & d = data(place); - - if (!d.value.isNull()) - return; - column.get(row_num, d.value); - } - - void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs) const - { - Data & d = data(place); - - if (d.value.isNull()) - d.value = data(rhs).value; - } - - void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const - { - const Data & d = data(place); - - if (unlikely(d.value.isNull())) - { - writeBinary(false, buf); - } - else - { - writeBinary(true, buf); - 
type->serializeBinary(data(place).value, buf); - } - } - - void deserializeMerge(AggregateDataPtr place, ReadBuffer & buf) const - { - Data & d = data(place); - - bool is_not_null = false; - readBinary(is_not_null, buf); - - if (is_not_null) - { - Field tmp; - type->deserializeBinary(tmp, buf); - - if (d.value.isNull()) - d.value = tmp; - } - } - - void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const - { - if (unlikely(data(place).value.isNull())) - to.insertDefault(); - else - to.insert(data(place).value); - } -}; - -} diff --git a/dbms/include/DB/AggregateFunctions/AggregateFunctionAnyLast.h b/dbms/include/DB/AggregateFunctions/AggregateFunctionAnyLast.h deleted file mode 100644 index a0fecbc08e9..00000000000 --- a/dbms/include/DB/AggregateFunctions/AggregateFunctionAnyLast.h +++ /dev/null @@ -1,83 +0,0 @@ -#pragma once - -#include -#include - -#include - - -namespace DB -{ - - -struct AggregateFunctionAnyLastData -{ - Field value; -}; - - -/// Берёт последнее попавшееся значение -class AggregateFunctionAnyLast final : public IUnaryAggregateFunction -{ -private: - DataTypePtr type; - -public: - String getName() const { return "anyLast"; } - - DataTypePtr getReturnType() const - { - return type; - } - - void setArgument(const DataTypePtr & argument) - { - type = argument; - } - - - void addOne(AggregateDataPtr place, const IColumn & column, size_t row_num) const - { - column.get(row_num, data(place).value); - } - - void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs) const - { - if (!data(rhs).value.isNull()) - data(place).value = data(rhs).value; - } - - void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const - { - const Data & d = data(place); - - if (unlikely(d.value.isNull())) - { - writeBinary(false, buf); - } - else - { - writeBinary(true, buf); - type->serializeBinary(data(place).value, buf); - } - } - - void deserializeMerge(AggregateDataPtr place, ReadBuffer & buf) const - { - bool is_not_null = false; - 
readBinary(is_not_null, buf); - - if (is_not_null) - type->deserializeBinary(data(place).value, buf); - } - - void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const - { - if (unlikely(data(place).value.isNull())) - to.insertDefault(); - else - to.insert(data(place).value); - } -}; - -} diff --git a/dbms/include/DB/AggregateFunctions/AggregateFunctionsMinMax.h b/dbms/include/DB/AggregateFunctions/AggregateFunctionsMinMax.h deleted file mode 100644 index c9e08b8580d..00000000000 --- a/dbms/include/DB/AggregateFunctions/AggregateFunctionsMinMax.h +++ /dev/null @@ -1,132 +0,0 @@ -#pragma once - -#include -#include - -#include - - -namespace DB -{ - - -struct AggregateFunctionMinTraits -{ - static bool better(const Field & lhs, const Field & rhs) { return lhs < rhs; } - static String name() { return "min"; } -}; - -struct AggregateFunctionMaxTraits -{ - static bool better(const Field & lhs, const Field & rhs) { return lhs > rhs; } - static String name() { return "max"; } -}; - - -struct AggregateFunctionsMinMaxData -{ - Field value; -}; - - -/// Берёт минимальное (или максимальное) значение. Если таких много - то первое попавшееся из них. 
-template -class AggregateFunctionsMinMax final : public IUnaryAggregateFunction > -{ -private: - typedef typename IAggregateFunctionHelper::Data Data; - DataTypePtr type; - -public: - String getName() const { return Traits::name(); } - - DataTypePtr getReturnType() const - { - return type; - } - - void setArgument(const DataTypePtr & argument) - { - type = argument; - } - - - void addOne(AggregateDataPtr place, const IColumn & column, size_t row_num) const - { - Field value; - column.get(row_num, value); - Data & d = this->data(place); - - if (!d.value.isNull()) - { - if (Traits::better(value, d.value)) - d.value = value; - } - else - d.value = value; - } - - void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs) const - { - Data & d = this->data(place); - const Data & d_rhs = this->data(rhs); - - if (!d.value.isNull()) - { - if (Traits::better(d_rhs.value, d.value)) - d.value = d_rhs.value; - } - else - d.value = d_rhs.value; - } - - void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const - { - const Data & d = this->data(place); - - if (unlikely(d.value.isNull())) - { - writeBinary(false, buf); - } - else - { - writeBinary(true, buf); - type->serializeBinary(this->data(place).value, buf); - } - } - - void deserializeMerge(AggregateDataPtr place, ReadBuffer & buf) const - { - Data & d = this->data(place); - - bool is_not_null = false; - readBinary(is_not_null, buf); - - if (is_not_null) - { - if (!d.value.isNull()) - { - Field value_; - type->deserializeBinary(value_, buf); - if (Traits::better(value_, d.value)) - d.value = value_; - } - else - type->deserializeBinary(d.value, buf); - } - } - - void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const - { - if (unlikely(this->data(place).value.isNull())) - to.insertDefault(); - else - to.insert(this->data(place).value); - } -}; - - -typedef AggregateFunctionsMinMax AggregateFunctionMin; -typedef AggregateFunctionsMinMax AggregateFunctionMax; - -} diff --git 
a/dbms/include/DB/AggregateFunctions/AggregateFunctionsMinMaxAny.h b/dbms/include/DB/AggregateFunctions/AggregateFunctionsMinMaxAny.h new file mode 100644 index 00000000000..38f0b891509 --- /dev/null +++ b/dbms/include/DB/AggregateFunctions/AggregateFunctionsMinMaxAny.h @@ -0,0 +1,476 @@ +#pragma once + +#include +#include + +#include +#include + +#include + + +namespace DB +{ + +/** Агрегатные функции, запоминающие одно какое-либо переданное значение. + * Например, min, max, any, anyLast. + */ + + +/// Для числовых значений. +template +struct SingleValueDataFixed +{ + typedef SingleValueDataFixed Self; + + bool has_value = false; /// Надо запомнить, было ли передано хотя бы одно значение. Это нужно для AggregateFunctionIf. + T value; + + + bool has() const + { + return has_value; + } + + void insertResultInto(IColumn & to) const + { + if (has()) + static_cast &>(to).getData().push_back(value); + else + static_cast &>(to).insertDefault(); + } + + void write(WriteBuffer & buf, const IDataType & data_type) const + { + writeBinary(has(), buf); + if (has()) + writeBinary(value, buf); + } + + void read(ReadBuffer & buf, const IDataType & data_type) + { + readBinary(has_value, buf); + if (has()) + readBinary(value, buf); + } + + + void change(const IColumn & column, size_t row_num) + { + has_value = true; + value = static_cast &>(column).getData()[row_num]; + } + + void change(const Self & to) + { + has_value = true; + value = to.value; + } + + void changeFirstTime(const IColumn & column, size_t row_num) + { + if (!has()) + change(column, row_num); + } + + void changeFirstTime(const Self & to) + { + if (!has()) + change(to); + } + + void changeIfLess(const IColumn & column, size_t row_num) + { + if (!has() || static_cast &>(column).getData()[row_num] < value) + change(column, row_num); + } + + void changeIfLess(const Self & to) + { + if (!has() || to.value < value) + change(to); + } + + void changeIfGreater(const IColumn & column, size_t row_num) + { + if (!has() || 
static_cast &>(column).getData()[row_num] > value) + change(column, row_num); + } + + void changeIfGreater(const Self & to) + { + if (!has() || to.value > value) + change(to); + } +}; + + +/** Для строк. Короткие строки хранятся в самой структуре, а длинные выделяются отдельно. + * NOTE Могло бы подойти также для массивов чисел. + */ +struct __attribute__((__packed__)) SingleValueDataString +{ + typedef SingleValueDataString Self; + + Int32 size = -1; /// -1 обозначает, что значения нет. + + static constexpr Int32 AUTOMATIC_STORAGE_SIZE = 64; + static constexpr Int32 MAX_SMALL_STRING_SIZE = AUTOMATIC_STORAGE_SIZE - sizeof(size); + + union + { + char small_data[MAX_SMALL_STRING_SIZE]; /// Включая завершающий ноль. + char * large_data; + }; + + ~SingleValueDataString() + { + if (size > MAX_SMALL_STRING_SIZE) + free(large_data); + } + + bool has() const + { + return size >= 0; + } + + const char * getData() const + { + return size <= MAX_SMALL_STRING_SIZE ? small_data : large_data; + } + + StringRef getStringRef() const + { + return StringRef(getData(), size); + } + + void insertResultInto(IColumn & to) const + { + if (has()) + static_cast(to).insertDataWithTerminatingZero(getData(), size); + else + static_cast(to).insertDefault(); + } + + void write(WriteBuffer & buf, const IDataType & data_type) const + { + writeBinary(size, buf); + if (has()) + buf.write(getData(), size); + } + + void read(ReadBuffer & buf, const IDataType & data_type) + { + Int32 rhs_size; + readBinary(rhs_size, buf); + + if (rhs_size >= 0) + { + if (rhs_size <= MAX_SMALL_STRING_SIZE) + { + if (size > MAX_SMALL_STRING_SIZE) + free(large_data); + + size = rhs_size; + buf.read(small_data, size); + } + else + { + if (size < rhs_size) + { + if (size > MAX_SMALL_STRING_SIZE) + free(large_data); + + large_data = reinterpret_cast(malloc(rhs_size)); + } + + size = rhs_size; + buf.read(large_data, size); + } + } + else + { + if (size > MAX_SMALL_STRING_SIZE) + free(large_data); + size = rhs_size; + } + } + 
+ + void changeImpl(StringRef value) + { + Int32 value_size = value.size; + + if (value_size <= MAX_SMALL_STRING_SIZE) + { + if (size > MAX_SMALL_STRING_SIZE) + free(large_data); + + size = value_size; + memcpy(small_data, value.data, size); + } + else + { + if (size < value_size) + { + if (size > MAX_SMALL_STRING_SIZE) + free(large_data); + + large_data = reinterpret_cast(malloc(value.size)); + } + + size = value_size; + memcpy(large_data, value.data, size); + } + } + + void change(const IColumn & column, size_t row_num) + { + changeImpl(static_cast(column).getDataAtWithTerminatingZero(row_num)); + } + + void change(const Self & to) + { + changeImpl(to.getStringRef()); + } + + void changeFirstTime(const IColumn & column, size_t row_num) + { + if (!has()) + change(column, row_num); + } + + void changeFirstTime(const Self & to) + { + if (!has()) + change(to); + } + + void changeIfLess(const IColumn & column, size_t row_num) + { + if (!has() || static_cast(column).getDataAtWithTerminatingZero(row_num) < getStringRef()) + change(column, row_num); + } + + void changeIfLess(const Self & to) + { + if (!has() || to.getStringRef() < getStringRef()) + change(to); + } + + void changeIfGreater(const IColumn & column, size_t row_num) + { + if (!has() || static_cast(column).getDataAtWithTerminatingZero(row_num) > getStringRef()) + change(column, row_num); + } + + void changeIfGreater(const Self & to) + { + if (!has() || to.getStringRef() > getStringRef()) + change(to); + } +}; + + +/// Для любых других типов значений. 
+struct SingleValueDataGeneric +{ + typedef SingleValueDataGeneric Self; + + Field value; + + bool has() const + { + return !value.isNull(); + } + + void insertResultInto(IColumn & to) const + { + if (has()) + to.insert(value); + else + to.insertDefault(); + } + + void write(WriteBuffer & buf, const IDataType & data_type) const + { + if (!value.isNull()) + { + writeBinary(true, buf); + data_type.serializeBinary(value, buf); + } + else + writeBinary(false, buf); + } + + void read(ReadBuffer & buf, const IDataType & data_type) + { + bool is_not_null; + readBinary(is_not_null, buf); + + if (is_not_null) + data_type.deserializeBinary(value, buf); + } + + void change(const IColumn & column, size_t row_num) + { + column.get(row_num, value); + } + + void change(const Self & to) + { + value = to.value; + } + + void changeFirstTime(const IColumn & column, size_t row_num) + { + if (!has()) + change(column, row_num); + } + + void changeFirstTime(const Self & to) + { + if (!has()) + change(to); + } + + void changeIfLess(const IColumn & column, size_t row_num) + { + if (!has()) + change(column, row_num); + else + { + Field new_value; + column.get(row_num, new_value); + if (new_value < value) + value = new_value; + } + } + + void changeIfLess(const Self & to) + { + if (!has() || to.value < value) + change(to); + } + + void changeIfGreater(const IColumn & column, size_t row_num) + { + if (!has()) + change(column, row_num); + else + { + Field new_value; + column.get(row_num, new_value); + if (new_value > value) + value = new_value; + } + } + + void changeIfGreater(const Self & to) + { + if (!has() || to.value > value) + change(to); + } +}; + + +/** То, чем отличаются друг от другая агрегатные функции min, max, any, anyLast + * (условием, при котором сохранённое значение заменяется на новое, + * а также, конечно, именем). 
+ */ + +template +struct AggregateFunctionMinData : Data +{ + typedef AggregateFunctionMinData Self; + + void changeIfBetter(const IColumn & column, size_t row_num) { this->changeIfLess(column, row_num); } + void changeIfBetter(const Self & to) { this->changeIfLess(to); } + + static const char * name() { return "min"; } +}; + +template +struct AggregateFunctionMaxData : Data +{ + typedef AggregateFunctionMaxData Self; + + void changeIfBetter(const IColumn & column, size_t row_num) { this->changeIfGreater(column, row_num); } + void changeIfBetter(const Self & to) { this->changeIfGreater(to); } + + static const char * name() { return "max"; } +}; + +template +struct AggregateFunctionAnyData : Data +{ + typedef AggregateFunctionAnyData Self; + + void changeIfBetter(const IColumn & column, size_t row_num) { this->changeFirstTime(column, row_num); } + void changeIfBetter(const Self & to) { this->changeFirstTime(to); } + + static const char * name() { return "any"; } +}; + +template +struct AggregateFunctionAnyLastData : Data +{ + typedef AggregateFunctionAnyLastData Self; + + void changeIfBetter(const IColumn & column, size_t row_num) { this->change(column, row_num); } + void changeIfBetter(const Self & to) { this->change(to); } + + static const char * name() { return "anyLast"; } +}; + + +template +class AggregateFunctionsSingleValue final : public IUnaryAggregateFunction > +{ +private: + DataTypePtr type; + +public: + String getName() const { return Data::name(); } + + DataTypePtr getReturnType() const + { + return type; + } + + void setArgument(const DataTypePtr & argument) + { + type = argument; + } + + + void addOne(AggregateDataPtr place, const IColumn & column, size_t row_num) const + { + this->data(place).changeIfBetter(column, row_num); + } + + void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs) const + { + this->data(place).changeIfBetter(this->data(rhs)); + } + + void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const + { + 
this->data(place).write(buf, *type.get()); + } + + void deserializeMerge(AggregateDataPtr place, ReadBuffer & buf) const + { + Data rhs; /// Для строчек не очень оптимально, так как может делаться одна лишняя аллокация. + rhs.read(buf, *type.get()); + + this->data(place).changeIfBetter(rhs); + } + + void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const + { + this->data(place).insertResultInto(to); + } +}; + +} diff --git a/dbms/include/DB/Core/StringRef.h b/dbms/include/DB/Core/StringRef.h index 0f0e21eb481..76cfc676805 100644 --- a/dbms/include/DB/Core/StringRef.h +++ b/dbms/include/DB/Core/StringRef.h @@ -26,7 +26,7 @@ struct StringRef typedef std::vector StringRefs; -inline bool operator==(StringRef lhs, StringRef rhs) +inline bool operator== (StringRef lhs, StringRef rhs) { /// Так почему-то быстрее, чем return lhs.size == rhs.size && 0 == memcmp(lhs.data, rhs.data, lhs.size); @@ -40,18 +40,21 @@ inline bool operator==(StringRef lhs, StringRef rhs) return true; } -inline bool operator!=(StringRef lhs, StringRef rhs) +inline bool operator!= (StringRef lhs, StringRef rhs) { return !(lhs == rhs); } -inline bool operator<(StringRef lhs, StringRef rhs) +inline bool operator< (StringRef lhs, StringRef rhs) { int cmp = memcmp(lhs.data, rhs.data, std::min(lhs.size, rhs.size)); - if (cmp == 0) - return lhs.size < rhs.size; - else - return cmp < 0; + return cmp < 0 || (cmp == 0 && lhs.size < rhs.size); +} + +inline bool operator> (StringRef lhs, StringRef rhs) +{ + int cmp = memcmp(lhs.data, rhs.data, std::min(lhs.size, rhs.size)); + return cmp > 0 || (cmp == 0 && lhs.size > rhs.size); } diff --git a/dbms/src/AggregateFunctions/AggregateFunctionFactory.cpp b/dbms/src/AggregateFunctions/AggregateFunctionFactory.cpp index e93f9fb9304..2229437e4a7 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionFactory.cpp +++ b/dbms/src/AggregateFunctions/AggregateFunctionFactory.cpp @@ -1,9 +1,7 @@ #include #include #include -#include -#include -#include 
+#include #include #include #include @@ -69,6 +67,7 @@ static IAggregateFunction * createWithNumericType(const IDataType & argument_typ return nullptr; } + template