From 872d0a0fbe8684c1bc12d082822c487771bc3766 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 1 Feb 2022 02:07:48 +0300 Subject: [PATCH] Improve performance of format Regexp --- .../Formats/Impl/RegexpRowInputFormat.cpp | 2 +- .../Formats/Impl/RegexpRowInputFormat.h | 15 +- utils/CMakeLists.txt | 1 - .../CMakeLists.txt | 2 - .../convert-month-partitioned-parts/main.cpp | 148 ------------------ 5 files changed, 9 insertions(+), 159 deletions(-) delete mode 100644 utils/convert-month-partitioned-parts/CMakeLists.txt delete mode 100644 utils/convert-month-partitioned-parts/main.cpp diff --git a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp index 90db6f6f0ec..d793d2e0491 100644 --- a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp @@ -45,7 +45,7 @@ bool RegexpFieldExtractor::parseRow(PeekableReadBuffer & buf) buf.makeContinuousMemoryFromCheckpointToPos(); buf.rollbackToCheckpoint(); - bool match = RE2::FullMatchN(re2::StringPiece(buf.position(), line_size), regexp, re2_arguments_ptrs.data(), re2_arguments_ptrs.size()); + bool match = re2_st::RE2::FullMatchN(re2_st::StringPiece(buf.position(), line_size), regexp, re2_arguments_ptrs.data(), re2_arguments_ptrs.size()); if (!match && !skip_unmatched) throw Exception("Line \"" + std::string(buf.position(), line_size) + "\" doesn't match the regexp.", ErrorCodes::INCORRECT_DATA); diff --git a/src/Processors/Formats/Impl/RegexpRowInputFormat.h b/src/Processors/Formats/Impl/RegexpRowInputFormat.h index dffd2f82e02..e70595b4bb7 100644 --- a/src/Processors/Formats/Impl/RegexpRowInputFormat.h +++ b/src/Processors/Formats/Impl/RegexpRowInputFormat.h @@ -1,7 +1,7 @@ #pragma once -#include -#include +#include +#include #include #include #include @@ -12,6 +12,7 @@ #include #include + namespace DB { @@ -26,17 +27,17 @@ public: /// Return true if row was successfully parsed and row fields were extracted. bool parseRow(PeekableReadBuffer & buf); - re2::StringPiece getField(size_t index) { return matched_fields[index]; } + re2_st::StringPiece getField(size_t index) { return matched_fields[index]; } size_t getMatchedFieldsSize() const { return matched_fields.size(); } size_t getNumberOfGroups() const { return regexp.NumberOfCapturingGroups(); } private: - const RE2 regexp; + const re2_st::RE2 regexp; // The vector of fields extracted from line using regexp. - std::vector matched_fields; + std::vector matched_fields; // These two vectors are needed to use RE2::FullMatchN (function for extracting fields). - std::vector re2_arguments; - std::vector re2_arguments_ptrs; + std::vector re2_arguments; + std::vector re2_arguments_ptrs; bool skip_unmatched; }; diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt index 7822f47ff88..51300472ed1 100644 --- a/utils/CMakeLists.txt +++ b/utils/CMakeLists.txt @@ -26,7 +26,6 @@ if (NOT DEFINED ENABLE_UTILS OR ENABLE_UTILS) add_subdirectory (zookeeper-adjust-block-numbers-to-parts) add_subdirectory (wikistat-loader) add_subdirectory (check-marks) - add_subdirectory (convert-month-partitioned-parts) add_subdirectory (checksum-for-compressed-block) add_subdirectory (db-generator) add_subdirectory (wal-dump) diff --git a/utils/convert-month-partitioned-parts/CMakeLists.txt b/utils/convert-month-partitioned-parts/CMakeLists.txt deleted file mode 100644 index ea6429a0610..00000000000 --- a/utils/convert-month-partitioned-parts/CMakeLists.txt +++ /dev/null @@ -1,2 +0,0 @@ -add_executable (convert-month-partitioned-parts main.cpp) -target_link_libraries(convert-month-partitioned-parts PRIVATE clickhouse_aggregate_functions dbms clickhouse_parsers boost::program_options) diff --git a/utils/convert-month-partitioned-parts/main.cpp b/utils/convert-month-partitioned-parts/main.cpp deleted file mode 100644 index a6829d79726..00000000000 --- a/utils/convert-month-partitioned-parts/main.cpp +++ /dev/null @@ -1,148 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int DIRECTORY_ALREADY_EXISTS; - extern const int BAD_DATA_PART_NAME; - extern const int NO_FILE_IN_DATA_PART; -} - -void run(String part_path, String date_column, String dest_path) -{ - std::shared_ptr disk = std::make_shared("local", "/", 0); - auto old_part_path = Poco::Path::forDirectory(part_path); - const String & old_part_name = old_part_path.directory(old_part_path.depth() - 1); - String old_part_path_str = old_part_path.toString(); - - auto part_info = MergeTreePartInfo::fromPartName(old_part_name, MergeTreeDataFormatVersion(0)); - String new_part_name = part_info.getPartName(); - - auto new_part_path = Poco::Path::forDirectory(dest_path); - new_part_path.pushDirectory(new_part_name); - if (Poco::File(new_part_path).exists()) - throw Exception("Destination part directory `" + new_part_path.toString() + "` already exists", - ErrorCodes::DIRECTORY_ALREADY_EXISTS); - - DayNum min_date; - DayNum max_date; - MergeTreePartInfo::parseMinMaxDatesFromPartName(old_part_name, min_date, max_date); - - const auto & time_zone = DateLUT::instance(); - UInt32 yyyymm = time_zone.toNumYYYYMM(min_date); - if (yyyymm != time_zone.toNumYYYYMM(max_date)) - throw Exception("Part " + old_part_name + " spans different months", - ErrorCodes::BAD_DATA_PART_NAME); - - ReadBufferFromFile checksums_in(old_part_path_str + "checksums.txt", 4096); - MergeTreeDataPartChecksums checksums; - checksums.read(checksums_in); - - auto date_col_checksum_it = checksums.files.find(date_column + ".bin"); - if (date_col_checksum_it == checksums.files.end()) - throw Exception("Couldn't find checksum for the date column .bin file `" + date_column + ".bin`", - ErrorCodes::NO_FILE_IN_DATA_PART); - - UInt64 rows = date_col_checksum_it->second.uncompressed_size / DataTypeDate().getSizeOfValueInMemory(); - - auto new_tmp_part_path = Poco::Path::forDirectory(dest_path); - new_tmp_part_path.pushDirectory("tmp_convert_" + new_part_name); - String new_tmp_part_path_str = new_tmp_part_path.toString(); - try - { - Poco::File(new_tmp_part_path).remove(/* recursive = */ true); - } - catch (const Poco::FileNotFoundException &) - { - /// If the file is already deleted, do nothing. - } - localBackup(disk, old_part_path.toString(), new_tmp_part_path.toString(), {}); - - WriteBufferFromFile count_out(new_tmp_part_path_str + "count.txt", 4096); - HashingWriteBuffer count_out_hashing(count_out); - writeIntText(rows, count_out_hashing); - count_out_hashing.next(); - checksums.files["count.txt"].file_size = count_out_hashing.count(); - checksums.files["count.txt"].file_hash = count_out_hashing.getHash(); - - IMergeTreeDataPart::MinMaxIndex minmax_idx(min_date, max_date); - Names minmax_idx_columns = {date_column}; - DataTypes minmax_idx_column_types = {std::make_shared()}; - minmax_idx.store(minmax_idx_columns, minmax_idx_column_types, disk, new_tmp_part_path_str, checksums); - - Block partition_key_sample{{nullptr, std::make_shared(), makeASTFunction("toYYYYMM", std::make_shared(date_column))->getColumnName()}}; - - MergeTreePartition partition(yyyymm); - partition.store(partition_key_sample, disk, new_tmp_part_path_str, checksums); - String partition_id = partition.getID(partition_key_sample); - - Poco::File(new_tmp_part_path_str + "checksums.txt").setWriteable(); - WriteBufferFromFile checksums_out(new_tmp_part_path_str + "checksums.txt", 4096); - checksums.write(checksums_out); - checksums_in.close(); - checksums_out.close(); - - Poco::File(new_tmp_part_path).renameTo(new_part_path.toString()); -} - -} - -int main(int argc, char ** argv) -try -{ - boost::program_options::options_description desc("Allowed options"); - desc.add_options() - ("help,h", "produce help message") - ("part", boost::program_options::value()->required(), - "part directory to convert") - ("date-column", boost::program_options::value()->required(), - "name of the date column") - ("to", boost::program_options::value()->required(), - "destination directory") - ; - - boost::program_options::variables_map options; - boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), options); - - if (options.count("help") || options.size() < 3) - { - std::cout - << "Convert a MergeTree part from the old-style month-partitioned table " - << "(e.g. 20140317_20140323_2_2_0) to the format suitable for ATTACH'ing to a custom-partitioned " - << "table (201403_2_2_0)." << std::endl << std::endl; - std::cout << desc << std::endl; - return 1; - } - - auto part_path = options.at("part").as(); - auto date_column = options.at("date-column").as(); - auto dest_path = options.at("to").as(); - - DB::run(part_path, date_column, dest_path); - - return 0; -} -catch (...) -{ - std::cerr << DB::getCurrentExceptionMessage(true) << '\n'; - throw; -}