From 23b9677879a2a0618b35032439650ec08e760c57 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 4 Sep 2020 08:46:58 +0300 Subject: [PATCH] Added a script to import git repository to ClickHouse --- src/Common/ShellCommand.cpp | 4 + src/IO/ReadBufferFromFile.cpp | 3 + src/IO/WriteBufferFromFile.cpp | 3 + utils/CMakeLists.txt | 1 + utils/git-to-clickhouse/CMakeLists.txt | 2 + utils/git-to-clickhouse/git-to-clickhouse.cpp | 638 ++++++++++++++++++ 6 files changed, 651 insertions(+) create mode 100644 utils/git-to-clickhouse/CMakeLists.txt create mode 100644 utils/git-to-clickhouse/git-to-clickhouse.cpp diff --git a/src/Common/ShellCommand.cpp b/src/Common/ShellCommand.cpp index 53ab2301a0a..127f95fef06 100644 --- a/src/Common/ShellCommand.cpp +++ b/src/Common/ShellCommand.cpp @@ -186,6 +186,10 @@ int ShellCommand::tryWait() { wait_called = true; + in.close(); + out.close(); + err.close(); + LOG_TRACE(getLogger(), "Will wait for shell command pid {}", pid); int status = 0; diff --git a/src/IO/ReadBufferFromFile.cpp b/src/IO/ReadBufferFromFile.cpp index 40f69625e68..226615c757e 100644 --- a/src/IO/ReadBufferFromFile.cpp +++ b/src/IO/ReadBufferFromFile.cpp @@ -77,6 +77,9 @@ ReadBufferFromFile::~ReadBufferFromFile() void ReadBufferFromFile::close() { + if (fd < 0) + return; + if (0 != ::close(fd)) throw Exception("Cannot close file", ErrorCodes::CANNOT_CLOSE_FILE); diff --git a/src/IO/WriteBufferFromFile.cpp b/src/IO/WriteBufferFromFile.cpp index b59a110edb4..4ade2e2c971 100644 --- a/src/IO/WriteBufferFromFile.cpp +++ b/src/IO/WriteBufferFromFile.cpp @@ -92,6 +92,9 @@ WriteBufferFromFile::~WriteBufferFromFile() /// Close file before destruction of object. void WriteBufferFromFile::close() { + if (fd < 0) + return; + next(); if (0 != ::close(fd)) diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt index 0dd95388e7d..dd03afe9fb8 100644 --- a/utils/CMakeLists.txt +++ b/utils/CMakeLists.txt @@ -29,6 +29,7 @@ if (NOT DEFINED ENABLE_UTILS OR ENABLE_UTILS) add_subdirectory (convert-month-partitioned-parts) add_subdirectory (checksum-for-compressed-block) add_subdirectory (wal-dump) + add_subdirectory (git-to-clickhouse) endif () if (ENABLE_CODE_QUALITY) diff --git a/utils/git-to-clickhouse/CMakeLists.txt b/utils/git-to-clickhouse/CMakeLists.txt new file mode 100644 index 00000000000..0e46b68d471 --- /dev/null +++ b/utils/git-to-clickhouse/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable (git-to-clickhouse git-to-clickhouse.cpp) +target_link_libraries(git-to-clickhouse PRIVATE dbms boost::program_options) diff --git a/utils/git-to-clickhouse/git-to-clickhouse.cpp b/utils/git-to-clickhouse/git-to-clickhouse.cpp new file mode 100644 index 00000000000..42920328ad7 --- /dev/null +++ b/utils/git-to-clickhouse/git-to-clickhouse.cpp @@ -0,0 +1,638 @@ +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int INCORRECT_DATA; +} + +enum class LineType +{ + Empty, + Comment, + Punct, + Code, +}; + +void writeText(LineType type, WriteBuffer & out) +{ + switch (type) + { + case LineType::Empty: writeString("Empty", out); break; + case LineType::Comment: writeString("Comment", out); break; + case LineType::Punct: writeString("Punct", out); break; + case LineType::Code: writeString("Code", out); break; + } +} + +struct LineChange +{ + int8_t sign{}; /// 1 if added, -1 if deleted + uint16_t line_number_old{}; + uint16_t line_number_new{}; + uint16_t hunk_num{}; /// ordinal number of hunk in diff, starting with 0 + uint16_t hunk_start_line_number_old{}; + uint16_t hunk_start_line_number_new{}; + std::string hunk_context; /// The context (like a line with function name) as it is calculated by git + std::string line; /// Line content without leading whitespaces + uint8_t indent{}; /// The number of leading whitespaces or tabs * 4 + LineType line_type{}; + + void setLineInfo(std::string full_line) + { + indent = 0; + + const char * pos = full_line.data(); + const char * end = pos + full_line.size(); + + while (pos < end) + { + if (*pos == ' ') + ++indent; + else if (*pos == '\t') + indent += 4; + else + break; + ++pos; + } + + line.assign(pos, end); + + if (pos == end) + { + line_type = LineType::Empty; + } + else if (pos + 1 < end + && ((pos[0] == '/' && pos[1] == '/') + || (pos[0] == '*' && pos[1] == ' '))) /// This is not precise. + { + line_type = LineType::Comment; + } + else + { + while (pos < end) + { + if (isAlphaNumericASCII(*pos)) + { + line_type = LineType::Code; + break; + } + ++pos; + } + if (pos == end) + line_type = LineType::Punct; + } + } + + void writeTextWithoutNewline(WriteBuffer & out) const + { + writeText(sign, out); + writeChar('\t', out); + writeText(line_number_old, out); + writeChar('\t', out); + writeText(line_number_new, out); + writeChar('\t', out); + writeText(hunk_num, out); + writeChar('\t', out); + writeText(hunk_start_line_number_old, out); + writeChar('\t', out); + writeText(hunk_start_line_number_new, out); + writeChar('\t', out); + writeText(hunk_context, out); + writeChar('\t', out); + writeText(line, out); + writeChar('\t', out); + writeText(indent, out); + writeChar('\t', out); + writeText(line_type, out); + } +}; + +using LineChanges = std::vector; + +enum class FileChangeType +{ + Add, + Delete, + Modify, + Rename, + Copy, + Type, +}; + +void writeText(FileChangeType type, WriteBuffer & out) +{ + switch (type) + { + case FileChangeType::Add: writeString("Add", out); break; + case FileChangeType::Delete: writeString("Delete", out); break; + case FileChangeType::Modify: writeString("Modify", out); break; + case FileChangeType::Rename: writeString("Rename", out); break; + case FileChangeType::Copy: writeString("Copy", out); break; + case FileChangeType::Type: writeString("Type", out); break; + } +} + +struct FileChange +{ + FileChangeType change_type{}; + std::string new_file_path; + std::string old_file_path; + uint16_t lines_added{}; + uint16_t lines_deleted{}; + uint16_t hunks_added{}; + uint16_t hunks_removed{}; + uint16_t hunks_changed{}; + + void writeTextWithoutNewline(WriteBuffer & out) const + { + writeText(change_type, out); + writeChar('\t', out); + writeText(new_file_path, out); + writeChar('\t', out); + writeText(old_file_path, out); + writeChar('\t', out); + writeText(lines_added, out); + writeChar('\t', out); + writeText(lines_deleted, out); + writeChar('\t', out); + writeText(hunks_added, out); + writeChar('\t', out); + writeText(hunks_removed, out); + writeChar('\t', out); + writeText(hunks_changed, out); + } +}; + +struct FileChangeAndLineChanges +{ + FileChange file_change; + LineChanges line_changes; +}; + +struct Commit +{ + std::string hash; + std::string author_name; + std::string author_email; + time_t time{}; + std::string message; + uint32_t files_added{}; + uint32_t files_deleted{}; + uint32_t files_renamed{}; + uint32_t files_modified{}; + uint32_t lines_added{}; + uint32_t lines_deleted{}; + uint32_t hunks_added{}; + uint32_t hunks_removed{}; + uint32_t hunks_changed{}; + + void writeTextWithoutNewline(WriteBuffer & out) const + { + writeText(hash, out); + writeChar('\t', out); + writeText(author_name, out); + writeChar('\t', out); + writeText(author_email, out); + writeChar('\t', out); + writeText(time, out); + writeChar('\t', out); + writeText(message, out); + writeChar('\t', out); + writeText(files_added, out); + writeChar('\t', out); + writeText(files_deleted, out); + writeChar('\t', out); + writeText(files_renamed, out); + writeChar('\t', out); + writeText(files_modified, out); + writeChar('\t', out); + writeText(lines_added, out); + writeChar('\t', out); + writeText(lines_deleted, out); + writeChar('\t', out); + writeText(hunks_added, out); + writeChar('\t', out); + writeText(hunks_removed, out); + writeChar('\t', out); + writeText(hunks_changed, out); + } +}; + + +void skipUntilWhitespace(ReadBuffer & buf) +{ + while (!buf.eof()) + { + char * next_pos = find_first_symbols<'\t', '\n', ' '>(buf.position(), buf.buffer().end()); + buf.position() = next_pos; + + if (!buf.hasPendingData()) + continue; + + if (*buf.position() == '\t' || *buf.position() == '\n' || *buf.position() == ' ') + return; + } +} + +void skipUntilNextLine(ReadBuffer & buf) +{ + while (!buf.eof()) + { + char * next_pos = find_first_symbols<'\n'>(buf.position(), buf.buffer().end()); + buf.position() = next_pos; + + if (!buf.hasPendingData()) + continue; + + if (*buf.position() == '\n') + { + ++buf.position(); + return; + } + } +} + +void readStringUntilNextLine(std::string & s, ReadBuffer & buf) +{ + s.clear(); + while (!buf.eof()) + { + char * next_pos = find_first_symbols<'\n'>(buf.position(), buf.buffer().end()); + s.append(buf.position(), next_pos - buf.position()); + buf.position() = next_pos; + + if (!buf.hasPendingData()) + continue; + + if (*buf.position() == '\n') + { + ++buf.position(); + return; + } + } +} + + +struct Result +{ + WriteBufferFromFile commits{"commits.tsv"}; + WriteBufferFromFile file_changes{"file_changes.tsv"}; + WriteBufferFromFile line_changes{"line_changes.tsv"}; +}; + + +void processCommit(std::string hash, Result & result) +{ + std::string command = fmt::format( + "git show --raw --pretty='format:%at%x09%aN%x09%aE%x0A%s%x00' --patch --unified=0 {}", + hash); + + std::cerr << command << "\n"; + + auto commit_info = ShellCommand::execute(command); + auto & in = commit_info->out; + + Commit commit; + commit.hash = hash; + + readText(commit.time, in); + assertChar('\t', in); + readText(commit.author_name, in); + assertChar('\t', in); + readText(commit.author_email, in); + assertChar('\n', in); + readNullTerminated(commit.message, in); + + std::cerr << fmt::format("{}\t{}\n", toString(LocalDateTime(commit.time)), commit.message); + + if (!in.eof()) + assertChar('\n', in); + + /// File changes in form + /// :100644 100644 b90fe6bb94 3ffe4c380f M src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp + /// :100644 100644 828dedf6b5 828dedf6b5 R100 dbms/src/Functions/GeoUtils.h dbms/src/Functions/PolygonUtils.h + + std::map file_changes; + + while (checkChar(':', in)) + { + FileChange file_change; + + for (size_t i = 0; i < 4; ++i) + { + skipUntilWhitespace(in); + skipWhitespaceIfAny(in); + } + + char change_type; + readChar(change_type, in); + + int confidence; + switch (change_type) + { + case 'A': + file_change.change_type = FileChangeType::Add; + ++commit.files_added; + break; + case 'D': + file_change.change_type = FileChangeType::Delete; + ++commit.files_deleted; + break; + case 'M': + file_change.change_type = FileChangeType::Modify; + ++commit.files_modified; + break; + case 'R': + file_change.change_type = FileChangeType::Rename; + ++commit.files_renamed; + readText(confidence, in); + break; + case 'C': + file_change.change_type = FileChangeType::Copy; + readText(confidence, in); + break; + case 'T': + file_change.change_type = FileChangeType::Type; + break; + default: + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected file change type: {}", change_type); + } + + skipWhitespaceIfAny(in); + + if (change_type == 'R' || change_type == 'C') + { + readText(file_change.old_file_path, in); + skipWhitespaceIfAny(in); + readText(file_change.new_file_path, in); + } + else + { + readText(file_change.new_file_path, in); + } + + assertChar('\n', in); + + file_changes.emplace( + file_change.new_file_path, + FileChangeAndLineChanges{ file_change, {} }); + } + + if (!in.eof()) + { + assertChar('\n', in); + + /// Diffs for every file in form of + /// --- a/src/Storages/StorageReplicatedMergeTree.cpp + /// +++ b/src/Storages/StorageReplicatedMergeTree.cpp + /// @@ -1387,2 +1387 @@ bool StorageReplicatedMergeTree::tryExecuteMerge(const LogEntry & entry) + /// - table_lock, entry.create_time, reserved_space, entry.deduplicate, + /// - entry.force_ttl); + /// + table_lock, entry.create_time, reserved_space, entry.deduplicate); + + std::string old_file_path; + std::string new_file_path; + FileChangeAndLineChanges * file_change_and_line_changes = nullptr; + LineChange line_change; + + while (!in.eof()) + { + if (checkString("@@ ", in)) + { + if (!file_change_and_line_changes) + { + auto file_name = new_file_path.empty() ? old_file_path : new_file_path; + auto it = file_changes.find(file_name); + if (file_changes.end() == it) + std::cerr << fmt::format("Warning: skipping bad file name {}\n", file_name); + else + file_change_and_line_changes = &it->second; + } + + if (file_change_and_line_changes) + { + uint16_t old_lines = 1; + uint16_t new_lines = 1; + + assertChar('-', in); + readText(line_change.hunk_start_line_number_old, in); + if (checkChar(',', in)) + readText(old_lines, in); + + assertString(" +", in); + readText(line_change.hunk_start_line_number_new, in); + if (checkChar(',', in)) + readText(new_lines, in); + + assertString(" @@", in); + if (checkChar(' ', in)) + readStringUntilNextLine(line_change.hunk_context, in); + else + assertChar('\n', in); + + ++line_change.hunk_num; + line_change.line_number_old = line_change.hunk_start_line_number_old; + line_change.line_number_new = line_change.hunk_start_line_number_new; + + if (old_lines && new_lines) + { + ++commit.hunks_changed; + ++file_change_and_line_changes->file_change.hunks_changed; + } + else if (old_lines) + { + ++commit.hunks_removed; + ++file_change_and_line_changes->file_change.hunks_removed; + } + else if (new_lines) + { + ++commit.hunks_added; + ++file_change_and_line_changes->file_change.hunks_added; + } + } + } + else if (checkChar('-', in)) + { + if (checkString("-- ", in)) + { + if (checkString("a/", in)) + { + readStringUntilNextLine(old_file_path, in); + line_change = LineChange{}; + file_change_and_line_changes = nullptr; + } + else if (checkString("/dev/null", in)) + { + old_file_path.clear(); + assertChar('\n', in); + line_change = LineChange{}; + file_change_and_line_changes = nullptr; + } + else + skipUntilNextLine(in); /// Actually it can be the line in diff. Skip it for simplicity. + } + else + { + if (file_change_and_line_changes) + { + ++commit.lines_deleted; + + line_change.sign = -1; + readStringUntilNextLine(line_change.line, in); + line_change.setLineInfo(line_change.line); + + file_change_and_line_changes->line_changes.push_back(line_change); + ++line_change.line_number_old; + } + } + } + else if (checkChar('+', in)) + { + if (checkString("++ ", in)) + { + if (checkString("b/", in)) + { + readStringUntilNextLine(new_file_path, in); + line_change = LineChange{}; + file_change_and_line_changes = nullptr; + } + else if (checkString("/dev/null", in)) + { + new_file_path.clear(); + assertChar('\n', in); + line_change = LineChange{}; + file_change_and_line_changes = nullptr; + } + else + skipUntilNextLine(in); /// Actually it can be the line in diff. Skip it for simplicity. + } + else + { + if (file_change_and_line_changes) + { + ++commit.lines_added; + + line_change.sign = 1; + readStringUntilNextLine(line_change.line, in); + line_change.setLineInfo(line_change.line); + + file_change_and_line_changes->line_changes.push_back(line_change); + ++line_change.line_number_new; + } + } + } + else + { + skipUntilNextLine(in); + } + } + } + + /// Write the result + + /// commits table + { + auto & out = result.commits; + + commit.writeTextWithoutNewline(out); + writeChar('\n', out); + } + + for (const auto & elem : file_changes) + { + const FileChange & file_change = elem.second.file_change; + + /// file_changes table + { + auto & out = result.file_changes; + + file_change.writeTextWithoutNewline(out); + writeChar('\t', out); + commit.writeTextWithoutNewline(out); + writeChar('\n', out); + } + + /// line_changes table + for (const auto & line_change : elem.second.line_changes) + { + auto & out = result.line_changes; + + line_change.writeTextWithoutNewline(out); + writeChar('\t', out); + file_change.writeTextWithoutNewline(out); + writeChar('\t', out); + commit.writeTextWithoutNewline(out); + writeChar('\n', out); + } + } +} + + +void processLog() +{ + Result result; + + std::string command = "git log --no-merges --pretty=%H"; + std::cerr << command << "\n"; + auto git_log = ShellCommand::execute(command); + + auto & in = git_log->out; + while (!in.eof()) + { + std::string hash; + readString(hash, in); + assertChar('\n', in); + + std::cerr << fmt::format("Processing commit {}\n", hash); + processCommit(std::move(hash), result); + } +} + + +} + +int main(int /*argc*/, char ** /*argv*/) +try +{ + using namespace DB; + +/* boost::program_options::options_description desc("Allowed options"); + desc.add_options()("help,h", "produce help message"); + + boost::program_options::variables_map options; + boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), options); + + if (options.count("help") || argc != 2) + { + std::cout << "Usage: " << argv[0] << std::endl; + std::cout << desc << std::endl; + return 1; + }*/ + + processLog(); + return 0; +} +catch (...) +{ + std::cerr << DB::getCurrentExceptionMessage(true) << '\n'; + throw; +}