ClickHouse/utils/git-to-clickhouse/git-to-clickhouse.cpp

1183 lines
36 KiB
C++
Raw Normal View History

#include <cstdint>
#include <string>
#include <vector>
2020-09-06 00:09:40 +00:00
#include <algorithm>
#include <cctype>
#include <unordered_set>
2020-09-06 06:38:48 +00:00
#include <unordered_map>
2020-09-06 04:02:17 +00:00
#include <list>
#include <thread>
2020-09-06 00:47:00 +00:00
#include <filesystem>
2020-09-06 00:09:40 +00:00
#include <re2_st/re2.h>
#include <boost/program_options.hpp>
#include <Common/Exception.h>
2020-09-06 06:38:48 +00:00
#include <Common/SipHash.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/ShellCommand.h>
#include <common/find_symbols.h>
#include <IO/copyData.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <IO/WriteBufferFromFile.h>
#include <IO/WriteBufferFromFileDescriptor.h>
2020-09-06 00:09:40 +00:00
/// Help text printed for --help: the schema of the three target tables and the commands
/// that load the generated TSV files into them.
static constexpr auto documentation = R"(
Prepare the database by executing the following queries:
DROP DATABASE IF EXISTS git;
CREATE DATABASE git;
CREATE TABLE git.commits
(
hash String,
author LowCardinality(String),
time DateTime,
message String,
files_added UInt32,
files_deleted UInt32,
files_renamed UInt32,
files_modified UInt32,
lines_added UInt32,
lines_deleted UInt32,
hunks_added UInt32,
hunks_removed UInt32,
hunks_changed UInt32
) ENGINE = MergeTree ORDER BY time;
CREATE TABLE git.file_changes
(
change_type Enum('Add' = 1, 'Delete' = 2, 'Modify' = 3, 'Rename' = 4, 'Copy' = 5, 'Type' = 6),
path LowCardinality(String),
old_path LowCardinality(String),
file_extension LowCardinality(String),
lines_added UInt32,
lines_deleted UInt32,
hunks_added UInt32,
hunks_removed UInt32,
hunks_changed UInt32,
commit_hash String,
author LowCardinality(String),
time DateTime,
commit_message String,
commit_files_added UInt32,
commit_files_deleted UInt32,
commit_files_renamed UInt32,
commit_files_modified UInt32,
commit_lines_added UInt32,
commit_lines_deleted UInt32,
commit_hunks_added UInt32,
commit_hunks_removed UInt32,
commit_hunks_changed UInt32
) ENGINE = MergeTree ORDER BY time;
CREATE TABLE git.line_changes
(
sign Int8,
line_number_old UInt32,
line_number_new UInt32,
hunk_num UInt32,
hunk_start_line_number_old UInt32,
hunk_start_line_number_new UInt32,
hunk_lines_added UInt32,
hunk_lines_deleted UInt32,
hunk_context LowCardinality(String),
line LowCardinality(String),
indent UInt8,
line_type Enum('Empty' = 0, 'Comment' = 1, 'Punct' = 2, 'Code' = 3),
prev_commit_hash String,
prev_author LowCardinality(String),
prev_time DateTime,
file_change_type Enum('Add' = 1, 'Delete' = 2, 'Modify' = 3, 'Rename' = 4, 'Copy' = 5, 'Type' = 6),
path LowCardinality(String),
old_path LowCardinality(String),
file_extension LowCardinality(String),
file_lines_added UInt32,
file_lines_deleted UInt32,
file_hunks_added UInt32,
file_hunks_removed UInt32,
file_hunks_changed UInt32,
commit_hash String,
author LowCardinality(String),
time DateTime,
commit_message String,
commit_files_added UInt32,
commit_files_deleted UInt32,
commit_files_renamed UInt32,
commit_files_modified UInt32,
commit_lines_added UInt32,
commit_lines_deleted UInt32,
commit_hunks_added UInt32,
commit_hunks_removed UInt32,
commit_hunks_changed UInt32
) ENGINE = MergeTree ORDER BY time;
Insert the data with the following commands:
clickhouse-client --query "INSERT INTO git.commits FORMAT TSV" < commits.tsv
clickhouse-client --query "INSERT INTO git.file_changes FORMAT TSV" < file_changes.tsv
clickhouse-client --query "INSERT INTO git.line_changes FORMAT TSV" < line_changes.tsv
)";
2020-09-06 00:09:40 +00:00
/// Short alias used for the command line option definitions and for the parsed option map.
namespace po = boost::program_options;
namespace DB
{
namespace ErrorCodes
{
    /// Declared here, defined elsewhere; used when an unexpected file change type letter is read.
    extern const int INCORRECT_DATA;
}
2020-09-06 04:02:17 +00:00
/// Metadata of one commit plus counters aggregated over all files it touches.
struct Commit
{
    std::string hash;
    std::string author;
    LocalDateTime time{};
    std::string message;
    uint32_t files_added{};
    uint32_t files_deleted{};
    uint32_t files_renamed{};
    uint32_t files_modified{};
    uint32_t lines_added{};
    uint32_t lines_deleted{};
    uint32_t hunks_added{};
    uint32_t hunks_removed{};
    uint32_t hunks_changed{};

    /// Writes all fields in declaration order, separated by tabs.
    /// The caller appends further columns or the final newline.
    void writeTextWithoutNewline(WriteBuffer & out) const
    {
        writeText(hash, out);
        writeChar('\t', out);
        writeText(author, out);
        writeChar('\t', out);
        writeText(time, out);
        writeChar('\t', out);
        writeText(message, out);

        /// The remaining columns are all counters of the same type.
        const uint32_t counters[] =
        {
            files_added, files_deleted, files_renamed, files_modified,
            lines_added, lines_deleted,
            hunks_added, hunks_removed, hunks_changed,
        };

        for (const uint32_t counter : counters)
        {
            writeChar('\t', out);
            writeText(counter, out);
        }
    }
};
/// Kind of change applied to a file by a commit.
/// The enumerators correspond to the status letters A, D, M, R, C, T parsed in processFileChanges.
enum class FileChangeType
{
    Add = 0,
    Delete = 1,
    Modify = 2,
    Rename = 3,
    Copy = 4,
    Type = 5,
};
/// Writes the enumerator name ("Add", "Delete", ...) to `out`.
/// Nothing is written for a value outside the enumeration.
void writeText(FileChangeType type, WriteBuffer & out)
{
    switch (type)
    {
        case FileChangeType::Add:
            writeString("Add", out);
            return;
        case FileChangeType::Delete:
            writeString("Delete", out);
            return;
        case FileChangeType::Modify:
            writeString("Modify", out);
            return;
        case FileChangeType::Rename:
            writeString("Rename", out);
            return;
        case FileChangeType::Copy:
            writeString("Copy", out);
            return;
        case FileChangeType::Type:
            writeString("Type", out);
            return;
    }
}
/// One file touched by a commit, with counters aggregated over the diff of that file.
struct FileChange
{
    FileChangeType change_type{};
    std::string path;
    std::string old_path;       /// Filled only for renames and copies.
    std::string file_extension; /// Extension of `path` without the leading dot.
    uint32_t lines_added{};
    uint32_t lines_deleted{};
    uint32_t hunks_added{};
    uint32_t hunks_removed{};
    uint32_t hunks_changed{};

    /// Writes all fields in declaration order, separated by tabs.
    /// The caller appends further columns or the final newline.
    void writeTextWithoutNewline(WriteBuffer & out) const
    {
        writeText(change_type, out);
        writeChar('\t', out);
        writeText(path, out);
        writeChar('\t', out);
        writeText(old_path, out);
        writeChar('\t', out);
        writeText(file_extension, out);

        /// The remaining columns are all counters of the same type.
        const uint32_t counters[] = {lines_added, lines_deleted, hunks_added, hunks_removed, hunks_changed};

        for (const uint32_t counter : counters)
        {
            writeChar('\t', out);
            writeText(counter, out);
        }
    }
};
/// Rough classification of a changed line, assigned by LineChange::setLineInfo.
enum class LineType
{
    Empty = 0,
    Comment = 1,
    Punct = 2,
    Code = 3,
};
/// Writes the enumerator name ("Empty", "Comment", "Punct" or "Code") to `out`.
/// Nothing is written for a value outside the enumeration.
void writeText(LineType type, WriteBuffer & out)
{
    switch (type)
    {
        case LineType::Empty:
            writeString("Empty", out);
            return;
        case LineType::Comment:
            writeString("Comment", out);
            return;
        case LineType::Punct:
            writeString("Punct", out);
            return;
        case LineType::Code:
            writeString("Code", out);
            return;
    }
}
struct LineChange
{
int8_t sign{}; /// 1 if added, -1 if deleted
2020-09-06 00:09:40 +00:00
uint32_t line_number_old{};
uint32_t line_number_new{};
uint32_t hunk_num{}; /// ordinal number of hunk in diff, starting with 0
uint32_t hunk_start_line_number_old{};
uint32_t hunk_start_line_number_new{};
2020-09-06 04:02:17 +00:00
uint32_t hunk_lines_added{};
uint32_t hunk_lines_deleted{};
std::string hunk_context; /// The context (like a line with function name) as it is calculated by git
std::string line; /// Line content without leading whitespaces
uint8_t indent{}; /// The number of leading whitespaces or tabs * 4
LineType line_type{};
2020-09-07 00:17:26 +00:00
/// Information from the history (blame).
2020-09-06 04:02:17 +00:00
std::string prev_commit_hash;
std::string prev_author;
LocalDateTime prev_time{};
2020-09-07 00:17:26 +00:00
/** Classify line to empty / code / comment / single punctuation char.
* Very rough and mostly suitable for our C++ style.
*/
void setLineInfo(std::string full_line)
{
indent = 0;
const char * pos = full_line.data();
const char * end = pos + full_line.size();
while (pos < end)
{
if (*pos == ' ')
++indent;
else if (*pos == '\t')
indent += 4;
else
break;
++pos;
}
line.assign(pos, end);
if (pos == end)
{
line_type = LineType::Empty;
}
else if (pos + 1 < end
2020-09-07 00:17:26 +00:00
&& ((pos[0] == '/' && (pos[1] == '/' || pos[1] == '*'))
|| (pos[0] == '*' && pos[1] == ' ') /// This is not precise.
|| (pos[0] == '#' && pos[1] == ' ')))
{
line_type = LineType::Comment;
}
else
{
while (pos < end)
{
if (isAlphaNumericASCII(*pos))
{
line_type = LineType::Code;
break;
}
++pos;
}
if (pos == end)
line_type = LineType::Punct;
}
}
void writeTextWithoutNewline(WriteBuffer & out) const
{
writeText(sign, out);
writeChar('\t', out);
writeText(line_number_old, out);
writeChar('\t', out);
writeText(line_number_new, out);
writeChar('\t', out);
writeText(hunk_num, out);
writeChar('\t', out);
writeText(hunk_start_line_number_old, out);
writeChar('\t', out);
writeText(hunk_start_line_number_new, out);
writeChar('\t', out);
2020-09-06 04:02:17 +00:00
writeText(hunk_lines_added, out);
writeChar('\t', out);
writeText(hunk_lines_deleted, out);
writeChar('\t', out);
writeText(hunk_context, out);
writeChar('\t', out);
writeText(line, out);
writeChar('\t', out);
writeText(indent, out);
writeChar('\t', out);
writeText(line_type, out);
writeChar('\t', out);
2020-09-06 04:02:17 +00:00
writeText(prev_commit_hash, out);
writeChar('\t', out);
2020-09-06 04:02:17 +00:00
writeText(prev_author, out);
2020-09-06 00:47:00 +00:00
writeChar('\t', out);
2020-09-06 04:02:17 +00:00
writeText(prev_time, out);
}
};
2020-09-06 04:02:17 +00:00
/// Line changes of one file, in the order they were appended while parsing its diff.
using LineChanges = std::vector<LineChange>;
2020-09-07 00:17:26 +00:00
/// The change record of one file in a commit together with the line changes found in its diff.
struct FileDiff
{
    /// Stores a copy of the file-level record; the list of line changes starts out empty.
    FileDiff(FileChange file_change_) : file_change(file_change_) {}

    FileChange file_change;
    LineChanges line_changes;
};
/// All files changed by one commit, keyed (and therefore ordered) by the new path of the file.
using CommitDiff = std::map<std::string /* path */, FileDiff>;
/** Parsing helpers */

/// Moves the read position forward to the next tab, newline or space without consuming it.
/// Returns with the input exhausted if no such character is found.
void skipUntilWhitespace(ReadBuffer & buf)
{
    while (!buf.eof())
    {
        buf.position() = find_first_symbols<'\t', '\n', ' '>(buf.position(), buf.buffer().end());

        /// Nothing found in the current buffer contents: go on with the next portion.
        if (!buf.hasPendingData())
            continue;

        const char symbol = *buf.position();
        if (symbol == '\t' || symbol == '\n' || symbol == ' ')
            return;
    }
}
/// Discards the rest of the current line including the terminating newline.
/// Returns with the input exhausted if there is no newline left.
void skipUntilNextLine(ReadBuffer & buf)
{
    while (!buf.eof())
    {
        buf.position() = find_first_symbols<'\n'>(buf.position(), buf.buffer().end());

        /// The newline may be in a later portion of the input.
        if (buf.hasPendingData() && *buf.position() == '\n')
        {
            ++buf.position();
            return;
        }
    }
}
/// Reads the rest of the current line into `s` (previous content is discarded).
/// The terminating newline is consumed but not stored.
/// If the input ends without a newline, `s` holds everything that was left.
void readStringUntilNextLine(std::string & s, ReadBuffer & buf)
{
    s.clear();

    while (!buf.eof())
    {
        char * const line_end = find_first_symbols<'\n'>(buf.position(), buf.buffer().end());

        s.append(buf.position(), line_end - buf.position());
        buf.position() = line_end;

        /// The line may continue in a later portion of the input.
        if (buf.hasPendingData() && *buf.position() == '\n')
        {
            ++buf.position();
            return;
        }
    }
}
2020-09-07 00:17:26 +00:00
/** Writes the resulting tables to files that can be imported to ClickHouse.
*/
struct ResultWriter
{
WriteBufferFromFile commits{"commits.tsv"};
WriteBufferFromFile file_changes{"file_changes.tsv"};
WriteBufferFromFile line_changes{"line_changes.tsv"};
2020-09-07 00:17:26 +00:00
void appendCommit(const Commit & commit, const CommitDiff & files)
{
/// commits table
{
auto & out = commits;
commit.writeTextWithoutNewline(out);
writeChar('\n', out);
}
for (const auto & elem : files)
{
const FileChange & file_change = elem.second.file_change;
/// file_changes table
{
auto & out = file_changes;
file_change.writeTextWithoutNewline(out);
writeChar('\t', out);
commit.writeTextWithoutNewline(out);
writeChar('\n', out);
}
/// line_changes table
for (const auto & line_change : elem.second.line_changes)
{
auto & out = line_changes;
line_change.writeTextWithoutNewline(out);
writeChar('\t', out);
file_change.writeTextWithoutNewline(out);
writeChar('\t', out);
commit.writeTextWithoutNewline(out);
writeChar('\n', out);
}
}
}
};
2020-09-07 00:17:26 +00:00
/** See description in "main".
*/
2020-09-06 00:09:40 +00:00
struct Options
{
bool skip_commits_without_parents = true;
2020-09-06 06:38:48 +00:00
bool skip_commits_with_duplicate_diffs = true;
2020-09-06 04:02:17 +00:00
size_t threads = 1;
2020-09-06 00:09:40 +00:00
std::optional<re2_st::RE2> skip_paths;
2020-09-06 06:38:48 +00:00
std::optional<re2_st::RE2> skip_commits_with_messages;
2020-09-06 00:09:40 +00:00
std::unordered_set<std::string> skip_commits;
2020-09-06 04:02:17 +00:00
std::optional<size_t> diff_size_limit;
2020-09-06 23:24:31 +00:00
std::string stop_after_commit;
2020-09-06 00:09:40 +00:00
Options(const po::variables_map & options)
{
skip_commits_without_parents = options["skip-commits-without-parents"].as<bool>();
2020-09-06 06:38:48 +00:00
skip_commits_with_duplicate_diffs = options["skip-commits-with-duplicate-diffs"].as<bool>();
2020-09-06 04:02:17 +00:00
threads = options["threads"].as<size_t>();
2020-09-06 00:09:40 +00:00
if (options.count("skip-paths"))
{
skip_paths.emplace(options["skip-paths"].as<std::string>());
}
2020-09-06 06:38:48 +00:00
if (options.count("skip-commits-with-messages"))
{
skip_commits_with_messages.emplace(options["skip-commits-with-messages"].as<std::string>());
}
2020-09-06 00:09:40 +00:00
if (options.count("skip-commit"))
{
auto vec = options["skip-commit"].as<std::vector<std::string>>();
skip_commits.insert(vec.begin(), vec.end());
}
2020-09-06 04:02:17 +00:00
if (options.count("diff-size-limit"))
{
diff_size_limit = options["diff-size-limit"].as<size_t>();
}
2020-09-06 23:24:31 +00:00
if (options.count("stop-after-commit"))
{
stop_after_commit = options["stop-after-commit"].as<std::string>();
}
2020-09-06 00:09:40 +00:00
}
};
2020-09-07 00:17:26 +00:00
/** Rough snapshot of repository calculated by application of diffs. It's used to calculate blame info.
* Represented by a list of lines. For every line it contains information about commit that modified this line the last time.
*
* Note that there are many cases when this info may become incorrect.
* The first reason is that git history is non-linear but we form this snapshot by application of commit diffs in some order
* that cannot give us correct results even theoretically.
* The second reason is that we don't process merge commits. But merge commits may contain differences for conflict resolution.
*
* We expect that the information will be mostly correct for the purpose of analytics.
* So, it can provide the expected "blame" info for the most of the lines.
*/
2020-09-06 04:02:17 +00:00
struct FileBlame
2020-09-06 00:47:00 +00:00
{
2020-09-06 04:02:17 +00:00
using Lines = std::list<Commit>;
Lines lines;
2020-09-07 00:17:26 +00:00
/// We walk through this list adding or removing lines.
2020-09-06 04:02:17 +00:00
Lines::iterator it;
size_t current_idx = 1;
FileBlame()
{
it = lines.begin();
}
2020-09-06 00:47:00 +00:00
2020-09-07 00:17:26 +00:00
/// This is important when file was copied or renamed.
2020-09-06 04:02:17 +00:00
FileBlame & operator=(const FileBlame & rhs)
{
lines = rhs.lines;
it = lines.begin();
current_idx = 1;
return *this;
}
2020-09-06 00:47:00 +00:00
2020-09-06 04:02:17 +00:00
FileBlame(const FileBlame & rhs)
{
*this = rhs;
}
2020-09-06 00:47:00 +00:00
2020-09-07 00:17:26 +00:00
/// Move iterator to requested line or stop at the end.
2020-09-06 04:02:17 +00:00
void walk(uint32_t num)
{
2020-09-06 06:38:48 +00:00
while (current_idx < num && it != lines.end())
2020-09-06 04:02:17 +00:00
{
2020-09-06 06:38:48 +00:00
++current_idx;
++it;
2020-09-06 04:02:17 +00:00
}
2020-09-06 06:38:48 +00:00
while (current_idx > num)
2020-09-06 04:02:17 +00:00
{
--current_idx;
--it;
}
}
const Commit * find(uint32_t num)
{
walk(num);
2020-09-06 06:38:48 +00:00
// std::cerr << "current_idx: " << current_idx << ", num: " << num << "\n";
2020-09-06 04:02:17 +00:00
if (current_idx == num && it != lines.end())
return &*it;
return {};
}
void addLine(uint32_t num, Commit commit)
{
walk(num);
2020-09-07 00:17:26 +00:00
/// If the inserted line is over the end of file, we insert empty lines before it.
2020-09-06 04:02:17 +00:00
while (it == lines.end() && current_idx < num)
{
lines.emplace_back();
++current_idx;
}
2020-09-06 06:38:48 +00:00
it = lines.insert(it, commit);
2020-09-06 04:02:17 +00:00
}
void removeLine(uint32_t num)
{
2020-09-06 06:38:48 +00:00
// std::cerr << "Removing line " << num << ", current_idx: " << current_idx << "\n";
2020-09-06 04:02:17 +00:00
walk(num);
2020-09-06 06:38:48 +00:00
if (current_idx == num && it != lines.end())
2020-09-06 04:02:17 +00:00
it = lines.erase(it);
}
};
2020-09-07 00:17:26 +00:00
/// All files with their blame info. When file is renamed, we also rename it in snapshot.
2020-09-06 04:02:17 +00:00
/// Entries are created on first access by path (see updateSnapshot).
using Snapshot = std::map<std::string /* path */, FileBlame>;
2020-09-07 00:17:26 +00:00
/** Enrich the line changes data with the history info from the snapshot
  * - the author, time and commit of the previous change to every found line (blame).
  * And update the snapshot.
  *
  * snapshot     - blame state before the commit; on return it reflects the state after the commit.
  * commit       - the commit being applied; recorded as the owner of every added line.
  * file_changes - diff of the commit; the prev_* fields of its line changes are filled where history is found.
  */
void updateSnapshot(Snapshot & snapshot, const Commit & commit, CommitDiff & file_changes)
{
    /// Renames and copies.
    /// old_path is filled only for them: without the emptiness check every ordinary change
    /// would overwrite the history of the file with the (empty) entry for the path "".
    for (auto & elem : file_changes)
    {
        auto & file = elem.second.file_change;
        if (!file.old_path.empty() && file.path != file.old_path)
            snapshot[file.path] = snapshot[file.old_path];
    }

    for (auto & elem : file_changes)
    {
        FileBlame & file_snapshot = snapshot[elem.first];

        /// Previous owners of the lines deleted by this commit, by line number in the old file.
        std::unordered_map<uint32_t, Commit> deleted_lines;

        /// Obtain blame info from previous state of the snapshot

        for (auto & line_change : elem.second.line_changes)
        {
            if (line_change.sign == -1)
            {
                /// A record from the future cannot be the previous change: the snapshot is only approximate.
                if (const Commit * prev_commit = file_snapshot.find(line_change.line_number_old);
                    prev_commit && prev_commit->time <= commit.time)
                {
                    line_change.prev_commit_hash = prev_commit->hash;
                    line_change.prev_author = prev_commit->author;
                    line_change.prev_time = prev_commit->time;
                    deleted_lines[line_change.line_number_old] = *prev_commit;
                }
            }
            else if (line_change.sign == 1)
            {
                /// An added line is treated as a replacement of the deleted line
                /// at the same offset from the start of the hunk, if there is one.
                uint32_t this_line_in_prev_commit = line_change.hunk_start_line_number_old
                    + (line_change.line_number_new - line_change.hunk_start_line_number_new);

                if (auto found = deleted_lines.find(this_line_in_prev_commit); found != deleted_lines.end())
                {
                    const Commit & prev_commit = found->second;
                    if (prev_commit.time <= commit.time)
                    {
                        line_change.prev_commit_hash = prev_commit.hash;
                        line_change.prev_author = prev_commit.author;
                        line_change.prev_time = prev_commit.time;
                    }
                }
            }
        }

        /// Update the snapshot

        for (const auto & line_change : elem.second.line_changes)
        {
            if (line_change.sign == -1)
            {
                file_snapshot.removeLine(line_change.line_number_new);
            }
            else if (line_change.sign == 1)
            {
                file_snapshot.addLine(line_change.line_number_new, commit);
            }
        }
    }
}
2020-09-06 06:38:48 +00:00
2020-09-06 00:09:40 +00:00
2020-09-07 00:17:26 +00:00
/** Deduplication of commits with identical diffs.
  * The set holds the hashes of the diffs (see diffHash) of the commits accepted so far.
  */
using DiffHashes = std::unordered_set<UInt128>;
2020-09-06 00:09:40 +00:00
2020-09-07 00:17:26 +00:00
/// Calculates a 128-bit SipHash over the content of the diff: for every file its change type and paths,
/// for every changed line its sign, line numbers, indent and text.
/// Sizes are hashed before variable-length data, so concatenations of different pieces do not collide.
/// Commit metadata and blame info do not participate.
UInt128 diffHash(const CommitDiff & file_changes)
{
    SipHash hasher;

    for (const auto & path_and_diff : file_changes)
    {
        const FileChange & file_change = path_and_diff.second.file_change;
        const LineChanges & line_changes = path_and_diff.second.line_changes;

        hasher.update(file_change.change_type);
        hasher.update(file_change.old_path.size());
        hasher.update(file_change.old_path);
        hasher.update(file_change.path.size());
        hasher.update(file_change.path);

        hasher.update(line_changes.size());
        for (const auto & line_change : line_changes)
        {
            hasher.update(line_change.sign);
            hasher.update(line_change.line_number_old);
            hasher.update(line_change.line_number_new);
            hasher.update(line_change.indent);
            hasher.update(line_change.line.size());
            hasher.update(line_change.line);
        }
    }

    UInt128 hash_of_diff;
    hasher.get128(hash_of_diff.low, hash_of_diff.high);
    return hash_of_diff;
}
2020-09-07 00:17:26 +00:00
/** Parses the list of changed files, one line per file, in form
  * :100644 100644 b90fe6bb94 3ffe4c380f M	src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp
  * :100644 100644 828dedf6b5 828dedf6b5 R100	dbms/src/Functions/GeoUtils.h	dbms/src/Functions/PolygonUtils.h
  * according to the output of 'git show --raw'
  *
  * Stops at the first line that does not begin with ':'.
  * The per-type file counters of `commit` are updated for every listed file;
  * files whose path matches options.skip_paths are counted but not added to `file_changes`.
  * Throws INCORRECT_DATA for an unknown change type letter.
  */
void processFileChanges(
    ReadBuffer & in,
    const Options & options,
    Commit & commit,
    CommitDiff & file_changes)
{
    while (checkChar(':', in))
    {
        FileChange file_change;

        /// We don't care about file mode and content hashes: skip these four fields.
        for (size_t field = 0; field < 4; ++field)
        {
            skipUntilWhitespace(in);
            skipWhitespaceIfAny(in);
        }

        char change_type;
        readChar(change_type, in);

        /// For rename and copy there is a number called "score". We read it only to skip it.
        int score;

        switch (change_type)
        {
            case 'A':
                file_change.change_type = FileChangeType::Add;
                ++commit.files_added;
                break;
            case 'D':
                file_change.change_type = FileChangeType::Delete;
                ++commit.files_deleted;
                break;
            case 'M':
                file_change.change_type = FileChangeType::Modify;
                ++commit.files_modified;
                break;
            case 'R':
                file_change.change_type = FileChangeType::Rename;
                ++commit.files_renamed;
                readText(score, in);
                break;
            case 'C':
                file_change.change_type = FileChangeType::Copy;
                readText(score, in);
                break;
            case 'T':
                file_change.change_type = FileChangeType::Type;
                break;
            default:
                throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected file change type: {}", change_type);
        }

        skipWhitespaceIfAny(in);

        /// Renames and copies list two paths: the old one first.
        const bool has_old_path = change_type == 'R' || change_type == 'C';
        if (has_old_path)
        {
            readText(file_change.old_path, in);
            skipWhitespaceIfAny(in);
        }
        readText(file_change.path, in);

        file_change.file_extension = std::filesystem::path(file_change.path).extension();

        /// It gives us extension in form of '.cpp'. There is a reason for it but we remove initial dot for simplicity.
        std::string & extension = file_change.file_extension;
        if (!extension.empty() && extension.front() == '.')
            extension.erase(0, 1);

        assertChar('\n', in);

        const bool path_is_skipped = options.skip_paths && re2_st::RE2::PartialMatch(file_change.path, *options.skip_paths);
        if (!path_is_skipped)
            file_changes.emplace(file_change.path, FileDiff(file_change));
    }
}
2020-09-07 00:17:26 +00:00
/** Process the list of diffs for every file from the result of "git show".
  * Caveats:
  * - changes in binary files can be ignored;
  * - if a line content begins with '+' or '-' it will be skipped
  *   it means that if you store diffs in repository and "git show" will display diff-of-diff for you,
  *   it won't be processed correctly;
  * - we expect some specific format of the diff; but it may actually depend on git config;
  * - non-ASCII file names are not processed correctly (they will not be found and will be ignored).
  *
  * Line and hunk counters of `commit` and of the matching entries of `file_changes` are updated,
  * and a LineChange is appended for every added or deleted line of a file present in `file_changes`.
  * If `size_limit` is set, parsing stops as soon as the number of changed lines seen exceeds it.
  */
void processDiffs(
    ReadBuffer & in,
    std::optional<size_t> size_limit,
    Commit & commit,
    CommitDiff & file_changes)
{
    std::string old_file_path;
    std::string new_file_path;

    /// The entry of the file whose hunks are being read; nullptr until the first hunk of a file
    /// and for files that are absent from file_changes.
    FileDiff * current_file = nullptr;

    /// Accumulates the state of the current hunk; a copy is stored for every changed line.
    LineChange line_change;

    /// A "---" or "+++" header starts the diff of another file.
    auto start_new_file = [&]
    {
        line_change = LineChange{};
        current_file = nullptr;
    };

    /// Diffs for every file in form of
    /// --- a/src/Storages/StorageReplicatedMergeTree.cpp
    /// +++ b/src/Storages/StorageReplicatedMergeTree.cpp
    /// @@ -1387,2 +1387 @@ bool StorageReplicatedMergeTree::tryExecuteMerge(const LogEntry & entry)
    /// -                    table_lock, entry.create_time, reserved_space, entry.deduplicate,
    /// -                    entry.force_ttl);
    /// +                    table_lock, entry.create_time, reserved_space, entry.deduplicate);

    size_t diff_size = 0;
    while (!in.eof())
    {
        if (checkString("@@ ", in))
        {
            if (!current_file)
            {
                const std::string & file_name = new_file_path.empty() ? old_file_path : new_file_path;
                const auto found = file_changes.find(file_name);
                if (found != file_changes.end())
                    current_file = &found->second;
            }

            if (current_file)
            {
                /// The number of lines is omitted in the header when it is equal to one.
                uint32_t old_lines = 1;
                uint32_t new_lines = 1;

                assertChar('-', in);
                readText(line_change.hunk_start_line_number_old, in);
                if (checkChar(',', in))
                    readText(old_lines, in);

                assertString(" +", in);
                readText(line_change.hunk_start_line_number_new, in);
                if (checkChar(',', in))
                    readText(new_lines, in);

                /// This is needed to simplify the logic of updating snapshot:
                /// When all lines are removed we can treat it as repeated removal of line with number 1.
                if (line_change.hunk_start_line_number_new == 0)
                    line_change.hunk_start_line_number_new = 1;

                assertString(" @@", in);
                if (checkChar(' ', in))
                    readStringUntilNextLine(line_change.hunk_context, in);
                else
                    assertChar('\n', in);

                line_change.hunk_lines_added = new_lines;
                line_change.hunk_lines_deleted = old_lines;

                ++line_change.hunk_num;
                line_change.line_number_old = line_change.hunk_start_line_number_old;
                line_change.line_number_new = line_change.hunk_start_line_number_new;

                FileChange & file_counters = current_file->file_change;
                if (old_lines && new_lines)
                {
                    ++commit.hunks_changed;
                    ++file_counters.hunks_changed;
                }
                else if (old_lines)
                {
                    ++commit.hunks_removed;
                    ++file_counters.hunks_removed;
                }
                else if (new_lines)
                {
                    ++commit.hunks_added;
                    ++file_counters.hunks_added;
                }
            }
        }
        else if (checkChar('-', in))
        {
            if (checkString("-- ", in))
            {
                if (checkString("a/", in))
                {
                    readStringUntilNextLine(old_file_path, in);
                    start_new_file();
                }
                else if (checkString("/dev/null", in))
                {
                    old_file_path.clear();
                    assertChar('\n', in);
                    start_new_file();
                }
                else
                    skipUntilNextLine(in); /// Actually it can be the line in diff. Skip it for simplicity.
            }
            else
            {
                ++diff_size;
                if (current_file)
                {
                    ++commit.lines_deleted;
                    ++current_file->file_change.lines_deleted;

                    line_change.sign = -1;
                    readStringUntilNextLine(line_change.line, in);
                    line_change.setLineInfo(line_change.line);

                    current_file->line_changes.push_back(line_change);
                    ++line_change.line_number_old;
                }
            }
        }
        else if (checkChar('+', in))
        {
            if (checkString("++ ", in))
            {
                if (checkString("b/", in))
                {
                    readStringUntilNextLine(new_file_path, in);
                    start_new_file();
                }
                else if (checkString("/dev/null", in))
                {
                    new_file_path.clear();
                    assertChar('\n', in);
                    start_new_file();
                }
                else
                    skipUntilNextLine(in); /// Actually it can be the line in diff. Skip it for simplicity.
            }
            else
            {
                ++diff_size;
                if (current_file)
                {
                    ++commit.lines_added;
                    ++current_file->file_change.lines_added;

                    line_change.sign = 1;
                    readStringUntilNextLine(line_change.line, in);
                    line_change.setLineInfo(line_change.line);

                    current_file->line_changes.push_back(line_change);
                    ++line_change.line_number_new;
                }
            }
        }
        else
        {
            /// Unknown lines are ignored.
            skipUntilNextLine(in);
        }

        if (size_limit && diff_size > *size_limit)
            return;
    }
}
2020-09-06 06:38:48 +00:00
2020-09-07 00:17:26 +00:00
/** Process the "git show" result for a single commit. Append the result to tables.
  *
  * in            - output of the command started by gitShow for this commit: the commit time, author,
  *                 parent hashes and subject, each terminated by a zero byte, then the list of changed
  *                 files and the diffs.
  * commit_num    - index of the commit in processing order; the commit with index 0 is exempt
  *                 from the diff size limit and from the check for parents.
  * total_commits - used only to print the progress.
  * snapshot, diff_hashes - state shared between commits; updated only for commits that are not skipped.
  */
void processCommit(
    ReadBuffer & in,
    const Options & options,
    size_t commit_num,
    size_t total_commits,
    std::string hash,
    Snapshot & snapshot,
    DiffHashes & diff_hashes,
    ResultWriter & result)
{
    Commit commit;
    commit.hash = hash;

    time_t commit_time;
    readText(commit_time, in);
    commit.time = commit_time;
    assertChar('\0', in);
    readNullTerminated(commit.author, in);
    std::string parent_hash;
    readNullTerminated(parent_hash, in);
    readNullTerminated(commit.message, in);

    if (options.skip_commits_with_messages && re2_st::RE2::PartialMatch(commit.message, *options.skip_commits_with_messages))
        return;

    /// Control characters would break the one-line progress output.
    /// The character is taken as unsigned char: passing a negative value to std::iscntrl is undefined behaviour,
    /// and bytes of non-ASCII text are negative where char is signed.
    std::string message_to_print = commit.message;
    std::replace_if(message_to_print.begin(), message_to_print.end(),
        [](unsigned char c) { return std::iscntrl(c) != 0; }, ' ');

    std::cerr << fmt::format("{}%  {}  {}  {}\n",
        commit_num * 100 / total_commits, toString(commit.time), hash, message_to_print);

    if (options.skip_commits_without_parents && commit_num != 0 && parent_hash.empty())
    {
        std::cerr << "Warning: skipping commit without parents\n";
        return;
    }

    if (!in.eof())
        assertChar('\n', in);

    CommitDiff file_changes;
    processFileChanges(in, options, commit, file_changes);

    if (!in.eof())
    {
        assertChar('\n', in);
        processDiffs(in, commit_num != 0 ? options.diff_size_limit : std::nullopt, commit, file_changes);
    }

    /// Skip commits with too large diffs.
    if (options.diff_size_limit && commit_num != 0 && commit.lines_added + commit.lines_deleted > *options.diff_size_limit)
        return;

    /// Calculate hash of diff and skip duplicates
    if (options.skip_commits_with_duplicate_diffs && !diff_hashes.insert(diffHash(file_changes)).second)
        return;

    /// Update snapshot and blame info
    updateSnapshot(snapshot, commit, file_changes);

    /// Write the result
    result.appendCommit(commit, file_changes);
}
2020-09-07 00:17:26 +00:00
/** Runs child process and allows to read the result.
* Multiple processes can be run for parallel processing.
*/
2020-09-06 04:02:17 +00:00
auto gitShow(const std::string & hash)
{
std::string command = fmt::format(
2020-09-06 07:38:39 +00:00
"git show --raw --pretty='format:%ct%x00%aN%x00%P%x00%s%x00' --patch --unified=0 {}",
2020-09-06 04:02:17 +00:00
hash);
return ShellCommand::execute(command);
}
2020-09-07 00:17:26 +00:00
/** Obtain the list of commits and process them.
*/
2020-09-06 00:09:40 +00:00
void processLog(const Options & options)
{
2020-09-07 00:17:26 +00:00
ResultWriter result;
2020-09-06 00:09:40 +00:00
std::string command = "git log --reverse --no-merges --pretty=%H";
fmt::print("{}\n", command);
auto git_log = ShellCommand::execute(command);
2020-09-06 00:09:40 +00:00
/// Collect hashes in memory. This is inefficient but allows to display beautiful progress.
/// The number of commits is in order of single millions for the largest repositories,
/// so don't care about potential waste of ~100 MB of memory.
std::vector<std::string> hashes;
auto & in = git_log->out;
while (!in.eof())
{
std::string hash;
readString(hash, in);
assertChar('\n', in);
2020-09-06 00:09:40 +00:00
if (!options.skip_commits.count(hash))
hashes.emplace_back(std::move(hash));
}
size_t num_commits = hashes.size();
fmt::print("Total {} commits to process.\n", num_commits);
2020-09-06 04:02:17 +00:00
/// Will run multiple processes in parallel
size_t num_threads = options.threads;
std::vector<std::unique_ptr<ShellCommand>> show_commands(num_threads);
for (size_t i = 0; i < num_commits && i < num_threads; ++i)
show_commands[i] = gitShow(hashes[i]);
2020-09-06 00:47:00 +00:00
Snapshot snapshot;
2020-09-06 06:38:48 +00:00
DiffHashes diff_hashes;
2020-09-06 00:09:40 +00:00
for (size_t i = 0; i < num_commits; ++i)
{
2020-09-07 00:17:26 +00:00
processCommit(show_commands[i % num_threads]->out, options, i, num_commits, hashes[i], snapshot, diff_hashes, result);
2020-09-06 23:24:31 +00:00
if (!options.stop_after_commit.empty() && hashes[i] == options.stop_after_commit)
break;
2020-09-06 04:02:17 +00:00
if (i + num_threads < num_commits)
show_commands[i % num_threads] = gitShow(hashes[i + num_threads]);
}
}
}
2020-09-06 00:09:40 +00:00
int main(int argc, char ** argv)
try
{
using namespace DB;
2020-09-06 00:09:40 +00:00
po::options_description desc("Allowed options");
desc.add_options()
("help,h", "produce help message")
("skip-commits-without-parents", po::value<bool>()->default_value(true),
"Skip commits without parents (except the initial commit)."
" These commits are usually erroneous but they can make sense in very rare cases.")
2020-09-06 06:38:48 +00:00
("skip-commits-with-duplicate-diffs", po::value<bool>()->default_value(true),
"Skip commits with duplicate diffs."
" These commits are usually results of cherry-pick or merge after rebase.")
2020-09-06 00:09:40 +00:00
("skip-commit", po::value<std::vector<std::string>>(),
"Skip commit with specified hash. The option can be specified multiple times.")
2020-09-06 06:38:48 +00:00
("skip-paths", po::value<std::string>(),
"Skip paths that matches regular expression (re2 syntax).")
("skip-commits-with-messages", po::value<std::string>(),
"Skip commits whose messages matches regular expression (re2 syntax).")
2020-09-06 23:24:31 +00:00
("diff-size-limit", po::value<size_t>()->default_value(100000),
"Skip commits whose diff size (number of added + removed lines) is larger than specified threshold. Does not apply for initial commit.")
("stop-after-commit", po::value<std::string>(),
"Stop processing after specified commit hash.")
2020-09-06 04:02:17 +00:00
("threads", po::value<size_t>()->default_value(std::thread::hardware_concurrency()),
2020-09-06 23:24:31 +00:00
"Number of concurrent git subprocesses to spawn")
2020-09-06 00:09:40 +00:00
;
po::variables_map options;
po::store(boost::program_options::parse_command_line(argc, argv, desc), options);
if (options.count("help"))
{
2020-09-06 00:09:40 +00:00
std::cout << documentation << '\n'
<< "Usage: " << argv[0] << '\n'
<< desc << '\n'
<< "\nExample:\n"
2020-09-06 23:24:31 +00:00
<< "\n./git-to-clickhouse --skip-paths 'generated\\.cpp|^(contrib|docs?|website|libs/(libcityhash|liblz4|libdivide|libvectorclass|libdouble-conversion|libcpuid|libzstd|libfarmhash|libmetrohash|libpoco|libwidechar_width))/' --skip-commits-with-messages '^Merge branch '\n";
return 1;
2020-09-06 00:09:40 +00:00
}
2020-09-06 00:09:40 +00:00
processLog(options);
return 0;
}
catch (...)
{
std::cerr << DB::getCurrentExceptionMessage(true) << '\n';
throw;
}