Added a script to import a git repository into ClickHouse

This commit is contained in:
Alexey Milovidov 2020-09-04 09:12:16 +03:00
parent 23b9677879
commit 338a6e20f6

View File

@@ -16,6 +16,101 @@
#include <IO/WriteBufferFromFileDescriptor.h>
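/** How to use: create the database and tables with the queries below,
  * then load the TSV files with the clickhouse-client commands at the end.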
DROP DATABASE IF EXISTS git;
CREATE DATABASE git;
CREATE TABLE git.commits
(
hash String,
author_name LowCardinality(String),
author_email LowCardinality(String),
time DateTime,
message String,
files_added UInt32,
files_deleted UInt32,
files_renamed UInt32,
files_modified UInt32,
lines_added UInt32,
lines_deleted UInt32,
hunks_added UInt32,
hunks_removed UInt32,
hunks_changed UInt32
) ENGINE = MergeTree ORDER BY time;
CREATE TABLE git.file_changes
(
change_type Enum('Add' = 1, 'Delete' = 2, 'Modify' = 3, 'Rename' = 4, 'Copy' = 5, 'Type' = 6),
new_file_path LowCardinality(String),
old_file_path LowCardinality(String),
lines_added UInt16,
lines_deleted UInt16,
hunks_added UInt16,
hunks_removed UInt16,
hunks_changed UInt16,
commit_hash String,
author_name LowCardinality(String),
author_email LowCardinality(String),
time DateTime,
commit_message String,
commit_files_added UInt32,
commit_files_deleted UInt32,
commit_files_renamed UInt32,
commit_files_modified UInt32,
commit_lines_added UInt32,
commit_lines_deleted UInt32,
commit_hunks_added UInt32,
commit_hunks_removed UInt32,
commit_hunks_changed UInt32
) ENGINE = MergeTree ORDER BY time;
CREATE TABLE git.line_changes
(
sign Int8,
line_number_old UInt16,
line_number_new UInt16,
hunk_num UInt16,
hunk_start_line_number_old UInt16,
hunk_start_line_number_new UInt16,
hunk_context LowCardinality(String),
line LowCardinality(String),
indent UInt8,
line_type Enum('Empty' = 0, 'Comment' = 1, 'Punct' = 2, 'Code' = 3),
file_change_type Enum('Add' = 1, 'Delete' = 2, 'Modify' = 3, 'Rename' = 4, 'Copy' = 5, 'Type' = 6),
new_file_path LowCardinality(String),
old_file_path LowCardinality(String),
file_lines_added UInt16,
file_lines_deleted UInt16,
file_hunks_added UInt16,
file_hunks_removed UInt16,
file_hunks_changed UInt16,
commit_hash String,
author_name LowCardinality(String),
author_email LowCardinality(String),
time DateTime,
commit_message String,
commit_files_added UInt32,
commit_files_deleted UInt32,
commit_files_renamed UInt32,
commit_files_modified UInt32,
commit_lines_added UInt32,
commit_lines_deleted UInt32,
commit_hunks_added UInt32,
commit_hunks_removed UInt32,
commit_hunks_changed UInt32
) ENGINE = MergeTree ORDER BY time;
clickhouse-client --query "INSERT INTO git.commits FORMAT TSV" < commits.tsv
clickhouse-client --query "INSERT INTO git.file_changes FORMAT TSV" < file_changes.tsv
clickhouse-client --query "INSERT INTO git.line_changes FORMAT TSV" < line_changes.tsv
*/
namespace DB
{
@@ -495,6 +590,7 @@ void processCommit(std::string hash, Result & result)
if (file_change_and_line_changes)
{
++commit.lines_deleted;
++file_change_and_line_changes->file_change.lines_deleted;
line_change.sign = -1;
readStringUntilNextLine(line_change.line, in);
@@ -530,6 +626,7 @@ void processCommit(std::string hash, Result & result)
if (file_change_and_line_changes)
{
++commit.lines_added;
++file_change_and_line_changes->file_change.lines_added;
line_change.sign = 1;
readStringUntilNextLine(line_change.line, in);