ClickHouse/utils/convert-month-partitioned-parts/main.cpp

149 lines
5.6 KiB
C++
Raw Normal View History

#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypesNumber.h>
#include <IO/HashingWriteBuffer.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/WriteBufferFromFile.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTIdentifier.h>
#include <Storages/MergeTree/IMergeTreeDataPart.h>
#include <Storages/MergeTree/MergeTreePartInfo.h>
#include <Storages/MergeTree/localBackup.h>
#include <Common/Exception.h>
#include <boost/program_options.hpp>
#include <Poco/Path.h>
#include <Poco/File.h>
#include <iostream>
2020-02-28 11:54:18 +00:00
#include <Disks/DiskLocal.h>
namespace DB
{
/// Error codes used by this utility. Only declared here (extern); the definitions live elsewhere.
namespace ErrorCodes
{
extern const int DIRECTORY_ALREADY_EXISTS;
extern const int BAD_DATA_PART_NAME;
extern const int NO_FILE_IN_DATA_PART;
}
/// Converts one part of an old-style month-partitioned MergeTree table into the layout expected
/// by a custom-partitioned table and places the result under `dest_path`.
///
/// part_path   - directory of the source part; its last path component is taken as the old-style part name.
/// date_column - name of the date column; `<date_column>.bin` must have an entry in the part's checksums.txt.
/// dest_path   - directory inside which the converted part directory is created.
///
/// Throws Exception with the code:
///  - BAD_DATA_PART_NAME if `part_path` has no directory component to take the part name from,
///    or if the min and max dates encoded in the part name fall into different months;
///  - DIRECTORY_ALREADY_EXISTS if the destination part directory already exists;
///  - NO_FILE_IN_DATA_PART if checksums.txt has no entry for `<date_column>.bin`.
void run(String part_path, String date_column, String dest_path)
{
    std::shared_ptr<IDisk> disk = std::make_shared<DiskLocal>("local", "/", 0);

    auto old_part_path = Poco::Path::forDirectory(part_path);

    /// An empty or root path has no directory components; asking Poco for component -1
    /// would trip its internal assertion, so report the problem explicitly instead.
    if (old_part_path.depth() == 0)
        throw Exception("Cannot determine the part name from path `" + part_path + "`",
            ErrorCodes::BAD_DATA_PART_NAME);

    const String & old_part_name = old_part_path.directory(old_part_path.depth() - 1);
    String old_part_path_str = old_part_path.toString();

    /// The old name is parsed with format version 0 and serialized back; the result is used
    /// as the name of the converted part (expected to look like 201403_2_2_0).
    auto part_info = MergeTreePartInfo::fromPartName(old_part_name, MergeTreeDataFormatVersion(0));
    String new_part_name = part_info.getPartName();

    auto new_part_path = Poco::Path::forDirectory(dest_path);
    new_part_path.pushDirectory(new_part_name);
    if (Poco::File(new_part_path).exists())
        throw Exception("Destination part directory `" + new_part_path.toString() + "` already exists",
            ErrorCodes::DIRECTORY_ALREADY_EXISTS);

    DayNum min_date;
    DayNum max_date;
    MergeTreePartInfo::parseMinMaxDatesFromPartName(old_part_name, min_date, max_date);

    /// The partition value is a single YYYYMM number, so both dates must belong to the same month.
    const auto & time_zone = DateLUT::instance();
    UInt32 yyyymm = time_zone.toNumYYYYMM(min_date);
    if (yyyymm != time_zone.toNumYYYYMM(max_date))
        throw Exception("Part " + old_part_name + " spans different months",
            ErrorCodes::BAD_DATA_PART_NAME);

    ReadBufferFromFile checksums_in(old_part_path_str + "checksums.txt", 4096);
    MergeTreeDataPartChecksums checksums;
    checksums.read(checksums_in);

    auto date_col_checksum_it = checksums.files.find(date_column + ".bin");
    if (date_col_checksum_it == checksums.files.end())
        throw Exception("Couldn't find checksum for the date column .bin file `" + date_column + ".bin`",
            ErrorCodes::NO_FILE_IN_DATA_PART);

    /// The row count is derived from the uncompressed size of the date column divided by the size of one Date value.
    UInt64 rows = date_col_checksum_it->second.uncompressed_size / DataTypeDate().getSizeOfValueInMemory();

    /// Everything is assembled in a temporary directory that is renamed to the final name at the very end.
    auto new_tmp_part_path = Poco::Path::forDirectory(dest_path);
    new_tmp_part_path.pushDirectory("tmp_convert_" + new_part_name);
    String new_tmp_part_path_str = new_tmp_part_path.toString();

    /// Remove whatever is left at the temporary path (e.g. from a previous run).
    try
    {
        Poco::File(new_tmp_part_path).remove(/* recursive = */ true);
    }
    catch (const Poco::FileNotFoundException &)
    {
        /// If the file is already deleted, do nothing.
    }

    /// NOTE(review): localBackup presumably hard-links the source files into the temporary directory,
    /// which would explain why checksums.txt has to be made writeable below - confirm.
    localBackup(disk, old_part_path_str, new_tmp_part_path_str, {});

    /// Write count.txt and register its size and hash in the checksums.
    WriteBufferFromFile count_out(new_tmp_part_path_str + "count.txt", 4096);
    HashingWriteBuffer count_out_hashing(count_out);
    writeIntText(rows, count_out_hashing);
    count_out_hashing.next();
    checksums.files["count.txt"].file_size = count_out_hashing.count();
    checksums.files["count.txt"].file_hash = count_out_hashing.getHash();

    /// Store the min-max index over the date column.
    IMergeTreeDataPart::MinMaxIndex minmax_idx(min_date, max_date);
    Names minmax_idx_columns = {date_column};
    DataTypes minmax_idx_column_types = {std::make_shared<DataTypeDate>()};
    minmax_idx.store(minmax_idx_columns, minmax_idx_column_types, disk, new_tmp_part_path_str, checksums);

    /// Store the partition value; the partition key is toYYYYMM(<date_column>) of type UInt32.
    Block partition_key_sample{{nullptr, std::make_shared<DataTypeUInt32>(), makeASTFunction("toYYYYMM", std::make_shared<ASTIdentifier>(date_column))->getColumnName()}};
    MergeTreePartition partition(yyyymm);
    partition.store(partition_key_sample, disk, new_tmp_part_path_str, checksums);

    /// Rewrite checksums.txt so that it includes the entries added above.
    Poco::File(new_tmp_part_path_str + "checksums.txt").setWriteable();
    WriteBufferFromFile checksums_out(new_tmp_part_path_str + "checksums.txt", 4096);
    checksums.write(checksums_out);
    checksums_in.close();
    checksums_out.close();

    Poco::File(new_tmp_part_path).renameTo(new_part_path.toString());
}
}
int main(int argc, char ** argv)
try
{
boost::program_options::options_description desc("Allowed options");
desc.add_options()
("help,h", "produce help message")
("part", boost::program_options::value<std::string>()->required(),
"part directory to convert")
("date-column", boost::program_options::value<std::string>()->required(),
"name of the date column")
("to", boost::program_options::value<std::string>()->required(),
"destination directory")
;
boost::program_options::variables_map options;
boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), options);
if (options.count("help") || options.size() < 3)
{
std::cout
<< "Convert a MergeTree part from the old-style month-partitioned table "
<< "(e.g. 20140317_20140323_2_2_0) to the format suitable for ATTACH'ing to a custom-partitioned "
<< "table (201403_2_2_0)." << std::endl << std::endl;
std::cout << desc << std::endl;
return 1;
}
auto part_path = options.at("part").as<DB::String>();
auto date_column = options.at("date-column").as<DB::String>();
auto dest_path = options.at("to").as<DB::String>();
DB::run(part_path, date_column, dest_path);
return 0;
}
catch (...)
{
std::cerr << DB::getCurrentExceptionMessage(true) << '\n';
throw;
}