2022-01-24 17:41:13 +00:00
|
|
|
#include <Common/FileChecker.h>
|
|
|
|
#include <Common/escapeForFileName.h>
|
2023-04-08 04:47:21 +00:00
|
|
|
#include <Common/logger_useful.h>
|
|
|
|
#include <Common/ErrorCodes.h>
|
2022-01-24 17:41:13 +00:00
|
|
|
#include <Disks/IDisk.h>
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <IO/WriteBufferFromFile.h>
|
|
|
|
#include <IO/ReadBufferFromFile.h>
|
|
|
|
#include <IO/WriteBufferFromString.h>
|
|
|
|
#include <IO/WriteHelpers.h>
|
|
|
|
#include <IO/ReadHelpers.h>
|
2022-01-24 17:41:13 +00:00
|
|
|
#include <base/JSON.h>
|
2017-04-01 09:19:00 +00:00
|
|
|
|
2015-09-29 14:09:01 +00:00
|
|
|
|
2022-01-24 17:41:13 +00:00
|
|
|
namespace fs = std::filesystem;
|
2015-09-29 14:09:01 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2020-07-12 02:31:58 +00:00
|
|
|
namespace ErrorCodes
|
|
|
|
{
|
|
|
|
extern const int UNEXPECTED_END_OF_FILE;
|
2021-08-26 22:15:24 +00:00
|
|
|
extern const int LOGICAL_ERROR;
|
2020-07-12 02:31:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2022-01-24 17:41:13 +00:00
|
|
|
/// Constructs a checker without a disk: delegates to the two-argument
/// constructor, passing a null disk pointer.
FileChecker::FileChecker(const String & file_info_path_)
    : FileChecker(nullptr, file_info_path_)
{
}
|
|
|
|
|
2023-04-08 04:47:21 +00:00
|
|
|
/// Constructs a checker for the sizes file at `file_info_path_`.
/// `disk_` may be null; the I/O helpers of this class branch on `disk` and use
/// std::filesystem / plain file buffers when it is not set.
/// The sizes file is loaded immediately; a DB::Exception thrown while loading is
/// rethrown with the file path appended to its message.
FileChecker::FileChecker(DiskPtr disk_, const String & file_info_path_)
    : disk(std::move(disk_))
    , log(getLogger("FileChecker"))
{
    /// Same effect as setPath(file_info_path_).
    files_info_path = file_info_path_;

    try
    {
        load();
    }
    catch (DB::Exception & e)
    {
        /// Caught by non-const reference because the message is amended in place.
        e.addMessage("Error loading file {}", files_info_path);
        throw;
    }
}
|
|
|
|
|
2019-12-12 08:57:25 +00:00
|
|
|
void FileChecker::setPath(const String & file_info_path_)
|
2015-09-29 14:09:01 +00:00
|
|
|
{
|
|
|
|
files_info_path = file_info_path_;
|
|
|
|
}
|
|
|
|
|
2021-10-26 09:48:31 +00:00
|
|
|
/// Returns (by value) the currently stored path of the sizes file.
String FileChecker::getPath() const
{
    return files_info_path;
}
|
|
|
|
|
2020-07-12 02:31:58 +00:00
|
|
|
void FileChecker::update(const String & full_file_path)
|
2015-09-29 14:09:01 +00:00
|
|
|
{
|
2022-01-24 17:41:13 +00:00
|
|
|
bool exists = fileReallyExists(full_file_path);
|
|
|
|
auto real_size = exists ? getRealFileSize(full_file_path) : 0; /// No race condition assuming no one else is working with these files.
|
2021-08-26 22:15:24 +00:00
|
|
|
map[fileName(full_file_path)] = real_size;
|
2015-09-29 14:09:01 +00:00
|
|
|
}
|
|
|
|
|
2020-07-12 02:31:58 +00:00
|
|
|
void FileChecker::setEmpty(const String & full_file_path)
|
2015-09-29 14:09:01 +00:00
|
|
|
{
|
2020-07-12 02:31:58 +00:00
|
|
|
map[fileName(full_file_path)] = 0;
|
2015-09-29 14:09:01 +00:00
|
|
|
}
|
|
|
|
|
2021-08-26 22:15:24 +00:00
|
|
|
size_t FileChecker::getFileSize(const String & full_file_path) const
|
|
|
|
{
|
|
|
|
auto it = map.find(fileName(full_file_path));
|
|
|
|
if (it == map.end())
|
|
|
|
throw Exception(ErrorCodes::LOGICAL_ERROR, "File {} is not added to the file checker", full_file_path);
|
|
|
|
return it->second;
|
|
|
|
}
|
|
|
|
|
2022-05-03 17:55:45 +00:00
|
|
|
size_t FileChecker::getTotalSize() const
|
2022-05-02 22:01:11 +00:00
|
|
|
{
|
2022-05-03 17:55:45 +00:00
|
|
|
size_t total_size = 0;
|
2022-05-02 22:01:11 +00:00
|
|
|
for (auto size : map | boost::adaptors::map_values)
|
|
|
|
total_size += size;
|
|
|
|
return total_size;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2023-08-14 09:58:08 +00:00
|
|
|
/// Creates a new DataValidationTasks object constructed from the current map.
/// The result is meant to be passed to checkNextEntry(), which pulls entries from it one at a time.
FileChecker::DataValidationTasksPtr FileChecker::getDataValidationTasks()
{
    return std::make_unique<DataValidationTasks>(map);
}
|
2021-08-26 22:15:24 +00:00
|
|
|
|
2023-10-23 10:12:30 +00:00
|
|
|
/// Takes the next (name, expected size) pair from `check_data_tasks` and compares the
/// expected size with the actual size of that file, which is looked up next to the sizes file.
/// Returns an empty optional when `check_data_tasks->next()` reports that it is finished;
/// otherwise returns a CheckResult for the entry: success when the sizes are equal,
/// failure with an explanatory message when they differ.
/// A missing file is treated as having size 0, so it passes if the expected size is also 0.
std::optional<CheckResult> FileChecker::checkNextEntry(DataValidationTasksPtr & check_data_tasks) const
{
    String name;
    size_t expected_size = 0;
    if (check_data_tasks->next(name, expected_size))
        return std::nullopt;

    const String path = parentPath(files_info_path) + name;

    /// No race condition assuming no one else is working with these files.
    const bool exists = fileReallyExists(path);
    size_t real_size = 0;
    if (exists)
        real_size = getRealFileSize(path);

    if (real_size == expected_size)
        return CheckResult(name, true, "");

    String failure_message;
    if (exists)
        failure_message = "Size of " + path + " is wrong. Size is " + toString(real_size) + " but should be " + toString(expected_size);
    else
        failure_message = "File " + path + " doesn't exist";

    return CheckResult(name, false, failure_message);
}
|
|
|
|
|
2020-07-12 02:31:58 +00:00
|
|
|
void FileChecker::repair()
|
|
|
|
{
|
|
|
|
for (const auto & name_size : map)
|
|
|
|
{
|
|
|
|
const String & name = name_size.first;
|
|
|
|
size_t expected_size = name_size.second;
|
|
|
|
String path = parentPath(files_info_path) + name;
|
2022-01-24 17:41:13 +00:00
|
|
|
bool exists = fileReallyExists(path);
|
|
|
|
auto real_size = exists ? getRealFileSize(path) : 0; /// No race condition assuming no one else is working with these files.
|
2020-07-12 02:31:58 +00:00
|
|
|
|
|
|
|
if (real_size < expected_size)
|
|
|
|
throw Exception(ErrorCodes::UNEXPECTED_END_OF_FILE, "Size of {} is less than expected. Size is {} but should be {}.",
|
|
|
|
path, real_size, expected_size);
|
|
|
|
|
|
|
|
if (real_size > expected_size)
|
|
|
|
{
|
2021-08-26 22:15:24 +00:00
|
|
|
LOG_WARNING(log, "Will truncate file {} that has size {} to size {}", path, real_size, expected_size);
|
2020-07-12 02:31:58 +00:00
|
|
|
disk->truncateFile(path, expected_size);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-09-29 14:09:01 +00:00
|
|
|
void FileChecker::save() const
|
|
|
|
{
|
2021-01-03 21:07:26 +00:00
|
|
|
std::string tmp_files_info_path = parentPath(files_info_path) + "tmp_" + fileName(files_info_path);
|
|
|
|
|
2015-09-29 14:09:01 +00:00
|
|
|
{
|
2023-05-27 09:47:44 +00:00
|
|
|
std::unique_ptr<WriteBufferFromFileBase> out = disk ? disk->writeFile(tmp_files_info_path) : std::make_unique<WriteBufferFromFile>(tmp_files_info_path);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-03-25 20:12:56 +00:00
|
|
|
/// So complex JSON structure - for compatibility with the old format.
|
2021-09-21 18:05:08 +00:00
|
|
|
writeCString("{\"clickhouse\":{", *out);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-08-07 17:04:39 +00:00
|
|
|
auto settings = FormatSettings();
|
2015-09-29 14:09:01 +00:00
|
|
|
for (auto it = map.begin(); it != map.end(); ++it)
|
|
|
|
{
|
|
|
|
if (it != map.begin())
|
2019-12-12 08:57:25 +00:00
|
|
|
writeString(",", *out);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-03-26 01:28:07 +00:00
|
|
|
/// `escapeForFileName` is not really needed. But it is left for compatibility with the old code.
|
2019-12-12 08:57:25 +00:00
|
|
|
writeJSONString(escapeForFileName(it->first), *out, settings);
|
2020-03-08 21:04:10 +00:00
|
|
|
writeString(R"(:{"size":")", *out);
|
2019-12-12 08:57:25 +00:00
|
|
|
writeIntText(it->second, *out);
|
|
|
|
writeString("\"}", *out);
|
2015-09-29 14:09:01 +00:00
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2019-12-12 08:57:25 +00:00
|
|
|
writeCString("}}", *out);
|
2023-05-27 09:47:44 +00:00
|
|
|
|
|
|
|
out->sync();
|
|
|
|
out->finalize();
|
2015-09-29 14:09:01 +00:00
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2022-01-24 17:41:13 +00:00
|
|
|
if (disk)
|
|
|
|
disk->replaceFile(tmp_files_info_path, files_info_path);
|
|
|
|
else
|
|
|
|
fs::rename(tmp_files_info_path, files_info_path);
|
2015-09-29 14:09:01 +00:00
|
|
|
}
|
|
|
|
|
2021-01-03 21:07:26 +00:00
|
|
|
void FileChecker::load()
|
2015-09-29 14:09:01 +00:00
|
|
|
{
|
2021-01-03 21:07:26 +00:00
|
|
|
map.clear();
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2022-01-24 17:41:13 +00:00
|
|
|
if (!fileReallyExists(files_info_path))
|
2015-09-29 14:09:01 +00:00
|
|
|
return;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2022-02-22 17:05:52 +00:00
|
|
|
std::unique_ptr<ReadBuffer> in = disk ? disk->readFile(files_info_path) : std::make_unique<ReadBufferFromFile>(files_info_path);
|
2017-07-31 21:39:24 +00:00
|
|
|
WriteBufferFromOwnString out;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-07-31 21:39:24 +00:00
|
|
|
/// The JSON library does not support whitespace. We delete them. Inefficient.
|
2019-12-12 08:57:25 +00:00
|
|
|
while (!in->eof())
|
2017-07-31 21:39:24 +00:00
|
|
|
{
|
|
|
|
char c;
|
2019-12-12 08:57:25 +00:00
|
|
|
readChar(c, *in);
|
2017-07-31 21:39:24 +00:00
|
|
|
if (!isspace(c))
|
|
|
|
writeChar(c, out);
|
2015-09-29 14:09:01 +00:00
|
|
|
}
|
2017-07-31 21:39:24 +00:00
|
|
|
JSON json(out.str());
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2021-09-21 18:05:08 +00:00
|
|
|
JSON files = json.has("clickhouse") ? json["clickhouse"] : json["yandex"];
|
2020-03-08 21:04:10 +00:00
|
|
|
for (const JSON file : files) // NOLINT
|
2021-01-03 21:07:26 +00:00
|
|
|
map[unescapeForFileName(file.getName())] = file.getValue()["size"].toUInt();
|
2015-09-29 14:09:01 +00:00
|
|
|
}
|
|
|
|
|
2022-01-24 17:41:13 +00:00
|
|
|
bool FileChecker::fileReallyExists(const String & path_) const
|
|
|
|
{
|
|
|
|
return disk ? disk->exists(path_) : fs::exists(path_);
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t FileChecker::getRealFileSize(const String & path_) const
|
|
|
|
{
|
|
|
|
return disk ? disk->getFileSize(path_) : fs::file_size(path_);
|
|
|
|
}
|
|
|
|
|
2015-09-29 14:09:01 +00:00
|
|
|
}
|