// ClickHouse/dbms/src/Storages/MergeTree/MergeTreePartChecker.cpp

#include <iostream>

#include <Poco/File.h>

#include <DB/Storages/MergeTree/MergeTreePartChecker.h>
#include <DB/DataTypes/DataTypeString.h>
#include <DB/DataTypes/DataTypeDate.h>
#include <DB/DataTypes/DataTypeDateTime.h>
#include <DB/DataTypes/DataTypesNumberFixed.h>
#include <DB/DataTypes/DataTypeFixedString.h>
#include <DB/DataTypes/DataTypeArray.h>
#include <DB/IO/ReadBufferFromFile.h>
#include <DB/IO/CompressedReadBuffer.h>
#include <DB/IO/HashingReadBuffer.h>
#include <DB/Columns/ColumnsNumber.h>
#include <DB/Common/escapeForFileName.h>
namespace DB
{
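
/** Reads a single column's .bin and .mrk files through hashing buffers, so the
  * data can be validated and its checksums computed in one pass over the files.
  */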
struct Stream
{
    DataTypePtr type;
    String path;
    String name;

    ReadBufferFromFile file_buf;
    HashingReadBuffer compressed_hashing_buf;
    CompressedReadBuffer uncompressing_buf;
    HashingReadBuffer uncompressed_hashing_buf;

    ReadBufferFromFile mrk_file_buf;
    HashingReadBuffer mrk_hashing_buf;

    Stream(const String & path_, const String & name_, DataTypePtr type_)
        : type(type_), path(path_), name(name_),
        file_buf(path + name + ".bin"), compressed_hashing_buf(file_buf), uncompressing_buf(compressed_hashing_buf),
        uncompressed_hashing_buf(uncompressing_buf), mrk_file_buf(path + name + ".mrk"), mrk_hashing_buf(mrk_file_buf)
    {
    }
    /// Whether the column data file has been fully consumed.
    bool dataEOF()
    {
        return uncompressed_hashing_buf.eof();
    }
    /// Reads up to `rows` rows, validating their encoding; returns how many rows were actually read.
    size_t read(size_t rows)
    {
        if (dynamic_cast<const DataTypeString *>(&*type))
        {
            for (size_t i = 0; i < rows; ++i)
            {
                if (uncompressed_hashing_buf.eof())
                    return i;

                UInt64 size;
                readVarUInt(size, uncompressed_hashing_buf);

                if (size > (1ul << 31))
                    throw Exception("A string of length " + toString(size) + " is too long.", ErrorCodes::CORRUPTED_DATA);

                uncompressed_hashing_buf.ignore(size);
            }
            return rows;
        }
        else
        {
            size_t length;
            if (dynamic_cast<const DataTypeUInt8 *>(&*type) ||
                dynamic_cast<const DataTypeInt8 *>(&*type))
                length = sizeof(UInt8);
            else if (dynamic_cast<const DataTypeUInt16 *>(&*type) ||
                dynamic_cast<const DataTypeInt16 *>(&*type) ||
                dynamic_cast<const DataTypeDate *>(&*type))
                length = sizeof(UInt16);
            else if (dynamic_cast<const DataTypeUInt32 *>(&*type) ||
                dynamic_cast<const DataTypeInt32 *>(&*type) ||
                dynamic_cast<const DataTypeFloat32 *>(&*type) ||
                dynamic_cast<const DataTypeDateTime *>(&*type))
                length = sizeof(UInt32);
            else if (dynamic_cast<const DataTypeUInt64 *>(&*type) ||
                dynamic_cast<const DataTypeInt64 *>(&*type) ||
                dynamic_cast<const DataTypeFloat64 *>(&*type))
                length = sizeof(UInt64);
            else if (auto string = dynamic_cast<const DataTypeFixedString *>(&*type))
                length = string->getN();
            else
                throw Exception("Unexpected data type: " + type->getName() + " of column " + name, ErrorCodes::UNKNOWN_TYPE);

            size_t size = uncompressed_hashing_buf.tryIgnore(length * rows);
            if (size % length)
                throw Exception("Read " + toString(size) + " bytes, which is not divisible by " + toString(length),
                    ErrorCodes::CORRUPTED_DATA);
            return size / length;
        }
    }
    /// Reads up to `rows` UInt64 values into `data`; returns how many values were actually read.
    size_t readUInt64(size_t rows, ColumnUInt64::Container_t & data)
    {
        if (data.size() < rows)
            data.resize(rows);
        size_t size = uncompressed_hashing_buf.readBig(reinterpret_cast<char *>(&data[0]), sizeof(UInt64) * rows);
        if (size % sizeof(UInt64))
            throw Exception("Read " + toString(size) + " bytes, which is not divisible by " + toString(sizeof(UInt64)),
                ErrorCodes::CORRUPTED_DATA);
        return size / sizeof(UInt64);
    }
    /// Checks that the next mark in the .mrk file matches the current position in the data file.
    void assertMark()
    {
        MarkInCompressedFile mrk_mark;
        readIntBinary(mrk_mark.offset_in_compressed_file, mrk_hashing_buf);
        readIntBinary(mrk_mark.offset_in_decompressed_block, mrk_hashing_buf);

        MarkInCompressedFile data_mark;
        data_mark.offset_in_compressed_file = compressed_hashing_buf.count();
        data_mark.offset_in_decompressed_block = uncompressed_hashing_buf.offset();

        if (mrk_mark == data_mark)
            return;

        /// If a mark falls exactly on a block boundary, a mark pointing to the end of the previous block
        /// and one pointing to the beginning of the next block are both acceptable.
        uncompressed_hashing_buf.nextIfAtEnd();
        data_mark.offset_in_compressed_file = compressed_hashing_buf.count();
        data_mark.offset_in_decompressed_block = uncompressed_hashing_buf.offset();

        if (mrk_mark != data_mark)
            throw Exception("Incorrect mark: " + data_mark.toString() + " in data, " + mrk_mark.toString() + " in .mrk file",
                ErrorCodes::INCORRECT_MARK);
    }
    /// Verifies that both files have been fully consumed and records their checksums.
    void assertEnd(MergeTreeData::DataPart::Checksums & checksums)
    {
        if (!uncompressed_hashing_buf.eof())
            throw Exception("EOF expected in column data", ErrorCodes::CORRUPTED_DATA);
        if (!mrk_hashing_buf.eof())
            throw Exception("EOF expected in .mrk file", ErrorCodes::CORRUPTED_DATA);

        checksums.files[name + ".bin"] = MergeTreeData::DataPart::Checksums::Checksum(
            compressed_hashing_buf.count(), compressed_hashing_buf.getHash(),
            uncompressed_hashing_buf.count(), uncompressed_hashing_buf.getHash());
        checksums.files[name + ".mrk"] = MergeTreeData::DataPart::Checksums::Checksum(
            mrk_hashing_buf.count(), mrk_hashing_buf.getHash());
    }
};

/// Returns the number of rows. Adds the checksums of all the column's files to `checksums`.
static size_t checkColumn(const String & path, const String & name, DataTypePtr type, size_t index_granularity,
    MergeTreeData::DataPart::Checksums & checksums)
{
    size_t rows = 0;

    try
    {
        if (auto array = dynamic_cast<const DataTypeArray *>(&*type))
        {
            /// An array column is stored as two streams: the per-row sizes (.size0) and the flattened elements.
            Stream sizes_stream(path, name + ".size0", new DataTypeUInt64);
            Stream data_stream(path, name, array->getNestedType());

            ColumnUInt64::Container_t sizes;
            while (true)
            {
                if (sizes_stream.dataEOF())
                    break;

                sizes_stream.assertMark();
                data_stream.assertMark();

                size_t cur_rows = sizes_stream.readUInt64(index_granularity, sizes);

                size_t sum = 0;
                for (size_t i = 0; i < cur_rows; ++i)
                {
                    size_t new_sum = sum + sizes[i];
                    if (sizes[i] > (1ul << 31) || new_sum < sum)
                        throw Exception("Array size " + toString(sizes[i]) + " is too large.", ErrorCodes::CORRUPTED_DATA);
                    sum = new_sum;
                }

                data_stream.read(sum);

                rows += cur_rows;
                if (cur_rows < index_granularity)
                    break;
            }

            sizes_stream.assertEnd(checksums);
            data_stream.assertEnd(checksums);

            return rows;
        }
        else
        {
            Stream data_stream(path, name, type);

            while (true)
            {
                if (data_stream.dataEOF())
                    break;

                data_stream.assertMark();

                size_t cur_rows = data_stream.read(index_granularity);

                rows += cur_rows;
                if (cur_rows < index_granularity)
                    break;
            }

            data_stream.assertEnd(checksums);

            return rows;
        }
    }
    catch (DB::Exception & e)
    {
        e.addMessage(" (column: " + path + name + ", last mark at " + toString(rows) + " rows)");
        throw;
    }
}
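
/// Checks every column of a part on disk: marks, sizes, per-column row counts,
/// and the computed checksums against checksums.txt if that file is present.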
void MergeTreePartChecker::checkDataPart(String path, size_t index_granularity, const DataTypeFactory & data_type_factory)
{
    if (!path.empty() && *path.rbegin() != '/')
        path += "/";

    NamesAndTypesList columns;
    MergeTreeData::DataPart::Checksums checksums_txt;

    {
        ReadBufferFromFile buf(path + "columns.txt");
        columns.readText(buf, data_type_factory);
        assertEOF(buf);
    }

    if (Poco::File(path + "checksums.txt").exists())
    {
        ReadBufferFromFile buf(path + "checksums.txt");
        checksums_txt.readText(buf);
        assertEOF(buf);
    }

    MergeTreeData::DataPart::Checksums checksums_data;

    bool first = true;
    size_t rows = 0;
    for (const NameAndTypePair & column : columns)
    {
        size_t cur_rows = checkColumn(path, escapeForFileName(column.name), column.type, index_granularity, checksums_data);

        if (first)
        {
            rows = cur_rows;
            first = false;
        }
        else if (rows != cur_rows)
        {
            throw Exception("Different number of rows in columns " + columns.begin()->name + " and " + column.name,
                ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
        }

        std::cerr << "column " << column.name << " ok" << std::endl;
    }

    checksums_txt.checkEqual(checksums_data, true);
}
}
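
/* Usage sketch (hypothetical caller; the part directory and granularity below
   are illustrative assumptions, not taken from this file):

    DB::DataTypeFactory data_type_factory;
    DB::MergeTreePartChecker::checkDataPart(
        "/opt/clickhouse/data/test/visits/20140601_20140601_1_1_0",  /// hypothetical part directory
        8192,  /// the usual default index_granularity
        data_type_factory);

   Throws DB::Exception on corrupted data, a mark mismatch, or a checksum mismatch. */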