Fix "check style" tool

This commit is contained in:
Alexey Milovidov 2020-04-06 02:51:26 +03:00
parent 73b0f8db8c
commit 646f409b8e
7 changed files with 10 additions and 423 deletions

View File

@ -12,7 +12,6 @@
#include <unistd.h>
#include <typeinfo>
#include <sys/time.h>
#include <sys/resource.h>
#include <iostream>
#include <fstream>

View File

@ -3,4 +3,4 @@
ROOT_PATH=$(git rev-parse --show-toplevel)
# Find duplicate include directives
find $ROOT_PATH/dbms -name '*.h' -or -name '*.cpp' | while read file; do grep -P '^#include ' $file | sort | uniq -c | grep -v -P '^\s+1\s' && echo $file; done
find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | while read file; do grep -P '^#include ' $file | sort | uniq -c | grep -v -P '^\s+1\s' && echo $file; done

View File

@ -15,7 +15,7 @@
ROOT_PATH=$(git rev-parse --show-toplevel)
EXCLUDE_DIRS='build/|integration/|widechar_width/|glibc-compatibility/|memcpy/|consistent-hashing'
find $ROOT_PATH/{dbms,base} -name '*.h' -or -name '*.cpp' 2>/dev/null |
find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/null |
grep -vP $EXCLUDE_DIRS |
xargs grep $@ -P '((class|struct|namespace|enum|if|for|while|else|throw|switch).*|\)(\s*const)?(\s*override)?\s*)\{$|\s$|\t|^ {1,3}[^\* ]\S|\t|^\s*(if|else if|if constexpr|else if constexpr|for|while|catch|switch)\(|\( [^\s\\]|\S \)' |
# a curly brace not in a new line, but not for the case of C++11 init or agg. initialization | trailing whitespace | number of ws not a multiple of 4, but not in the case of comment continuation | a tab character | missing whitespace after for/if/while... before opening brace | whitespaces inside braces
@ -23,7 +23,7 @@ find $ROOT_PATH/{dbms,base} -name '*.h' -or -name '*.cpp' 2>/dev/null |
# single-line comment | continuation of a multiline comment | a typical piece of embedded shell code | something like ending of raw string literal
# // namespace comments are unneeded
find $ROOT_PATH/{dbms,base} -name '*.h' -or -name '*.cpp' 2>/dev/null |
find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/null |
grep -vP $EXCLUDE_DIRS |
xargs grep $@ -P '}\s*//+\s*namespace\s*'
@ -31,23 +31,23 @@ find $ROOT_PATH/{dbms,base} -name '*.h' -or -name '*.cpp' 2>/dev/null |
find -L $ROOT_PATH -type l 2>/dev/null | grep -v contrib && echo "^ Broken symlinks found"
# Double whitespaces
find $ROOT_PATH/{dbms,base} -name '*.h' -or -name '*.cpp' 2>/dev/null | while read i; do $ROOT_PATH/utils/check-style/double-whitespaces.pl < $i || echo -e "^ File $i contains double whitespaces\n"; done
find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/null | while read i; do $ROOT_PATH/utils/check-style/double-whitespaces.pl < $i || echo -e "^ File $i contains double whitespaces\n"; done
# Unused ErrorCodes
# NOTE: to fix automatically, replace echo with:
# sed -i "/extern const int $code/d" $file
find $ROOT_PATH/{dbms,base} -name '*.h' -or -name '*.cpp' | xargs grep -l -P 'extern const int [_A-Z]+' | while read file; do grep -P 'extern const int [_A-Z]+;' $file | sed -r -e 's/^.*?extern const int ([_A-Z]+);.*?$/\1/' | while read code; do grep -q "ErrorCodes::$code" $file || echo "ErrorCode $code is defined but not used in file $file"; done; done
find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | xargs grep -l -P 'extern const int [_A-Z]+' | while read file; do grep -P 'extern const int [_A-Z]+;' $file | sed -r -e 's/^.*?extern const int ([_A-Z]+);.*?$/\1/' | while read code; do grep -q "ErrorCodes::$code" $file || echo "ErrorCode $code is defined but not used in file $file"; done; done
# Undefined ErrorCodes
# NOTE: to fix automatically, replace echo with:
# ( grep -q -F 'namespace ErrorCodes' $file && sed -i -r "0,/(\s*)extern const int [_A-Z]+/s//\1extern const int $code;\n&/" $file || awk '{ print; if (ns == 1) { ns = 2 }; if (ns == 2) { ns = 0; print "namespace ErrorCodes\n{\n extern const int '$code';\n}" } }; /namespace DB/ { ns = 1; };' < $file > ${file}.tmp && mv ${file}.tmp $file )
find $ROOT_PATH/{dbms,base} -name '*.h' -or -name '*.cpp' | xargs grep -l -P 'ErrorCodes::[_A-Z]+' | while read file; do grep -P 'ErrorCodes::[_A-Z]+' $file | sed -r -e 's/^.*?ErrorCodes::([_A-Z]+).*?$/\1/' | while read code; do grep -q "extern const int $code" $file || echo "ErrorCode $code is used in file $file but not defined"; done; done
find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | xargs grep -l -P 'ErrorCodes::[_A-Z]+' | while read file; do grep -P 'ErrorCodes::[_A-Z]+' $file | sed -r -e 's/^.*?ErrorCodes::([_A-Z]+).*?$/\1/' | while read code; do grep -q "extern const int $code" $file || echo "ErrorCode $code is used in file $file but not defined"; done; done
# Duplicate ErrorCodes
find $ROOT_PATH/{dbms,base} -name '*.h' -or -name '*.cpp' | xargs grep -l -P 'ErrorCodes::[_A-Z]+' | while read file; do grep -P 'extern const int [_A-Z]+;' $file | sort | uniq -c | grep -v -P ' +1 ' && echo "Duplicate ErrorCode in file $file"; done
find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | xargs grep -l -P 'ErrorCodes::[_A-Z]+' | while read file; do grep -P 'extern const int [_A-Z]+;' $file | sort | uniq -c | grep -v -P ' +1 ' && echo "Duplicate ErrorCode in file $file"; done
# Three or more consecutive empty lines
find $ROOT_PATH/{dbms,base} -name '*.h' -or -name '*.cpp' | while read file; do awk '/^$/ { ++i; if (i > 2) { print "More than two consecutive empty lines in file '$file'" } } /./ { i = 0 }' $file; done
find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | while read file; do awk '/^$/ { ++i; if (i > 2) { print "More than two consecutive empty lines in file '$file'" } } /./ { i = 0 }' $file; done
# Broken XML files (requires libxml2-utils)
find $ROOT_PATH/{dbms,base} -name '*.xml' | xargs xmllint --noout --nonet
find $ROOT_PATH/{src,base,programs,utils} -name '*.xml' | xargs xmllint --noout --nonet

View File

@ -1,5 +1,2 @@
add_executable (mutator mutator.cpp)
target_link_libraries(mutator PRIVATE clickhouse_common_io)
add_executable (decompress_perf decompress_perf.cpp)
target_link_libraries(decompress_perf PRIVATE dbms ${LZ4_LIBRARY})

View File

@ -1,406 +0,0 @@
#include <string.h>
#include <random>
#include <pcg_random.hpp>
#include <common/types.h>
#include <IO/ReadBuffer.h>
#include <IO/ReadBufferFromFileDescriptor.h>
#include <IO/WriteBufferFromFileDescriptor.h>
#include <IO/BufferWithOwnMemory.h>
#include <Compression/CompressionInfo.h>
#include <IO/WriteHelpers.h>
#include <IO/copyData.h>
#include <Common/PODArray.h>
/** Quick and dirty implementation of data scrambler.
*
* The task is to replace the data with pseudorandom values.
* But with keeping some probability distributions
* and with maintaining the same compression ratio.
*
* The solution is to operate directly on compressed LZ4 stream.
* The stream consists of independent compressed blocks.
* Each block is a stream of "literals" and "matches".
* Liteal is an instruction to literally put some following bytes,
* and match is an instruction to copy some bytes that was already seen before.
*
* We get literals and apply some scramble operation on it.
* But we keep literal length and matches without changes.
*
* That's how we get pseudorandom data but with keeping
* all repetitive patterns and maintaining the same compression ratio.
*
* Actually, the compression ratio, if you decompress scrambled data and compress again
* become slightly worse, because LZ4 use simple match finder based on value of hash function,
* and it can find different matches due to collisions in hash function.
*
* Scramble operation replace literals with pseudorandom bytes,
* but with some heuristics to keep some sort of data structure.
*
* It's in question, is it scramble data enough and while is it safe to publish scrambled data.
* In general, you should assume that it is not safe.
*/
#define ML_BITS 4
#define ML_MASK ((1U<<ML_BITS)-1)
#define RUN_BITS (8-ML_BITS)
#define RUN_MASK ((1U<<RUN_BITS)-1)
#define MINMATCH 4
#define WILDCOPYLENGTH 8
#define LASTLITERALS 5
static UInt8 rand(pcg64 & generator, UInt8 min, UInt8 max)
{
return min + generator() % (max + 1 - min);
}
static void mutate(pcg64 & generator, void * src, size_t length)
{
UInt8 * pos = static_cast<UInt8 *>(src);
UInt8 * end = pos + length;
while (pos < end)
{
if (pos + strlen("https") <= end && 0 == memcmp(pos, "https", strlen("https")))
{
pos += strlen("https");
continue;
}
if (pos + strlen("http") <= end && 0 == memcmp(pos, "http", strlen("http")))
{
pos += strlen("http");
continue;
}
if (pos + strlen("www") <= end && 0 == memcmp(pos, "www", strlen("www")))
{
pos += strlen("www");
continue;
}
if (*pos >= '1' && *pos <= '9')
*pos = rand(generator, '1', '9');
else if (*pos >= 'a' && *pos <= 'z')
*pos = rand(generator, 'a', 'z');
else if (*pos >= 'A' && *pos <= 'Z')
*pos = rand(generator, 'A', 'Z');
else if (*pos >= 0x80 && *pos <= 0xBF)
*pos = rand(generator, *pos & 0xF0U, *pos | 0x0FU);
else if (*pos == '\\')
++pos;
++pos;
}
pos = static_cast<UInt8 *>(src);
while (pos < end)
{
if (pos + 3 <= end
&& isAlphaASCII(pos[0])
&& !isAlphaASCII(pos[1]) && pos[1] != '\\' && pos[1] >= 0x20
&& isAlphaASCII(pos[2]))
{
auto res = rand(generator, 0, 3);
if (res == 2)
{
std::swap(pos[0], pos[1]);
}
else if (res == 3)
std::swap(pos[1], pos[2]);
pos += 3;
}
else if (pos + 5 <= end
&& pos[0] >= 0xC0 && pos[0] <= 0xDF && pos[1] >= 0x80 && pos[1] <= 0xBF
&& pos[2] >= 0x20 && pos[2] < 0x80 && !isAlphaASCII(pos[2])
&& pos[3] >= 0xC0 && pos[3] <= 0xDF && pos[4] >= 0x80 && pos[4] <= 0xBF)
{
auto res = rand(generator, 0, 3);
if (res == 2)
{
std::swap(pos[1], pos[2]);
std::swap(pos[0], pos[1]);
}
else if (res == 3)
{
std::swap(pos[3], pos[2]);
std::swap(pos[4], pos[3]);
}
pos += 5;
}
else
++pos;
}
}
static void LZ4_copy8(void* dst, const void* src)
{
memcpy(dst,src,8);
}
/* customized variant of memcpy, which can overwrite up to 8 bytes beyond dstEnd */
static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd)
{
UInt8* d = (UInt8*)dstPtr;
const UInt8* s = (const UInt8*)srcPtr;
UInt8* const e = (UInt8*)dstEnd;
do { LZ4_copy8(d,s); d+=8; s+=8; } while (d<e);
}
static UInt16 LZ4_read16(const void* memPtr)
{
UInt16 val; memcpy(&val, memPtr, sizeof(val)); return val;
}
static void LZ4_write32(void* memPtr, UInt32 value)
{
memcpy(memPtr, &value, sizeof(value));
}
int LZ4_decompress_mutate(
char* const source,
char* const dest,
int outputSize)
{
pcg64 generator;
/* Local Variables */
UInt8* ip = (UInt8*) source;
UInt8* op = (UInt8*) dest;
UInt8* const oend = op + outputSize;
UInt8* cpy;
const unsigned dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4};
const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3};
/* Main Loop : decode sequences */
while (1) {
size_t length;
const UInt8* match;
size_t offset;
/* get literal length */
unsigned const token = *ip++;
if ((length=(token>>ML_BITS)) == RUN_MASK) {
unsigned s;
do {
s = *ip++;
length += s;
} while (s==255);
}
/* copy literals */
cpy = op+length;
if (cpy>oend-WILDCOPYLENGTH)
{
if (cpy != oend) goto _output_error; /* Error : block decoding must stop exactly there */
mutate(generator, ip, length);
memcpy(op, ip, length);
ip += length;
op += length;
break; /* Necessarily EOF, due to parsing restrictions */
}
mutate(generator, ip, cpy - op);
LZ4_wildCopy(op, ip, cpy);
ip += length; op = cpy;
/* get offset */
offset = LZ4_read16(ip); ip+=2;
match = op - offset;
LZ4_write32(op, (UInt32)offset); /* costs ~1%; silence an msan warning when offset==0 */
/* get matchlength */
length = token & ML_MASK;
if (length == ML_MASK) {
unsigned s;
do {
s = *ip++;
length += s;
} while (s==255);
}
length += MINMATCH;
/* copy match within block */
cpy = op + length;
if (unlikely(offset<8)) {
const int dec64 = dec64table[offset];
op[0] = match[0];
op[1] = match[1];
op[2] = match[2];
op[3] = match[3];
match += dec32table[offset];
memcpy(op+4, match, 4);
match -= dec64;
} else { LZ4_copy8(op, match); match+=8; }
op += 8;
if (unlikely(cpy>oend-12)) {
UInt8* const oCopyLimit = oend-(WILDCOPYLENGTH-1);
if (cpy > oend-LASTLITERALS) goto _output_error; /* Error : last LASTLITERALS bytes must be literals (uncompressed) */
if (op < oCopyLimit) {
LZ4_wildCopy(op, match, oCopyLimit);
match += oCopyLimit - op;
op = oCopyLimit;
}
while (op<cpy) *op++ = *match++;
} else {
LZ4_copy8(op, match);
if (length>16) LZ4_wildCopy(op+8, match+8, cpy);
}
op=cpy; /* correction */
}
return (int) (((const char*)ip)-source); /* Nb of input bytes read */
/* Overflow error detected */
_output_error:
return (int) (-(((const char*)ip)-source))-1;
}
namespace DB
{
namespace ErrorCodes
{
extern const int UNKNOWN_COMPRESSION_METHOD;
extern const int TOO_LARGE_SIZE_COMPRESSED;
extern const int CANNOT_DECOMPRESS;
}
class MutatingCompressedReadBufferBase
{
protected:
ReadBuffer * compressed_in;
/// If 'compressed_in' buffer has whole compressed block - then use it. Otherwise copy parts of data to 'own_compressed_buffer'.
PODArray<char> own_compressed_buffer;
/// Points to memory, holding compressed block.
char * compressed_buffer = nullptr;
size_t readCompressedData(size_t & size_decompressed, size_t & size_compressed_without_checksum)
{
if (compressed_in->eof())
return 0;
CityHash_v1_0_2::uint128 checksum;
compressed_in->readStrict(reinterpret_cast<char *>(&checksum), sizeof(checksum));
own_compressed_buffer.resize(COMPRESSED_BLOCK_HEADER_SIZE);
compressed_in->readStrict(&own_compressed_buffer[0], COMPRESSED_BLOCK_HEADER_SIZE);
UInt8 method = own_compressed_buffer[0]; /// See CompressedWriteBuffer.h
size_t & size_compressed = size_compressed_without_checksum;
if (method == static_cast<UInt8>(CompressionMethodByte::LZ4) ||
method == static_cast<UInt8>(CompressionMethodByte::ZSTD) ||
method == static_cast<UInt8>(CompressionMethodByte::NONE))
{
size_compressed = unalignedLoad<UInt32>(&own_compressed_buffer[1]);
size_decompressed = unalignedLoad<UInt32>(&own_compressed_buffer[5]);
}
else
throw Exception("Unknown compression method: " + toString(method), ErrorCodes::UNKNOWN_COMPRESSION_METHOD);
if (size_compressed > DBMS_MAX_COMPRESSED_SIZE)
throw Exception("Too large size_compressed. Most likely corrupted data.", ErrorCodes::TOO_LARGE_SIZE_COMPRESSED);
/// Is whole compressed block located in 'compressed_in' buffer?
if (compressed_in->offset() >= COMPRESSED_BLOCK_HEADER_SIZE &&
compressed_in->position() + size_compressed - COMPRESSED_BLOCK_HEADER_SIZE <= compressed_in->buffer().end())
{
compressed_in->position() -= COMPRESSED_BLOCK_HEADER_SIZE;
compressed_buffer = compressed_in->position();
compressed_in->position() += size_compressed;
}
else
{
own_compressed_buffer.resize(size_compressed);
compressed_buffer = &own_compressed_buffer[0];
compressed_in->readStrict(compressed_buffer + COMPRESSED_BLOCK_HEADER_SIZE, size_compressed - COMPRESSED_BLOCK_HEADER_SIZE);
}
return size_compressed + sizeof(checksum);
}
void decompress(char * to, size_t size_decompressed, size_t size_compressed_without_checksum)
{
UInt8 method = compressed_buffer[0]; /// See CompressedWriteBuffer.h
if (method == static_cast<UInt8>(CompressionMethodByte::LZ4))
{
if (LZ4_decompress_mutate(compressed_buffer + COMPRESSED_BLOCK_HEADER_SIZE, to, size_decompressed) < 0)
throw Exception("Cannot LZ4_decompress_fast", ErrorCodes::CANNOT_DECOMPRESS);
}
else
throw Exception("Unknown compression method: " + toString(method), ErrorCodes::UNKNOWN_COMPRESSION_METHOD);
}
public:
/// 'compressed_in' could be initialized lazily, but before first call of 'readCompressedData'.
MutatingCompressedReadBufferBase(ReadBuffer * in = nullptr)
: compressed_in(in), own_compressed_buffer(COMPRESSED_BLOCK_HEADER_SIZE)
{
}
};
class MutatingCompressedReadBuffer : public MutatingCompressedReadBufferBase, public BufferWithOwnMemory<ReadBuffer>
{
private:
size_t size_compressed = 0;
bool nextImpl() override
{
size_t size_decompressed;
size_t size_compressed_without_checksum;
size_compressed = readCompressedData(size_decompressed, size_compressed_without_checksum);
if (!size_compressed)
return false;
memory.resize(size_decompressed);
working_buffer = Buffer(&memory[0], &memory[size_decompressed]);
decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum);
return true;
}
public:
MutatingCompressedReadBuffer(ReadBuffer & in_)
: MutatingCompressedReadBufferBase(&in_), BufferWithOwnMemory<ReadBuffer>(0)
{
}
};
}
int main(int, char **)
try
{
DB::ReadBufferFromFileDescriptor in(STDIN_FILENO);
DB::MutatingCompressedReadBuffer mutating_in(in);
DB::WriteBufferFromFileDescriptor out(STDOUT_FILENO);
DB::copyData(mutating_in, out);
return 0;
}
catch (...)
{
std::cerr << DB::getCurrentExceptionMessage(true);
return DB::getCurrentExceptionCode();
}

View File

@ -113,7 +113,7 @@ int mainImpl(int argc, char ** argv)
for (int i = 0; argv[2][i]; ++i)
{
char c = argv[2][i];
switch(c)
switch (c)
{
case 'r':
mode |= MODE_READ;

View File

@ -15,9 +15,6 @@ int main(int, char **) { return 0; }
#include <Common/Stopwatch.h>
#include <IO/BufferWithOwnMemory.h>
#include <IO/ReadHelpers.h>
#include <stdlib.h>
#include <fcntl.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>