Fix "check style" tool

2024-12-03 13:02:00 +00:00 · 2020-04-06 02:51:26 +03:00 · 2020-04-06 02:51:26 +03:00 · 646f409b8e
commit 646f409b8e
parent 73b0f8db8c
7 changed files with 10 additions and 423 deletions
--- a/base/daemon/BaseDaemon.cpp
+++ b/base/daemon/BaseDaemon.cpp
@ -12,7 +12,6 @@
 #include <unistd.h>

 #include <typeinfo>
-#include <sys/time.h>
 #include <sys/resource.h>
 #include <iostream>
 #include <fstream>
--- a/utils/check-style/check-duplicate-includes.sh
+++ b/utils/check-style/check-duplicate-includes.sh
@ -3,4 +3,4 @@
 ROOT_PATH=$(git rev-parse --show-toplevel)

 # Find duplicate include directives
-find $ROOT_PATH/dbms -name '*.h' -or -name '*.cpp' | while read file; do grep -P '^#include ' $file | sort | uniq -c | grep -v -P '^\s+1\s' && echo $file; done
+find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | while read file; do grep -P '^#include ' $file | sort | uniq -c | grep -v -P '^\s+1\s' && echo $file; done
--- a/utils/check-style/check-style
+++ b/utils/check-style/check-style
@ -15,7 +15,7 @@
 ROOT_PATH=$(git rev-parse --show-toplevel)
 EXCLUDE_DIRS='build/|integration/|widechar_width/|glibc-compatibility/|memcpy/|consistent-hashing'

-find $ROOT_PATH/{dbms,base} -name '*.h' -or -name '*.cpp' 2>/dev/null |
+find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/null |
    grep -vP $EXCLUDE_DIRS |
    xargs grep $@ -P '((class|struct|namespace|enum|if|for|while|else|throw|switch).*|\)(\s*const)?(\s*override)?\s*)\{$|\s$|\t|^ {1,3}[^\* ]\S|\t|^\s*(if|else if|if constexpr|else if constexpr|for|while|catch|switch)\(|\( [^\s\\]|\S \)' |
 # a curly brace not in a new line, but not for the case of C++11 init or agg. initialization | trailing whitespace | number of ws not a multiple of 4, but not in the case of comment continuation | a tab character | missing whitespace after for/if/while... before opening brace | whitespaces inside braces
@ -23,7 +23,7 @@ find $ROOT_PATH/{dbms,base} -name '*.h' -or -name '*.cpp' 2>/dev/null |
 # single-line comment | continuation of a multiline comment | a typical piece of embedded shell code | something like ending of raw string literal

 # // namespace comments are unneeded
-find $ROOT_PATH/{dbms,base} -name '*.h' -or -name '*.cpp' 2>/dev/null |
+find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/null |
    grep -vP $EXCLUDE_DIRS |
    xargs grep $@ -P '}\s*//+\s*namespace\s*'

@ -31,23 +31,23 @@ find $ROOT_PATH/{dbms,base} -name '*.h' -or -name '*.cpp' 2>/dev/null |
 find -L $ROOT_PATH -type l 2>/dev/null | grep -v contrib && echo "^ Broken symlinks found"

 # Double whitespaces
-find $ROOT_PATH/{dbms,base} -name '*.h' -or -name '*.cpp' 2>/dev/null | while read i; do $ROOT_PATH/utils/check-style/double-whitespaces.pl < $i || echo -e "^ File $i contains double whitespaces\n"; done
+find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/null | while read i; do $ROOT_PATH/utils/check-style/double-whitespaces.pl < $i || echo -e "^ File $i contains double whitespaces\n"; done

 # Unused ErrorCodes
 # NOTE: to fix automatically, replace echo with:
 # sed -i "/extern const int $code/d" $file
-find $ROOT_PATH/{dbms,base} -name '*.h' -or -name '*.cpp' | xargs grep -l -P 'extern const int [_A-Z]+' | while read file; do grep -P 'extern const int [_A-Z]+;' $file | sed -r -e 's/^.*?extern const int ([_A-Z]+);.*?$/\1/' | while read code; do grep -q "ErrorCodes::$code" $file || echo "ErrorCode $code is defined but not used in file $file"; done; done
+find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | xargs grep -l -P 'extern const int [_A-Z]+' | while read file; do grep -P 'extern const int [_A-Z]+;' $file | sed -r -e 's/^.*?extern const int ([_A-Z]+);.*?$/\1/' | while read code; do grep -q "ErrorCodes::$code" $file || echo "ErrorCode $code is defined but not used in file $file"; done; done

 # Undefined ErrorCodes
 # NOTE: to fix automatically, replace echo with:
 # ( grep -q -F 'namespace ErrorCodes' $file && sed -i -r "0,/(\s*)extern const int [_A-Z]+/s//\1extern const int $code;\n&/" $file || awk '{ print; if (ns == 1) { ns = 2 }; if (ns == 2) { ns = 0; print "namespace ErrorCodes\n{\n    extern const int '$code';\n}" } }; /namespace DB/ { ns = 1; };' < $file > ${file}.tmp && mv ${file}.tmp $file )
-find $ROOT_PATH/{dbms,base} -name '*.h' -or -name '*.cpp' | xargs grep -l -P 'ErrorCodes::[_A-Z]+' | while read file; do grep -P 'ErrorCodes::[_A-Z]+' $file | sed -r -e 's/^.*?ErrorCodes::([_A-Z]+).*?$/\1/' | while read code; do grep -q "extern const int $code" $file || echo "ErrorCode $code is used in file $file but not defined"; done; done
+find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | xargs grep -l -P 'ErrorCodes::[_A-Z]+' | while read file; do grep -P 'ErrorCodes::[_A-Z]+' $file | sed -r -e 's/^.*?ErrorCodes::([_A-Z]+).*?$/\1/' | while read code; do grep -q "extern const int $code" $file || echo "ErrorCode $code is used in file $file but not defined"; done; done

 # Duplicate ErrorCodes
-find $ROOT_PATH/{dbms,base} -name '*.h' -or -name '*.cpp' | xargs grep -l -P 'ErrorCodes::[_A-Z]+' | while read file; do grep -P 'extern const int [_A-Z]+;' $file | sort | uniq -c | grep -v -P ' +1 ' && echo "Duplicate ErrorCode in file $file"; done
+find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | xargs grep -l -P 'ErrorCodes::[_A-Z]+' | while read file; do grep -P 'extern const int [_A-Z]+;' $file | sort | uniq -c | grep -v -P ' +1 ' && echo "Duplicate ErrorCode in file $file"; done

 # Three or more consecutive empty lines
-find $ROOT_PATH/{dbms,base} -name '*.h' -or -name '*.cpp' | while read file; do awk '/^$/ { ++i; if (i > 2) { print "More than two consecutive empty lines in file '$file'" } } /./ { i = 0 }' $file; done
+find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | while read file; do awk '/^$/ { ++i; if (i > 2) { print "More than two consecutive empty lines in file '$file'" } } /./ { i = 0 }' $file; done

 # Broken XML files (requires libxml2-utils)
-find $ROOT_PATH/{dbms,base} -name '*.xml' | xargs xmllint --noout --nonet
+find $ROOT_PATH/{src,base,programs,utils} -name '*.xml' | xargs xmllint --noout --nonet
--- a/utils/compressor/CMakeLists.txt
+++ b/utils/compressor/CMakeLists.txt
@ -1,5 +1,2 @@
-add_executable (mutator mutator.cpp)
-target_link_libraries(mutator PRIVATE clickhouse_common_io)
-
 add_executable (decompress_perf decompress_perf.cpp)
 target_link_libraries(decompress_perf PRIVATE dbms ${LZ4_LIBRARY})
--- a/utils/compressor/mutator.cpp
+++ b/utils/compressor/mutator.cpp
@ -1,406 +0,0 @@
-#include <string.h>
-#include <random>
-#include <pcg_random.hpp>
-#include <common/types.h>
-
-#include <IO/ReadBuffer.h>
-#include <IO/ReadBufferFromFileDescriptor.h>
-#include <IO/WriteBufferFromFileDescriptor.h>
-#include <IO/BufferWithOwnMemory.h>
-#include <Compression/CompressionInfo.h>
-#include <IO/WriteHelpers.h>
-#include <IO/copyData.h>
-#include <Common/PODArray.h>
-
-/** Quick and dirty implementation of data scrambler.
-  *
-  * The task is to replace the data with pseudorandom values.
-  * But with keeping some probability distributions
-  *  and with maintaining the same compression ratio.
-  *
-  * The solution is to operate directly on compressed LZ4 stream.
-  * The stream consists of independent compressed blocks.
-  * Each block is a stream of "literals" and "matches".
-  * Literal is an instruction to literally put some following bytes,
-  *  and match is an instruction to copy some bytes that were already seen before.
-  *
-  * We get literals and apply some scramble operation on them.
-  * But we keep literal length and matches without changes.
-  *
-  * That's how we get pseudorandom data but with keeping
-  *  all repetitive patterns and maintaining the same compression ratio.
-  *
-  * Actually, if you decompress scrambled data and compress it again, the compression ratio
-  *  becomes slightly worse, because LZ4 uses a simple match finder based on the value of a hash function,
-  *  and it can find different matches due to collisions in the hash function.
-  *
-  * The scramble operation replaces literals with pseudorandom bytes,
-  *  but with some heuristics to keep some sort of data structure.
-  *
-  * It is an open question whether this scrambles the data enough and whether it is safe to publish scrambled data.
-  * In general, you should assume that it is not safe.
-  */
-
-
-#define ML_BITS  4
-#define ML_MASK  ((1U<<ML_BITS)-1)
-#define RUN_BITS (8-ML_BITS)
-#define RUN_MASK ((1U<<RUN_BITS)-1)
-
-#define MINMATCH 4
-#define WILDCOPYLENGTH 8
-#define LASTLITERALS 5
-
-
-static UInt8 rand(pcg64 & generator, UInt8 min, UInt8 max)
-{
-    return min + generator() % (max + 1 - min);
-}
-
-static void mutate(pcg64 & generator, void * src, size_t length)
-{
-    UInt8 * pos = static_cast<UInt8 *>(src);
-    UInt8 * end = pos + length;
-
-    while (pos < end)
-    {
-        if (pos + strlen("https") <= end && 0 == memcmp(pos, "https", strlen("https")))
-        {
-            pos += strlen("https");
-            continue;
-        }
-
-        if (pos + strlen("http") <= end && 0 == memcmp(pos, "http", strlen("http")))
-        {
-            pos += strlen("http");
-            continue;
-        }
-
-        if (pos + strlen("www") <= end && 0 == memcmp(pos, "www", strlen("www")))
-        {
-            pos += strlen("www");
-            continue;
-        }
-
-        if (*pos >= '1' && *pos <= '9')
-            *pos = rand(generator, '1', '9');
-        else if (*pos >= 'a' && *pos <= 'z')
-            *pos = rand(generator, 'a', 'z');
-        else if (*pos >= 'A' && *pos <= 'Z')
-            *pos = rand(generator, 'A', 'Z');
-        else if (*pos >= 0x80 && *pos <= 0xBF)
-            *pos = rand(generator, *pos & 0xF0U, *pos | 0x0FU);
-        else if (*pos == '\\')
-            ++pos;
-
-        ++pos;
-    }
-
-    pos = static_cast<UInt8 *>(src);
-    while (pos < end)
-    {
-        if (pos + 3 <= end
-            && isAlphaASCII(pos[0])
-            && !isAlphaASCII(pos[1]) && pos[1] != '\\' && pos[1] >= 0x20
-            && isAlphaASCII(pos[2]))
-        {
-            auto res = rand(generator, 0, 3);
-            if (res == 2)
-            {
-                std::swap(pos[0], pos[1]);
-            }
-            else if (res == 3)
-                std::swap(pos[1], pos[2]);
-
-            pos += 3;
-        }
-        else if (pos + 5 <= end
-            && pos[0] >= 0xC0 && pos[0] <= 0xDF && pos[1] >= 0x80 && pos[1] <= 0xBF
-            && pos[2] >= 0x20 && pos[2] < 0x80 && !isAlphaASCII(pos[2])
-            && pos[3] >= 0xC0 && pos[3] <= 0xDF && pos[4] >= 0x80 && pos[4] <= 0xBF)
-        {
-            auto res = rand(generator, 0, 3);
-            if (res == 2)
-            {
-                std::swap(pos[1], pos[2]);
-                std::swap(pos[0], pos[1]);
-            }
-            else if (res == 3)
-            {
-                std::swap(pos[3], pos[2]);
-                std::swap(pos[4], pos[3]);
-            }
-
-            pos += 5;
-        }
-        else
-            ++pos;
-    }
-}
-
-
-static void LZ4_copy8(void* dst, const void* src)
-{
-    memcpy(dst,src,8);
-}
-
-/* Customized variant of memcpy, which can overwrite up to 8 bytes beyond dstEnd.
- * It copies from srcPtr to dstPtr in 8-byte steps (via LZ4_copy8) and stops once the
- * write position is no longer below dstEnd. The loop is a do/while, so one 8-byte
- * copy is always performed, even when dstPtr is already at or past dstEnd. */
-static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd)
-{
-    UInt8* d = (UInt8*)dstPtr;
-    const UInt8* s = (const UInt8*)srcPtr;
-    UInt8* const e = (UInt8*)dstEnd;
-
-    do { LZ4_copy8(d,s); d+=8; s+=8; } while (d<e);
-}
-
-
-static UInt16 LZ4_read16(const void* memPtr)
-{
-    UInt16 val; memcpy(&val, memPtr, sizeof(val)); return val;
-}
-
-
-static void LZ4_write32(void* memPtr, UInt32 value)
-{
-    memcpy(memPtr, &value, sizeof(value));
-}
-
-
-int LZ4_decompress_mutate(
-                 char* const source,
-                 char* const dest,
-                 int outputSize)
-{
-    pcg64 generator;
-
-    /* Local Variables */
-    UInt8* ip = (UInt8*) source;
-
-    UInt8* op = (UInt8*) dest;
-    UInt8* const oend = op + outputSize;
-    UInt8* cpy;
-
-    const unsigned dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4};
-    const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3};
-
-    /* Main Loop : decode sequences */
-    while (1) {
-        size_t length;
-        const UInt8* match;
-        size_t offset;
-
-        /* get literal length */
-        unsigned const token = *ip++;
-        if ((length=(token>>ML_BITS)) == RUN_MASK) {
-            unsigned s;
-            do {
-                s = *ip++;
-                length += s;
-            } while (s==255);
-        }
-
-        /* copy literals */
-        cpy = op+length;
-        if (cpy>oend-WILDCOPYLENGTH)
-        {
-            if (cpy != oend) goto _output_error;       /* Error : block decoding must stop exactly there */
-            mutate(generator, ip, length);
-            memcpy(op, ip, length);
-            ip += length;
-            op += length;
-            break;     /* Necessarily EOF, due to parsing restrictions */
-        }
-        mutate(generator, ip, cpy - op);
-        LZ4_wildCopy(op, ip, cpy);
-        ip += length; op = cpy;
-
-        /* get offset */
-        offset = LZ4_read16(ip); ip+=2;
-        match = op - offset;
-        LZ4_write32(op, (UInt32)offset);   /* costs ~1%; silence an msan warning when offset==0 */
-
-        /* get matchlength */
-        length = token & ML_MASK;
-        if (length == ML_MASK) {
-            unsigned s;
-            do {
-                s = *ip++;
-                length += s;
-            } while (s==255);
-        }
-        length += MINMATCH;
-
-        /* copy match within block */
-        cpy = op + length;
-        if (unlikely(offset<8)) {
-            const int dec64 = dec64table[offset];
-            op[0] = match[0];
-            op[1] = match[1];
-            op[2] = match[2];
-            op[3] = match[3];
-            match += dec32table[offset];
-            memcpy(op+4, match, 4);
-            match -= dec64;
-        } else { LZ4_copy8(op, match); match+=8; }
-        op += 8;
-
-        if (unlikely(cpy>oend-12)) {
-            UInt8* const oCopyLimit = oend-(WILDCOPYLENGTH-1);
-            if (cpy > oend-LASTLITERALS) goto _output_error;    /* Error : last LASTLITERALS bytes must be literals (uncompressed) */
-            if (op < oCopyLimit) {
-                LZ4_wildCopy(op, match, oCopyLimit);
-                match += oCopyLimit - op;
-                op = oCopyLimit;
-            }
-            while (op<cpy) *op++ = *match++;
-        } else {
-            LZ4_copy8(op, match);
-            if (length>16) LZ4_wildCopy(op+8, match+8, cpy);
-        }
-        op=cpy;   /* correction */
-    }
-
-    return (int) (((const char*)ip)-source);   /* Nb of input bytes read */
-
-    /* Overflow error detected */
-_output_error:
-    return (int) (-(((const char*)ip)-source))-1;
-}
-
-
-namespace DB
-{
-
-namespace ErrorCodes
-{
-    extern const int UNKNOWN_COMPRESSION_METHOD;
-    extern const int TOO_LARGE_SIZE_COMPRESSED;
-    extern const int CANNOT_DECOMPRESS;
-}
-
-class MutatingCompressedReadBufferBase
-{
-protected:
-    ReadBuffer * compressed_in;
-
-    /// If 'compressed_in' buffer has whole compressed block - then use it. Otherwise copy parts of data to 'own_compressed_buffer'.
-    PODArray<char> own_compressed_buffer;
-    /// Points to memory, holding compressed block.
-    char * compressed_buffer = nullptr;
-
-    size_t readCompressedData(size_t & size_decompressed, size_t & size_compressed_without_checksum)
-    {
-        if (compressed_in->eof())
-            return 0;
-
-        CityHash_v1_0_2::uint128 checksum;
-        compressed_in->readStrict(reinterpret_cast<char *>(&checksum), sizeof(checksum));
-
-        own_compressed_buffer.resize(COMPRESSED_BLOCK_HEADER_SIZE);
-        compressed_in->readStrict(&own_compressed_buffer[0], COMPRESSED_BLOCK_HEADER_SIZE);
-
-        UInt8 method = own_compressed_buffer[0];    /// See CompressedWriteBuffer.h
-
-        size_t & size_compressed = size_compressed_without_checksum;
-
-        if (method == static_cast<UInt8>(CompressionMethodByte::LZ4) ||
-            method == static_cast<UInt8>(CompressionMethodByte::ZSTD) ||
-            method == static_cast<UInt8>(CompressionMethodByte::NONE))
-        {
-            size_compressed = unalignedLoad<UInt32>(&own_compressed_buffer[1]);
-            size_decompressed = unalignedLoad<UInt32>(&own_compressed_buffer[5]);
-        }
-        else
-            throw Exception("Unknown compression method: " + toString(method), ErrorCodes::UNKNOWN_COMPRESSION_METHOD);
-
-        if (size_compressed > DBMS_MAX_COMPRESSED_SIZE)
-            throw Exception("Too large size_compressed. Most likely corrupted data.", ErrorCodes::TOO_LARGE_SIZE_COMPRESSED);
-
-        /// Is whole compressed block located in 'compressed_in' buffer?
-        if (compressed_in->offset() >= COMPRESSED_BLOCK_HEADER_SIZE &&
-            compressed_in->position() + size_compressed - COMPRESSED_BLOCK_HEADER_SIZE <= compressed_in->buffer().end())
-        {
-            compressed_in->position() -= COMPRESSED_BLOCK_HEADER_SIZE;
-            compressed_buffer = compressed_in->position();
-            compressed_in->position() += size_compressed;
-        }
-        else
-        {
-            own_compressed_buffer.resize(size_compressed);
-            compressed_buffer = &own_compressed_buffer[0];
-            compressed_in->readStrict(compressed_buffer + COMPRESSED_BLOCK_HEADER_SIZE, size_compressed - COMPRESSED_BLOCK_HEADER_SIZE);
-        }
-
-        return size_compressed + sizeof(checksum);
-    }
-
-    void decompress(char * to, size_t size_decompressed, size_t size_compressed_without_checksum)
-    {
-        UInt8 method = compressed_buffer[0];    /// See CompressedWriteBuffer.h
-
-        if (method == static_cast<UInt8>(CompressionMethodByte::LZ4))
-        {
-            if (LZ4_decompress_mutate(compressed_buffer + COMPRESSED_BLOCK_HEADER_SIZE, to, size_decompressed) < 0)
-                throw Exception("Cannot LZ4_decompress_fast", ErrorCodes::CANNOT_DECOMPRESS);
-        }
-        else
-            throw Exception("Unknown compression method: " + toString(method), ErrorCodes::UNKNOWN_COMPRESSION_METHOD);
-    }
-
-public:
-    /// 'compressed_in' could be initialized lazily, but before first call of 'readCompressedData'.
-    MutatingCompressedReadBufferBase(ReadBuffer * in = nullptr)
-        : compressed_in(in), own_compressed_buffer(COMPRESSED_BLOCK_HEADER_SIZE)
-    {
-    }
-};
-
-
-class MutatingCompressedReadBuffer : public MutatingCompressedReadBufferBase, public BufferWithOwnMemory<ReadBuffer>
-{
-private:
-    size_t size_compressed = 0;
-
-    bool nextImpl() override
-    {
-        size_t size_decompressed;
-        size_t size_compressed_without_checksum;
-        size_compressed = readCompressedData(size_decompressed, size_compressed_without_checksum);
-        if (!size_compressed)
-            return false;
-
-        memory.resize(size_decompressed);
-        working_buffer = Buffer(&memory[0], &memory[size_decompressed]);
-
-        decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum);
-
-        return true;
-    }
-
-public:
-    MutatingCompressedReadBuffer(ReadBuffer & in_)
-        : MutatingCompressedReadBufferBase(&in_), BufferWithOwnMemory<ReadBuffer>(0)
-    {
-    }
-};
-
-}
-
-
-int main(int, char **)
-try
-{
-    DB::ReadBufferFromFileDescriptor in(STDIN_FILENO);
-    DB::MutatingCompressedReadBuffer mutating_in(in);
-    DB::WriteBufferFromFileDescriptor out(STDOUT_FILENO);
-
-    DB::copyData(mutating_in, out);
-
-    return 0;
-}
-catch (...)
-{
-    std::cerr << DB::getCurrentExceptionMessage(true);
-    return DB::getCurrentExceptionCode();
-}
--- a/utils/iotest/iotest.cpp
+++ b/utils/iotest/iotest.cpp
@ -113,7 +113,7 @@ int mainImpl(int argc, char ** argv)
    for (int i = 0; argv[2][i]; ++i)
    {
        char c = argv[2][i];
-        switch(c)
+        switch (c)
        {
            case 'r':
                mode |= MODE_READ;
--- a/utils/iotest/iotest_aio.cpp
+++ b/utils/iotest/iotest_aio.cpp
@ -15,9 +15,6 @@ int main(int, char **) { return 0; }
 #include <Common/Stopwatch.h>
 #include <IO/BufferWithOwnMemory.h>
 #include <IO/ReadHelpers.h>
-#include <stdlib.h>
-#include <fcntl.h>
-#include <stdlib.h>
 #include <stdio.h>
 #include <sys/stat.h>
 #include <sys/types.h>