Use pcg/randomSeed() random generator/seed instead of std::mt19937/std::random_device

This commit is contained in:
Robert Schulze 2023-11-07 18:49:45 +00:00
parent 4db2e25ca2
commit e46dbcb2f1
No known key found for this signature in database
GPG Key ID: 26703B55FB13728A
12 changed files with 434 additions and 424 deletions

View File

@ -5,6 +5,7 @@
#include <Common/ZooKeeper/ZooKeeper.h>
#include <Common/ZooKeeper/KeeperException.h>
#include <Common/randomSeed.h>
#include <Common/setThreadName.h>
#include <Common/CurrentMetrics.h>
#include <Interpreters/InterpreterInsertQuery.h>
@ -59,7 +60,7 @@ void ClusterCopier::init()
getContext()->setClustersConfig(task_cluster_current_config, false, task_cluster->clusters_prefix);
/// Set up shards and their priority
task_cluster->random_engine.seed(task_cluster->random_device());
task_cluster->random_engine.seed(randomSeed());
for (auto & task_table : task_cluster->table_tasks)
{
task_table.cluster_pull = getContext()->getCluster(task_table.cluster_pull_name);

View File

@ -7,7 +7,7 @@
#include <Poco/Util/AbstractConfiguration.h>
#include <random>
#include <pcg_random.hpp>
namespace DB
{
@ -45,7 +45,6 @@ struct TaskCluster
/// Subtasks
TasksTable table_tasks;
std::random_device random_device;
pcg64 random_engine;
};

View File

@ -7,6 +7,7 @@
#include <Common/logger_useful.h>
#include <Common/randomSeed.h>
#include "Coordination/KeeperConstants.h"
#include <pcg_random.hpp>
namespace DB
{
@ -42,7 +43,7 @@ public:
}
private:
std::mt19937_64 rndgen;
pcg64_fast rndgen;
std::bernoulli_distribution distribution;
};

View File

@ -2,7 +2,9 @@
#include <ctime>
#include <random>
#include <thread>
#include <pcg_random.hpp>
#include <mysqlxx/PoolWithFailover.h>
#include <Common/randomSeed.h>
#include <IO/WriteBufferFromString.h>
#include <IO/Operators.h>
@ -44,10 +46,7 @@ PoolWithFailover::PoolWithFailover(
/// PoolWithFailover objects are stored in a cache inside PoolFactory.
/// This cache is reset by ExternalDictionariesLoader after every SYSTEM RELOAD DICTIONAR{Y|IES}
/// which triggers massive re-constructing of connection pools.
/// The state of PRNGs like std::mt19937 is considered to be quite heavy
/// thus here we attempt to optimize its construction.
static thread_local std::mt19937 rnd_generator(static_cast<uint_fast32_t>(
std::hash<std::thread::id>{}(std::this_thread::get_id()) + std::clock()));
static thread_local pcg64_fast rnd_generator(randomSeed());
for (auto & [_, replicas] : replicas_by_priority)
{
if (replicas.size() > 1)

View File

@ -13,10 +13,10 @@
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_CLOCK_GETTIME;
}
namespace ErrorCodes
{
extern const int CANNOT_CLOCK_GETTIME;
}
}

View File

@ -6,6 +6,7 @@
#include <Compression/CompressionFactory.h>
#include <Compression/CompressionInfo.h>
#include <Poco/Logger.h>
#include <Common/randomSeed.h>
#include <Common/logger_useful.h>
#include "libaccel_config.h"
#include <Common/MemorySanitizer.h>
@ -29,7 +30,7 @@ DeflateQplJobHWPool & DeflateQplJobHWPool::instance()
DeflateQplJobHWPool::DeflateQplJobHWPool()
: max_hw_jobs(0)
, random_engine(std::random_device()())
, random_engine(randomSeed())
{
Poco::Logger * log = &Poco::Logger::get("DeflateQplJobHWPool");
const char * qpl_version = qpl_get_library_version();

View File

@ -3,6 +3,7 @@
#include <Compression/ICompressionCodec.h>
#include <map>
#include <random>
#include <pcg_random.hpp>
#include <qpl/qpl.h>
namespace Poco
@ -41,7 +42,7 @@ private:
std::unique_ptr<std::atomic_bool[]> hw_job_ptr_locks;
bool job_pool_ready;
std::mt19937 random_engine;
pcg64_fast random_engine;
std::uniform_int_distribution<int> distribution;
};

View File

@ -26,12 +26,14 @@
#include <Interpreters/executeQuery.h>
#include <Storages/StorageMergeTree.h>
#include <Common/quoteString.h>
#include <Common/randomSeed.h>
#include <Common/setThreadName.h>
#include <base/sleep.h>
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/trim.hpp>
#include <Parsers/CommonParsers.h>
#include <Parsers/ASTIdentifier.h>
#include <pcg_random.hpp>
namespace DB
{
@ -428,9 +430,8 @@ static inline void dumpDataForTables(
static inline UInt32 randomNumber()
{
std::mt19937 rng;
rng.seed(std::random_device()());
std::uniform_int_distribution<std::mt19937::result_type> dist6(
pcg64_fast rng{randomSeed()};
std::uniform_int_distribution<pcg64_fast::result_type> dist6(
std::numeric_limits<UInt32>::min(), std::numeric_limits<UInt32>::max());
return static_cast<UInt32>(dist6(rng));
}

View File

@ -4,6 +4,7 @@
#include <Common/isLocalAddress.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/parseAddress.h>
#include <Common/randomSeed.h>
#include <Common/Config/AbstractConfigurationComparison.h>
#include <Common/Config/ConfigHelper.h>
#include <Core/Settings.h>
@ -16,6 +17,7 @@
#include <boost/range/algorithm_ext/erase.hpp>
#include <span>
#include <pcg_random.hpp>
namespace DB
{
@ -660,8 +662,7 @@ namespace
void shuffleReplicas(std::vector<Cluster::Address> & replicas, const Settings & settings, size_t replicas_needed)
{
std::random_device rd;
std::mt19937 gen{rd()};
pcg64_fast gen{randomSeed()};
if (settings.prefer_localhost_replica)
{

View File

@ -14,24 +14,25 @@
/**
#include <Common/randomSeed.h>
#include <fstream>
#include <random>
#include <pcg_random.hpp>
using namespace std;
int main()
{
std::string s;
std::random_device dev;
std::mt19937 rng(dev());
std::uniform_int_distribution<std::mt19937::result_type> dist(0, 25);
std::binomial_distribution<std::mt19937::result_type> binomial1(100, 0.01);
std::binomial_distribution<std::mt19937::result_type> binomial2(100, 0.02);
std::binomial_distribution<std::mt19937::result_type> binomial4(100, 0.04);
std::binomial_distribution<std::mt19937::result_type> binomial8(100, 0.08);
std::binomial_distribution<std::mt19937::result_type> binomial16(100, 0.16);
std::binomial_distribution<std::mt19937::result_type> binomial24(100, 0.24);
std::binomial_distribution<std::mt19937::result_type> binomial48(100, 0.48);
pcg64_fast rng{randomSeed()};
std::uniform_int_distribution<pcg64_fast::result_type> dist(0, 25);
std::binomial_distribution<pcg64_fast::result_type> binomial1(100, 0.01);
std::binomial_distribution<pcg64_fast::result_type> binomial2(100, 0.02);
std::binomial_distribution<pcg64_fast::result_type> binomial4(100, 0.04);
std::binomial_distribution<pcg64_fast::result_type> binomial8(100, 0.08);
std::binomial_distribution<pcg64_fast::result_type> binomial16(100, 0.16);
std::binomial_distribution<pcg64_fast::result_type> binomial24(100, 0.24);
std::binomial_distribution<pcg64_fast::result_type> binomial48(100, 0.48);
// 11GB
std::ofstream f("/tmp/terms.csv");
size_t l1, l2, l4, l8, l16, l24, l48;

View File

@ -7,9 +7,10 @@
#include "PostgreSQLHandler.h"
#include <Parsers/parseQuery.h>
#include <Server/TCPServer.h>
#include <Common/randomSeed.h>
#include <Common/setThreadName.h>
#include <base/scope_guard.h>
#include <random>
#include <pcg_random.hpp>
#include "config_version.h"
@ -284,8 +285,7 @@ void PostgreSQLHandler::processQuery()
if (!parse_res.second)
throw Exception(ErrorCodes::SYNTAX_ERROR, "Cannot parse and execute the following part of query: {}", String(parse_res.first));
std::random_device rd;
std::mt19937 gen(rd());
pcg64_fast gen{randomSeed()};
std::uniform_int_distribution<Int32> dis(0, INT32_MAX);
for (const auto & spl_query : queries)

View File

@ -16,404 +16,409 @@ LC_ALL="en_US.UTF-8"
ROOT_PATH=$(git rev-parse --show-toplevel)
EXCLUDE_DIRS='build/|integration/|widechar_width/|glibc-compatibility/|poco/|memcpy/|consistent-hashing|benchmark|tests/.*.cpp|utils/keeper-bench/example.yaml'
# From [1]:
# But since array_to_string_internal() in array.c still loops over array
# elements and concatenates them into a string, it's probably not more
# efficient than the looping solutions proposed, but it's more readable.
# # From [1]:
# # But since array_to_string_internal() in array.c still loops over array
# # elements and concatenates them into a string, it's probably not more
# # efficient than the looping solutions proposed, but it's more readable.
# #
# # [1]: https://stackoverflow.com/a/15394738/328260
# function in_array()
# {
# local IFS="|"
# local value=$1 && shift
#
# [1]: https://stackoverflow.com/a/15394738/328260
# in_array VALUE [ELEMENT...]
# Returns success (exit status 0) when VALUE is equal to one of the ELEMENT arguments,
# and failure otherwise. The status is that of the final [[ ... ]] test.
function in_array()
{
# With IFS set to "|", "${*}" expands to all remaining arguments joined by "|".
local IFS="|"
# Take the needle from the first argument, then drop it so "${*}" holds only the haystack.
local value=$1 && shift
# Both sides are wrapped in the delimiter so only whole elements can match.
# The right-hand side is quoted, so it is compared as a literal string rather than as a regex.
[[ "${IFS}${*}${IFS}" =~ "${IFS}${value}${IFS}" ]]
}
find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/null |
grep -vP $EXCLUDE_DIRS |
xargs grep $@ -P '((class|struct|namespace|enum|if|for|while|else|throw|switch).*|\)(\s*const)?(\s*override)?\s*)\{$|\s$|^ {1,3}[^\* ]\S|\t|^\s*(if|else if|if constexpr|else if constexpr|for|while|catch|switch)\(|\( [^\s\\]|\S \)' |
# a curly brace not in a new line, but not for the case of C++11 init or agg. initialization | trailing whitespace | number of ws not a multiple of 4, but not in the case of comment continuation | missing whitespace after for/if/while... before opening brace | whitespaces inside braces
grep -v -P '(//|:\s+\*|\$\(\()| \)"'
# single-line comment | continuation of a multiline comment | a typical piece of embedded shell code | something like ending of raw string literal
# Tabs
find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/null |
grep -vP $EXCLUDE_DIRS |
xargs grep $@ -F $'\t'
# // namespace comments are unneeded
find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/null |
grep -vP $EXCLUDE_DIRS |
xargs grep $@ -P '}\s*//+\s*namespace\s*'
# Broken symlinks
find -L $ROOT_PATH -type l 2>/dev/null | grep -v contrib && echo "^ Broken symlinks found"
# Double whitespaces
find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/null |
grep -vP $EXCLUDE_DIRS |
while read i; do $ROOT_PATH/utils/check-style/double-whitespaces.pl < $i || echo -e "^ File $i contains double whitespaces\n"; done
# Unused/Undefined/Duplicates ErrorCodes/ProfileEvents/CurrentMetrics
declare -A EXTERN_TYPES
EXTERN_TYPES[ErrorCodes]=int
EXTERN_TYPES[ProfileEvents]=Event
EXTERN_TYPES[CurrentMetrics]=Metric
EXTERN_TYPES_EXCLUDES=(
ProfileEvents::global_counters
ProfileEvents::Event
ProfileEvents::Count
ProfileEvents::Counters
ProfileEvents::end
ProfileEvents::increment
ProfileEvents::incrementForLogMessage
ProfileEvents::getName
ProfileEvents::Type
ProfileEvents::TypeEnum
ProfileEvents::dumpToMapColumn
ProfileEvents::getProfileEvents
ProfileEvents::ThreadIdToCountersSnapshot
ProfileEvents::LOCAL_NAME
ProfileEvents::CountersIncrement
CurrentMetrics::add
CurrentMetrics::sub
CurrentMetrics::get
CurrentMetrics::set
CurrentMetrics::end
CurrentMetrics::Increment
CurrentMetrics::Metric
CurrentMetrics::values
CurrentMetrics::Value
ErrorCodes::ErrorCode
ErrorCodes::getName
ErrorCodes::increment
ErrorCodes::end
ErrorCodes::values
ErrorCodes::values[i]
ErrorCodes::getErrorCodeByName
)
for extern_type in ${!EXTERN_TYPES[@]}; do
type_of_extern=${EXTERN_TYPES[$extern_type]}
allowed_chars='[_A-Za-z]+'
# Unused
# NOTE: to fix automatically, replace echo with:
# sed -i "/extern const $type_of_extern $val/d" $file
find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | {
# NOTE: the check is pretty dumb and distinguishes only by the type_of_extern,
# and this matches with zkutil::CreateMode
grep -v 'src/Common/ZooKeeper/Types.h'
} | {
grep -vP $EXCLUDE_DIRS | xargs grep -l -P "extern const $type_of_extern $allowed_chars"
} | while read file; do
grep -P "extern const $type_of_extern $allowed_chars;" $file | sed -r -e "s/^.*?extern const $type_of_extern ($allowed_chars);.*?$/\1/" | while read val; do
if ! grep -q "$extern_type::$val" $file; then
# Excludes for SOFTWARE_EVENT/HARDWARE_EVENT/CACHE_EVENT in ThreadProfileEvents.cpp
if [[ ! $extern_type::$val =~ ProfileEvents::Perf.* ]]; then
echo "$extern_type::$val is defined but not used in file $file"
fi
fi
done
done
# Undefined
# NOTE: to fix automatically, replace echo with:
# ( grep -q -F 'namespace $extern_type' $file && \
# sed -i -r "0,/(\s*)extern const $type_of_extern [$allowed_chars]+/s//\1extern const $type_of_extern $val;\n&/" $file || \
# awk '{ print; if (ns == 1) { ns = 2 }; if (ns == 2) { ns = 0; print "namespace $extern_type\n{\n extern const $type_of_extern '$val';\n}" } }; /namespace DB/ { ns = 1; };' < $file > ${file}.tmp && mv ${file}.tmp $file )
find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | {
grep -vP $EXCLUDE_DIRS | xargs grep -l -P "$extern_type::$allowed_chars"
} | while read file; do
grep -P "$extern_type::$allowed_chars" $file | grep -P -v '^\s*//' | sed -r -e "s/^.*?$extern_type::($allowed_chars).*?$/\1/" | while read val; do
if ! grep -q "extern const $type_of_extern $val" $file; then
if ! in_array "$extern_type::$val" "${EXTERN_TYPES_EXCLUDES[@]}"; then
echo "$extern_type::$val is used in file $file but not defined"
fi
fi
done
done
# Duplicates
find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | {
grep -vP $EXCLUDE_DIRS | xargs grep -l -P "$extern_type::$allowed_chars"
} | while read file; do
grep -P "extern const $type_of_extern $allowed_chars;" $file | sort | uniq -c | grep -v -P ' +1 ' && echo "Duplicate $extern_type in file $file"
done
done
# Three or more consecutive empty lines
find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' |
grep -vP $EXCLUDE_DIRS |
while read file; do awk '/^$/ { ++i; if (i > 2) { print "More than two consecutive empty lines in file '$file'" } } /./ { i = 0 }' $file; done
# Broken XML files (requires libxml2-utils)
find $ROOT_PATH/{src,base,programs,utils} -name '*.xml' |
grep -vP $EXCLUDE_DIRS |
xargs xmllint --noout --nonet
# FIXME: for now only clickhouse-test
pylint --rcfile=$ROOT_PATH/.pylintrc --persistent=no --score=n $ROOT_PATH/tests/clickhouse-test $ROOT_PATH/tests/ci/*.py
find $ROOT_PATH -not -path $ROOT_PATH'/contrib*' \( -name '*.yaml' -or -name '*.yml' \) -type f |
grep -vP $EXCLUDE_DIRS |
xargs yamllint --config-file=$ROOT_PATH/.yamllint
# Tests should not be named with "fail" in their names. It makes looking at the results less convenient.
find $ROOT_PATH/tests/queries -iname '*fail*' |
grep . && echo 'Tests should not be named with "fail" in their names. It makes looking at the results less convenient when you search for "fail" substring in browser.'
# Queries to system.query_log/system.query_thread_log should have current_database = currentDatabase() condition
# NOTE: it is not that accurate, but at least something.
tests_with_query_log=( $(
find $ROOT_PATH/tests/queries -iname '*.sql' -or -iname '*.sh' -or -iname '*.py' -or -iname '*.j2' |
xargs grep --with-filename -e system.query_log -e system.query_thread_log | cut -d: -f1 | sort -u
) )
for test_case in "${tests_with_query_log[@]}"; do
grep -qE current_database.*currentDatabase "$test_case" || {
grep -qE 'current_database.*\$CLICKHOUSE_DATABASE' "$test_case"
} || echo "Queries to system.query_log/system.query_thread_log does not have current_database = currentDatabase() condition in $test_case"
done
# There shouldn't be large jumps between test numbers (since they should be consecutive)
max_diff=$(
find $ROOT_PATH/tests/queries -iname '*.sql' -or -iname '*.sh' -or -iname '*.py' -or -iname '*.j2' |
grep -oP '\d+\D+\K\d+' | sort -n -u | awk 's{print ($0-s) " diff " s " and " $0 }{s=$0}' | sort -n | tail -n 1
)
max_diff_value=( $(echo $max_diff) )
if [[ $max_diff_value -ge 100 ]];
then
echo "Too big of a difference between test numbers: $max_diff"
fi
# Queries to:
tables_with_database_column=(
system.tables
system.parts
system.detached_parts
system.parts_columns
system.columns
system.projection_parts
system.mutations
)
# should have database = currentDatabase() condition
# [[ "${IFS}${*}${IFS}" =~ "${IFS}${value}${IFS}" ]]
# }
#
# NOTE: it is not that accurate, but at least something.
tests_with_database_column=( $(
find $ROOT_PATH/tests/queries -iname '*.sql' -or -iname '*.sh' -or -iname '*.py' -or -iname '*.j2' |
xargs grep --with-filename $(printf -- "-e %s " "${tables_with_database_column[@]}") |
grep -v -e ':--' -e ':#' |
cut -d: -f1 | sort -u
) )
for test_case in "${tests_with_database_column[@]}"; do
grep -qE database.*currentDatabase "$test_case" || {
grep -qE 'database.*\$CLICKHOUSE_DATABASE' "$test_case"
} || {
# explicit database
grep -qE "database[ ]*=[ ]*'" "$test_case"
} || {
echo "Queries to ${tables_with_database_column[*]} does not have database = currentDatabase()/\$CLICKHOUSE_DATABASE condition in $test_case"
}
done
# find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/null |
# grep -vP $EXCLUDE_DIRS |
# xargs grep $@ -P '((class|struct|namespace|enum|if|for|while|else|throw|switch).*|\)(\s*const)?(\s*override)?\s*)\{$|\s$|^ {1,3}[^\* ]\S|\t|^\s*(if|else if|if constexpr|else if constexpr|for|while|catch|switch)\(|\( [^\s\\]|\S \)' |
# # a curly brace not in a new line, but not for the case of C++11 init or agg. initialization | trailing whitespace | number of ws not a multiple of 4, but not in the case of comment continuation | missing whitespace after for/if/while... before opening brace | whitespaces inside braces
# grep -v -P '(//|:\s+\*|\$\(\()| \)"'
# # single-line comment | continuation of a multiline comment | a typical piece of embedded shell code | something like ending of raw string literal
#
# # Tabs
# find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/null |
# grep -vP $EXCLUDE_DIRS |
# xargs grep $@ -F $'\t'
#
# # // namespace comments are unneeded
# find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/null |
# grep -vP $EXCLUDE_DIRS |
# xargs grep $@ -P '}\s*//+\s*namespace\s*'
#
# # Broken symlinks
# find -L $ROOT_PATH -type l 2>/dev/null | grep -v contrib && echo "^ Broken symlinks found"
#
# # Double whitespaces
# find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/null |
# grep -vP $EXCLUDE_DIRS |
# while read i; do $ROOT_PATH/utils/check-style/double-whitespaces.pl < $i || echo -e "^ File $i contains double whitespaces\n"; done
#
# # Unused/Undefined/Duplicates ErrorCodes/ProfileEvents/CurrentMetrics
# declare -A EXTERN_TYPES
# EXTERN_TYPES[ErrorCodes]=int
# EXTERN_TYPES[ProfileEvents]=Event
# EXTERN_TYPES[CurrentMetrics]=Metric
#
# EXTERN_TYPES_EXCLUDES=(
# ProfileEvents::global_counters
# ProfileEvents::Event
# ProfileEvents::Count
# ProfileEvents::Counters
# ProfileEvents::end
# ProfileEvents::increment
# ProfileEvents::incrementForLogMessage
# ProfileEvents::getName
# ProfileEvents::Type
# ProfileEvents::TypeEnum
# ProfileEvents::dumpToMapColumn
# ProfileEvents::getProfileEvents
# ProfileEvents::ThreadIdToCountersSnapshot
# ProfileEvents::LOCAL_NAME
# ProfileEvents::CountersIncrement
#
# CurrentMetrics::add
# CurrentMetrics::sub
# CurrentMetrics::get
# CurrentMetrics::set
# CurrentMetrics::end
# CurrentMetrics::Increment
# CurrentMetrics::Metric
# CurrentMetrics::values
# CurrentMetrics::Value
#
# ErrorCodes::ErrorCode
# ErrorCodes::getName
# ErrorCodes::increment
# ErrorCodes::end
# ErrorCodes::values
# ErrorCodes::values[i]
# ErrorCodes::getErrorCodeByName
# )
# for extern_type in ${!EXTERN_TYPES[@]}; do
# type_of_extern=${EXTERN_TYPES[$extern_type]}
# allowed_chars='[_A-Za-z]+'
#
# # Unused
# # NOTE: to fix automatically, replace echo with:
# # sed -i "/extern const $type_of_extern $val/d" $file
# find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | {
# # NOTE: the check is pretty dumb and distinguish only by the type_of_extern,
# # and this matches with zkutil::CreateMode
# grep -v 'src/Common/ZooKeeper/Types.h'
# } | {
# grep -vP $EXCLUDE_DIRS | xargs grep -l -P "extern const $type_of_extern $allowed_chars"
# } | while read file; do
# grep -P "extern const $type_of_extern $allowed_chars;" $file | sed -r -e "s/^.*?extern const $type_of_extern ($allowed_chars);.*?$/\1/" | while read val; do
# if ! grep -q "$extern_type::$val" $file; then
# # Excludes for SOFTWARE_EVENT/HARDWARE_EVENT/CACHE_EVENT in ThreadProfileEvents.cpp
# if [[ ! $extern_type::$val =~ ProfileEvents::Perf.* ]]; then
# echo "$extern_type::$val is defined but not used in file $file"
# fi
# fi
# done
# done
#
# # Undefined
# # NOTE: to fix automatically, replace echo with:
# # ( grep -q -F 'namespace $extern_type' $file && \
# # sed -i -r "0,/(\s*)extern const $type_of_extern [$allowed_chars]+/s//\1extern const $type_of_extern $val;\n&/" $file || \
# # awk '{ print; if (ns == 1) { ns = 2 }; if (ns == 2) { ns = 0; print "namespace $extern_type\n{\n extern const $type_of_extern '$val';\n}" } }; /namespace DB/ { ns = 1; };' < $file > ${file}.tmp && mv ${file}.tmp $file )
# find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | {
# grep -vP $EXCLUDE_DIRS | xargs grep -l -P "$extern_type::$allowed_chars"
# } | while read file; do
# grep -P "$extern_type::$allowed_chars" $file | grep -P -v '^\s*//' | sed -r -e "s/^.*?$extern_type::($allowed_chars).*?$/\1/" | while read val; do
# if ! grep -q "extern const $type_of_extern $val" $file; then
# if ! in_array "$extern_type::$val" "${EXTERN_TYPES_EXCLUDES[@]}"; then
# echo "$extern_type::$val is used in file $file but not defined"
# fi
# fi
# done
# done
#
# # Duplicates
# find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | {
# grep -vP $EXCLUDE_DIRS | xargs grep -l -P "$extern_type::$allowed_chars"
# } | while read file; do
# grep -P "extern const $type_of_extern $allowed_chars;" $file | sort | uniq -c | grep -v -P ' +1 ' && echo "Duplicate $extern_type in file $file"
# done
# done
#
# # Three or more consecutive empty lines
# find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' |
# grep -vP $EXCLUDE_DIRS |
# while read file; do awk '/^$/ { ++i; if (i > 2) { print "More than two consecutive empty lines in file '$file'" } } /./ { i = 0 }' $file; done
#
# # Broken XML files (requires libxml2-utils)
# find $ROOT_PATH/{src,base,programs,utils} -name '*.xml' |
# grep -vP $EXCLUDE_DIRS |
# xargs xmllint --noout --nonet
#
# # FIXME: for now only clickhouse-test
# pylint --rcfile=$ROOT_PATH/.pylintrc --persistent=no --score=n $ROOT_PATH/tests/clickhouse-test $ROOT_PATH/tests/ci/*.py
#
# find $ROOT_PATH -not -path $ROOT_PATH'/contrib*' \( -name '*.yaml' -or -name '*.yml' \) -type f |
# grep -vP $EXCLUDE_DIRS |
# xargs yamllint --config-file=$ROOT_PATH/.yamllint
#
# # Tests should not be named with "fail" in their names. It makes looking at the results less convenient.
# find $ROOT_PATH/tests/queries -iname '*fail*' |
# grep . && echo 'Tests should not be named with "fail" in their names. It makes looking at the results less convenient when you search for "fail" substring in browser.'
#
# # Queries to system.query_log/system.query_thread_log should have current_database = currentDatabase() condition
# # NOTE: it is not that accurate, but at least something.
# tests_with_query_log=( $(
# find $ROOT_PATH/tests/queries -iname '*.sql' -or -iname '*.sh' -or -iname '*.py' -or -iname '*.j2' |
# xargs grep --with-filename -e system.query_log -e system.query_thread_log | cut -d: -f1 | sort -u
# ) )
# for test_case in "${tests_with_query_log[@]}"; do
# grep -qE current_database.*currentDatabase "$test_case" || {
# grep -qE 'current_database.*\$CLICKHOUSE_DATABASE' "$test_case"
# } || echo "Queries to system.query_log/system.query_thread_log does not have current_database = currentDatabase() condition in $test_case"
# done
#
# # There shouldn't be large jumps between test numbers (since they should be consecutive)
# max_diff=$(
# find $ROOT_PATH/tests/queries -iname '*.sql' -or -iname '*.sh' -or -iname '*.py' -or -iname '*.j2' |
# grep -oP '\d+\D+\K\d+' | sort -n -u | awk 's{print ($0-s) " diff " s " and " $0 }{s=$0}' | sort -n | tail -n 1
# )
# max_diff_value=( $(echo $max_diff) )
# if [[ $max_diff_value -ge 100 ]];
# then
# echo "Too big of a difference between test numbers: $max_diff"
# fi
#
# # Queries to:
# tables_with_database_column=(
# system.tables
# system.parts
# system.detached_parts
# system.parts_columns
# system.columns
# system.projection_parts
# system.mutations
# )
# # should have database = currentDatabase() condition
# #
# # NOTE: it is not that accurate, but at least something.
# tests_with_database_column=( $(
# find $ROOT_PATH/tests/queries -iname '*.sql' -or -iname '*.sh' -or -iname '*.py' -or -iname '*.j2' |
# xargs grep --with-filename $(printf -- "-e %s " "${tables_with_database_column[@]}") |
# grep -v -e ':--' -e ':#' |
# cut -d: -f1 | sort -u
# ) )
# for test_case in "${tests_with_database_column[@]}"; do
# grep -qE database.*currentDatabase "$test_case" || {
# grep -qE 'database.*\$CLICKHOUSE_DATABASE' "$test_case"
# } || {
# # explicit database
# grep -qE "database[ ]*=[ ]*'" "$test_case"
# } || {
# echo "Queries to ${tables_with_database_column[*]} does not have database = currentDatabase()/\$CLICKHOUSE_DATABASE condition in $test_case"
# }
# done
#
# # Queries with ReplicatedMergeTree
# # NOTE: it is not that accurate, but at least something.
# tests_with_replicated_merge_tree=( $(
# find $ROOT_PATH/tests/queries -iname '*.sql' -or -iname '*.sh' -or -iname '*.py' -or -iname '*.j2' |
# xargs grep --with-filename -e "Replicated.*MergeTree[ ]*(.*" | cut -d: -f1 | sort -u
# ) )
# for test_case in "${tests_with_replicated_merge_tree[@]}"; do
# case "$test_case" in
# *.gen.*)
# ;;
# *.sh)
# test_case_zk_prefix="\(\$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX\|{database}\)"
# grep -q -e "Replicated.*MergeTree[ ]*(.*$test_case_zk_prefix" "$test_case" || echo "Replicated.*MergeTree should contain '$test_case_zk_prefix' in zookeeper path to avoid overlaps ($test_case)"
# ;;
# *.sql|*.sql.j2)
# test_case_zk_prefix="\({database}\|currentDatabase()\|{uuid}\|{default_path_test}\)"
# grep -q -e "Replicated.*MergeTree[ ]*(.*$test_case_zk_prefix" "$test_case" || echo "Replicated.*MergeTree should contain '$test_case_zk_prefix' in zookeeper path to avoid overlaps ($test_case)"
# ;;
# *.py)
# # Right now there are no such tests anyway
# echo "No ReplicatedMergeTree style check for *.py ($test_case)"
# ;;
# esac
# done
#
# # All the submodules should be from https://github.com/
# find $ROOT_PATH -name '.gitmodules' | while read i; do grep -F 'url = ' $i | grep -v -F 'https://github.com/' && echo 'All the submodules should be from https://github.com/'; done
#
# # There shouldn't be any code snippets under GPL or LGPL
# find $ROOT_PATH/{src,base,programs} -name '*.h' -or -name '*.cpp' 2>/dev/null | xargs grep -i -F 'General Public License' && echo "There shouldn't be any code snippets under GPL or LGPL"
#
# # There shouldn't be any docker containers outside docker directory
# find $ROOT_PATH -not -path $ROOT_PATH'/tests/ci*' -not -path $ROOT_PATH'/docker*' -not -path $ROOT_PATH'/contrib*' -name Dockerfile -type f 2>/dev/null | xargs --no-run-if-empty -n1 echo "Please move Dockerfile to docker directory:"
#
# # There shouldn't be any docker compose files outside docker directory
# find $ROOT_PATH -name '*compose*.yml' -type f -not -path $ROOT_PATH'/docker' -not -path $ROOT_PATH'/tests/integration*' -not -path $ROOT_PATH'/docker*' -not -path $ROOT_PATH'/contrib*' 2>/dev/null | grep -vP $EXCLUDE_DIRS | xargs --no-run-if-empty grep -l "version:" | xargs --no-run-if-empty -n1 echo "Please move docker compose to the 'docker' or 'tests' directory:"
#
# # Check that every header file has #pragma once in first line
# find $ROOT_PATH/{src,programs,utils} -name '*.h' |
# grep -vP $EXCLUDE_DIRS |
# while read file; do [[ $(head -n1 $file) != '#pragma once' ]] && echo "File $file must have '#pragma once' in first line"; done
#
# # Check for executable bit on non-executable files
# find $ROOT_PATH/{src,base,programs,utils,tests,docs,cmake} '(' -name '*.cpp' -or -name '*.h' -or -name '*.sql' -or -name '*.j2' -or -name '*.xml' -or -name '*.reference' -or -name '*.txt' -or -name '*.md' ')' -and -executable | grep -P '.' && echo "These files should not be executable."
#
# # Check for BOM
# find $ROOT_PATH/{src,base,programs,utils,tests,docs,cmake} -name '*.md' -or -name '*.cpp' -or -name '*.h' | xargs grep -l -F $'\xEF\xBB\xBF' | grep -P '.' && echo "Files should not have UTF-8 BOM"
# find $ROOT_PATH/{src,base,programs,utils,tests,docs,cmake} -name '*.md' -or -name '*.cpp' -or -name '*.h' | xargs grep -l -F $'\xFF\xFE' | grep -P '.' && echo "Files should not have UTF-16LE BOM"
# find $ROOT_PATH/{src,base,programs,utils,tests,docs,cmake} -name '*.md' -or -name '*.cpp' -or -name '*.h' | xargs grep -l -F $'\xFE\xFF' | grep -P '.' && echo "Files should not have UTF-16BE BOM"
#
# # Too many exclamation marks
# find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' |
# grep -vP $EXCLUDE_DIRS |
# xargs grep -F '!!!' | grep -P '.' && echo "Too many exclamation marks (looks dirty, unconfident)."
#
# # Trailing whitespaces
# find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' |
# grep -vP $EXCLUDE_DIRS |
# xargs grep -n -P ' $' | grep -n -P '.' && echo "^ Trailing whitespaces."
#
# # Forbid stringstream because it's easy to use them incorrectly and hard to debug possible issues
# find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' |
# grep -vP $EXCLUDE_DIRS |
# xargs grep -P 'std::[io]?stringstream' | grep -v "STYLE_CHECK_ALLOW_STD_STRING_STREAM" && echo "Use WriteBufferFromOwnString or ReadBufferFromString instead of std::stringstream"
#
# # Forbid std::cerr/std::cout in src (fine in programs/utils)
# std_cerr_cout_excludes=(
# /examples/
# /tests/
# _fuzzer
# # DUMP()
# base/base/iostream_debug_helpers.h
# # OK
# src/Common/ProgressIndication.cpp
# # only under #ifdef DBMS_HASH_MAP_DEBUG_RESIZES, that is used only in tests
# src/Common/HashTable/HashTable.h
# # SensitiveDataMasker::printStats()
# src/Common/SensitiveDataMasker.cpp
# # StreamStatistics::print()
# src/Compression/LZ4_decompress_faster.cpp
# # ContextSharedPart with subsequent std::terminate()
# src/Interpreters/Context.cpp
# # IProcessor::dump()
# src/Processors/IProcessor.cpp
# src/Client/ClientBase.cpp
# src/Client/LineReader.cpp
# src/Client/QueryFuzzer.cpp
# src/Client/Suggest.cpp
# src/Bridge/IBridge.cpp
# src/Daemon/BaseDaemon.cpp
# src/Loggers/Loggers.cpp
# )
# sources_with_std_cerr_cout=( $(
# find $ROOT_PATH/{src,base} -name '*.h' -or -name '*.cpp' | \
# grep -vP $EXCLUDE_DIRS | \
# grep -F -v $(printf -- "-e %s " "${std_cerr_cout_excludes[@]}") | \
# xargs grep -F --with-filename -e std::cerr -e std::cout | cut -d: -f1 | sort -u
# ) )
# # Exclude comments
# for src in "${sources_with_std_cerr_cout[@]}"; do
# # suppress stderr, since it may contain warning for #pragma once in headers
# if gcc -fpreprocessed -dD -E "$src" 2>/dev/null | grep -F -q -e std::cerr -e std::cout; then
# echo "$src: uses std::cerr/std::cout"
# fi
# done
#
# # Queries with event_date should have yesterday() not today()
# #
# # NOTE: it is not that accurate, but at least something.
# tests_with_event_time_date=( $(
# find $ROOT_PATH/tests/queries -iname '*.sql' -or -iname '*.sh' -or -iname '*.py' -or -iname '*.j2' |
# grep -vP $EXCLUDE_DIRS |
# xargs grep --with-filename -e event_time -e event_date | cut -d: -f1 | sort -u
# ) )
# for test_case in "${tests_with_event_time_date[@]}"; do
# cat "$test_case" | tr '\n' ' ' | grep -q -i -e 'WHERE.*event_date[ ]*=[ ]*today()' -e 'WHERE.*event_date[ ]*=[ ]*today()' && {
# echo "event_time/event_date should be filtered using >=yesterday() in $test_case (to avoid flakiness)"
# }
# done
#
# expect_tests=( $(find $ROOT_PATH/tests/queries -name '*.expect') )
# for test_case in "${expect_tests[@]}"; do
# pattern="^exp_internal -f \$env(CLICKHOUSE_TMP)/\$basename.debuglog 0$"
# grep -q "$pattern" "$test_case" || echo "Missing '$pattern' in '$test_case'"
#
# if grep -q "^spawn.*CLICKHOUSE_CLIENT_BINARY$" "$test_case"; then
# pattern="^spawn.*CLICKHOUSE_CLIENT_BINARY.*--history_file$"
# grep -q "$pattern" "$test_case" || echo "Missing '$pattern' in '$test_case'"
# fi
#
# # Otherwise expect_after/expect_before will not bail without stdin attached
# # (and actually this is a hack anyway, correct way is to use $any_spawn_id)
# pattern="-i \$any_spawn_id timeout"
# grep -q -- "$pattern" "$test_case" || echo "Missing '$pattern' in '$test_case'"
# pattern="-i \$any_spawn_id eof"
# grep -q -- "$pattern" "$test_case" || echo "Missing '$pattern' in '$test_case'"
# done
#
# # Conflict markers
# find $ROOT_PATH/{src,base,programs,utils,tests,docs,cmake} -name '*.md' -or -name '*.cpp' -or -name '*.h' |
# xargs grep -P '^(<<<<<<<|=======|>>>>>>>)$' | grep -P '.' && echo "Conflict markers are found in files"
#
# # Forbid subprocess.check_call(...) in integration tests because it does not provide enough information on errors
# find $ROOT_PATH'/tests/integration' -name '*.py' |
# xargs grep -F 'subprocess.check_call' | grep -v "STYLE_CHECK_ALLOW_SUBPROCESS_CHECK_CALL" && echo "Use helpers.cluster.run_and_check or subprocess.run instead of subprocess.check_call to print detailed info on error"
#
# # Forbid non-unique error codes
# if [[ "$(grep -Po "M\([0-9]*," $ROOT_PATH/src/Common/ErrorCodes.cpp | wc -l)" != "$(grep -Po "M\([0-9]*," $ROOT_PATH/src/Common/ErrorCodes.cpp | sort | uniq | wc -l)" ]]
# then
# echo "ErrorCodes.cpp contains non-unique error codes"
# fi
#
# # Check that there is no system-wide libraries/headers in use.
# #
# # NOTE: it is better to override find_path/find_library in cmake, but right now
# # it is not possible, see [1] for the reference.
# #
# # [1]: git grep --recurse-submodules -e find_library -e find_path contrib
# if git grep -e find_path -e find_library -- :**CMakeLists.txt; then
# echo "There is find_path/find_library usage. ClickHouse should use everything bundled. Consider adding one more contrib module."
# fi
#
# # Forbid files that differ only by character case
# find $ROOT_PATH | sort -f | uniq -i -c | awk '{ if ($1 > 1) print }'
#
# # Forbid std::filesystem::is_symlink and std::filesystem::read_symlink, because it's easy to use them incorrectly
# find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' |
# grep -vP $EXCLUDE_DIRS |
# xargs grep -P '::(is|read)_symlink' | grep -v "STYLE_CHECK_ALLOW_STD_FS_SYMLINK" && echo "Use DB::FS::isSymlink and DB::FS::readSymlink instead"
#
# # Forbid __builtin_unreachable(), because it's hard to debug when it becomes reachable
# find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' |
# grep -vP $EXCLUDE_DIRS |
# xargs grep -P '__builtin_unreachable' && echo "Use UNREACHABLE() from defines.h instead"
# Tests that create Replicated*MergeTree tables must carry a per-test component in the ZooKeeper path.
# NOTE: the detection is a plain regex match, so it is not that accurate, but at least something.
tests_with_replicated_merge_tree=( $(
    find $ROOT_PATH/tests/queries -iname '*.sql' -or -iname '*.sh' -or -iname '*.py' -or -iname '*.j2' |
        xargs grep --with-filename -e "Replicated.*MergeTree[ ]*(.*" |
        cut -d: -f1 |
        sort -u
) )
for test_case in "${tests_with_replicated_merge_tree[@]}"; do
    # Pick the accepted path components by file type; file types without a check skip the grep below.
    case "$test_case" in
        *.gen.*)
            continue
            ;;
        *.sh)
            test_case_zk_prefix="\(\$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX\|{database}\)"
            ;;
        *.sql|*.sql.j2)
            test_case_zk_prefix="\({database}\|currentDatabase()\|{uuid}\|{default_path_test}\)"
            ;;
        *.py)
            # Right now there are no such tests anyway
            echo "No ReplicatedMergeTree style check for *.py ($test_case)"
            continue
            ;;
        *)
            continue
            ;;
    esac
    grep -q -e "Replicated.*MergeTree[ ]*(.*$test_case_zk_prefix" "$test_case" ||
        echo "Replicated.*MergeTree should contain '$test_case_zk_prefix' in zookeeper path to avoid overlaps ($test_case)"
done
# Every submodule URL listed in a .gitmodules file has to point to https://github.com/
find $ROOT_PATH -name '.gitmodules' |
    while read gitmodules_path; do
        grep -F 'url = ' $gitmodules_path | grep -v -F 'https://github.com/' &&
            echo 'All the submodules should be from https://github.com/'
    done
# Sources must not contain code snippets under GPL or LGPL (detected by the license name)
find $ROOT_PATH/{src,base,programs} -name '*.h' -or -name '*.cpp' 2>/dev/null |
    xargs grep -i -F 'General Public License' &&
    echo "There shouldn't be any code snippets under GPL or LGPL"
# Dockerfiles are only allowed under tests/ci*, docker* and contrib*
find $ROOT_PATH \
    -not -path $ROOT_PATH'/tests/ci*' \
    -not -path $ROOT_PATH'/docker*' \
    -not -path $ROOT_PATH'/contrib*' \
    -name Dockerfile -type f 2>/dev/null |
    xargs --no-run-if-empty -n1 echo "Please move Dockerfile to docker directory:"

# Docker compose files (recognized by a "version:" line) are only allowed under docker*, tests/integration* and contrib*
find $ROOT_PATH -name '*compose*.yml' -type f \
    -not -path $ROOT_PATH'/docker' \
    -not -path $ROOT_PATH'/tests/integration*' \
    -not -path $ROOT_PATH'/docker*' \
    -not -path $ROOT_PATH'/contrib*' 2>/dev/null |
    grep -vP $EXCLUDE_DIRS |
    xargs --no-run-if-empty grep -l "version:" |
    xargs --no-run-if-empty -n1 echo "Please move docker compose to the 'docker' or 'tests' directory:"
# The very first line of every header has to be '#pragma once'
find $ROOT_PATH/{src,programs,utils} -name '*.h' |
    grep -vP $EXCLUDE_DIRS |
    while read header; do
        [[ $(head -n1 $header) != '#pragma once' ]] &&
            echo "File $header must have '#pragma once' in first line"
    done
# Sources, queries, references and documents must not have the executable bit set
find $ROOT_PATH/{src,base,programs,utils,tests,docs,cmake} \
    '(' -name '*.cpp' -or -name '*.h' -or -name '*.sql' -or -name '*.j2' -or -name '*.xml' -or -name '*.reference' -or -name '*.txt' -or -name '*.md' ')' \
    -and -executable |
    grep -P '.' &&
    echo "These files should not be executable."
# Byte order marks are forbidden: UTF-8 (EF BB BF), UTF-16LE (FF FE) and UTF-16BE (FE FF)
find $ROOT_PATH/{src,base,programs,utils,tests,docs,cmake} -name '*.md' -or -name '*.cpp' -or -name '*.h' |
    xargs grep -l -F $'\xEF\xBB\xBF' |
    grep -P '.' &&
    echo "Files should not have UTF-8 BOM"
find $ROOT_PATH/{src,base,programs,utils,tests,docs,cmake} -name '*.md' -or -name '*.cpp' -or -name '*.h' |
    xargs grep -l -F $'\xFF\xFE' |
    grep -P '.' &&
    echo "Files should not have UTF-16LE BOM"
find $ROOT_PATH/{src,base,programs,utils,tests,docs,cmake} -name '*.md' -or -name '*.cpp' -or -name '*.h' |
    xargs grep -l -F $'\xFE\xFF' |
    grep -P '.' &&
    echo "Files should not have UTF-16BE BOM"
# Three exclamation marks in a row are considered too many
find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' |
    grep -vP $EXCLUDE_DIRS |
    xargs grep -F '!!!' |
    grep -P '.' &&
    echo "Too many exclamation marks (looks dirty, unconfident)."
# Lines must not end with a space
find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' |
    grep -vP $EXCLUDE_DIRS |
    xargs grep -n -P ' $' |
    grep -n -P '.' &&
    echo "^ Trailing whitespaces."
# Forbid stringstream because it's easy to use them incorrectly and hard to debug possible issues.
# A line can opt out of the check with the STYLE_CHECK_ALLOW_STD_STRING_STREAM marker.
find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' |
    grep -vP $EXCLUDE_DIRS |
    xargs grep -P 'std::[io]?stringstream' | grep -v "STYLE_CHECK_ALLOW_STD_STRING_STREAM" && echo "Use WriteBufferFromOwnString or ReadBufferFromString instead of std::stringstream"

# Forbid mt19937() and random_device() which are outdated and slow.
# This check needs its own file list: a bare 'xargs grep' would read the script's stdin instead of the sources.
find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' |
    grep -vP $EXCLUDE_DIRS |
    xargs grep -P '(std::mt19937|std::mersenne_twister_engine|std::random_device)' && echo "Use pcg64_fast (from pcg_random.hpp) and randomSeed (from Common/randomSeed.h) instead"
# Forbid std::cerr/std::cout in src (fine in programs/utils)
#
# Path fragments that are exempt from the check (matched as fixed strings against the file path).
std_cerr_cout_excludes=(
    /examples/
    /tests/
    _fuzzer

    # DUMP()
    base/base/iostream_debug_helpers.h
    # OK
    src/Common/ProgressIndication.cpp
    # only under #ifdef DBMS_HASH_MAP_DEBUG_RESIZES, that is used only in tests
    src/Common/HashTable/HashTable.h
    # SensitiveDataMasker::printStats()
    src/Common/SensitiveDataMasker.cpp
    # StreamStatistics::print()
    src/Compression/LZ4_decompress_faster.cpp
    # ContextSharedPart with subsequent std::terminate()
    src/Interpreters/Context.cpp
    # IProcessor::dump()
    src/Processors/IProcessor.cpp

    src/Client/ClientBase.cpp
    src/Client/LineReader.cpp
    src/Client/QueryFuzzer.cpp
    src/Client/Suggest.cpp
    src/Bridge/IBridge.cpp
    src/Daemon/BaseDaemon.cpp
    src/Loggers/Loggers.cpp
)

# Candidate files: every non-excluded source that mentions std::cerr or std::cout anywhere.
sources_with_std_cerr_cout=( $(
    find $ROOT_PATH/{src,base} -name '*.h' -or -name '*.cpp' |
        grep -vP $EXCLUDE_DIRS |
        grep -F -v $(printf -- "-e %s " "${std_cerr_cout_excludes[@]}") |
        xargs grep -F --with-filename -e std::cerr -e std::cout |
        cut -d: -f1 |
        sort -u
) )

# Exclude comments: a candidate is reported only if the match is still present in gcc's preprocessed output
for src in "${sources_with_std_cerr_cout[@]}"; do
    # suppress stderr, since it may contain warning for #pragma once in headers
    if gcc -fpreprocessed -dD -E "$src" 2>/dev/null | grep -F -q -e std::cerr -e std::cout; then
        echo "$src: uses std::cerr/std::cout"
    fi
done
# Queries with event_date should have yesterday() not today()
# # Require checking return value of close(),
# # since it can hide fd misuse and break other places.
# find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' |
# grep -vP $EXCLUDE_DIRS |
# xargs grep -e ' close(.*fd' -e ' ::close(' | grep -v = && echo "Return value of close() should be checked"
#
# NOTE: it is not that accurate, but at least something.
# NOTE(review): only event_date is matched below although the message also mentions event_time - confirm that this is intended.
tests_with_event_time_date=( $(
    find $ROOT_PATH/tests/queries -iname '*.sql' -or -iname '*.sh' -or -iname '*.py' -or -iname '*.j2' |
        grep -vP $EXCLUDE_DIRS |
        xargs grep --with-filename -e event_time -e event_date |
        cut -d: -f1 |
        sort -u
) )
for test_case in "${tests_with_event_time_date[@]}"; do
    # The file is joined into a single line first, so a WHERE clause spanning several lines still matches.
    cat "$test_case" | tr '\n' ' ' | grep -q -i -e 'WHERE.*event_date[ ]*=[ ]*today()' &&
        echo "event_time/event_date should be filtered using >=yesterday() in $test_case (to avoid flakiness)"
done
# Every *.expect test has to contain a fixed set of boilerplate lines.
expect_tests=( $(find $ROOT_PATH/tests/queries -name '*.expect') )
for test_case in "${expect_tests[@]}"; do
    # Debug log of the expect session
    pattern="^exp_internal -f \$env(CLICKHOUSE_TMP)/\$basename.debuglog 0$"
    grep -q "$pattern" "$test_case" ||
        echo "Missing '$pattern' in '$test_case'"

    # A spawn line ending with CLICKHOUSE_CLIENT_BINARY requires a spawn line that ends with --history_file
    if grep -q "^spawn.*CLICKHOUSE_CLIENT_BINARY$" "$test_case"; then
        pattern="^spawn.*CLICKHOUSE_CLIENT_BINARY.*--history_file$"
        grep -q "$pattern" "$test_case" ||
            echo "Missing '$pattern' in '$test_case'"
    fi

    # Otherwise expect_after/expect_before will not bail without stdin attached
    # (and actually this is a hack anyway, correct way is to use $any_spawn_id)
    pattern="-i \$any_spawn_id timeout"
    grep -q -- "$pattern" "$test_case" ||
        echo "Missing '$pattern' in '$test_case'"
    pattern="-i \$any_spawn_id eof"
    grep -q -- "$pattern" "$test_case" ||
        echo "Missing '$pattern' in '$test_case'"
done
# Leftover merge conflict markers (a line consisting only of <<<<<<<, ======= or >>>>>>>)
find $ROOT_PATH/{src,base,programs,utils,tests,docs,cmake} -name '*.md' -or -name '*.cpp' -or -name '*.h' |
    xargs grep -P '^(<<<<<<<|=======|>>>>>>>)$' |
    grep -P '.' &&
    echo "Conflict markers are found in files"
# Forbid subprocess.check_call(...) in integration tests because it does not provide enough information on errors.
# A line can opt out with the STYLE_CHECK_ALLOW_SUBPROCESS_CHECK_CALL marker.
find $ROOT_PATH'/tests/integration' -name '*.py' |
    xargs grep -F 'subprocess.check_call' |
    grep -v "STYLE_CHECK_ALLOW_SUBPROCESS_CHECK_CALL" &&
    echo "Use helpers.cluster.run_and_check or subprocess.run instead of subprocess.check_call to print detailed info on error"
# Forbid non-unique error codes: the number of distinct "M(<code>," entries must equal the total number of entries
if [[ "$(grep -Po "M\([0-9]*," $ROOT_PATH/src/Common/ErrorCodes.cpp | sort | uniq | wc -l)" != "$(grep -Po "M\([0-9]*," $ROOT_PATH/src/Common/ErrorCodes.cpp | wc -l)" ]]; then
    echo "ErrorCodes.cpp contains non-unique error codes"
fi
# Check that there are no system-wide libraries/headers in use.
# # Check for existence of __init__.py files
# for i in "${ROOT_PATH}"/tests/integration/test_*; do FILE="${i}/__init__.py"; [ ! -f "${FILE}" ] && echo "${FILE} should exist for every integration test"; done
#
# NOTE: it is better to override find_path/find_library in cmake, but right now
# it is not possible, see [1] for the reference.
# # A small typo can lead to debug code in release builds, see https://github.com/ClickHouse/ClickHouse/pull/47647
# find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | xargs grep -l -F '#ifdef NDEBUG' | xargs -I@FILE awk '/#ifdef NDEBUG/ { inside = 1; dirty = 1 } /#endif/ { if (inside && dirty) { print "File @FILE has suspicious #ifdef NDEBUG, possibly confused with #ifndef NDEBUG" }; inside = 0 } /#else/ { dirty = 0 }' @FILE
#
# [1]: git grep --recurse-submodules -e find_library -e find_path contrib
if git grep -e find_path -e find_library -- :**CMakeLists.txt
then
    # git grep has already printed the offending lines; add the explanation.
    echo "There is find_path/find_library usage. ClickHouse should use everything bundled. Consider adding one more contrib module."
fi
# Forbid files that differ only by character case:
# sort and count paths case-insensitively, then print every path that occurs more than once.
find $ROOT_PATH |
    sort -f |
    uniq -i -c |
    awk '{ if ($1 > 1) print }'
# Forbid std::filesystem::is_symlink and std::filesystem::read_symlink, because it's easy to use them incorrectly.
# A line can opt out with the STYLE_CHECK_ALLOW_STD_FS_SYMLINK marker.
find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' |
    grep -vP $EXCLUDE_DIRS |
    xargs grep -P '::(is|read)_symlink' |
    grep -v "STYLE_CHECK_ALLOW_STD_FS_SYMLINK" &&
    echo "Use DB::FS::isSymlink and DB::FS::readSymlink instead"
# Forbid __builtin_unreachable(), because it's hard to debug when it becomes reachable
find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' |
    grep -vP $EXCLUDE_DIRS |
    xargs grep -P '__builtin_unreachable' &&
    echo "Use UNREACHABLE() from defines.h instead"
# Require checking return value of close(),
# since it can hide fd misuse and break other places.
# A call counts as unchecked when its line contains no '='.
find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' |
    grep -vP $EXCLUDE_DIRS |
    xargs grep -e ' close(.*fd' -e ' ::close(' |
    grep -v = &&
    echo "Return value of close() should be checked"
# Each tests/integration/test_* entry has to contain an __init__.py file
for i in "${ROOT_PATH}"/tests/integration/test_*; do
    FILE="${i}/__init__.py"
    [ ! -f "${FILE}" ] && echo "${FILE} should exist for every integration test"
done
# A small typo can lead to debug code in release builds, see https://github.com/ClickHouse/ClickHouse/pull/47647
# Reports an '#ifdef NDEBUG' block that reaches its '#endif' without an '#else' in between.
find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' |
    xargs grep -l -F '#ifdef NDEBUG' |
    xargs -I@FILE awk '/#ifdef NDEBUG/ { inside = 1; dirty = 1 } /#endif/ { if (inside && dirty) { print "File @FILE has suspicious #ifdef NDEBUG, possibly confused with #ifndef NDEBUG" }; inside = 0 } /#else/ { dirty = 0 }' @FILE
# A dynamic_cast or typeid_cast to a pointer type that is dereferenced immediately is unsafe.
find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' |
    xargs grep --line-number -P '(dynamic|typeid)_cast<[^>]+\*>\([^\(\)]+\)->' |
    grep -P '.' &&
    echo "It's suspicious when you are doing a dynamic_cast or typeid_cast with a pointer and immediately dereferencing it. Use references instead of pointers or check a pointer to nullptr."
# The stateful directory should only contain the tests that depend on the test dataset (hits or visits).
# 00076_system_columns_bytes is exempt from the check.
find $ROOT_PATH/tests/queries/1_stateful -name '*.sql' -or -name '*.sh' |
    grep -v '00076_system_columns_bytes' |
    xargs -I{} bash -c 'grep -q -P "hits|visits" "{}" || echo "The test {} does not depend on the test dataset (hits or visits table) and should be located in the 0_stateless directory. You can also add an exception to the check-style script."'
# Check for bad punctuation: whitespace before comma.
# A line can opt out by containing the text 'bad punctuation is ok here'.
find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' |
    xargs grep -P --line-number '\w ,' |
    grep -v 'bad punctuation is ok here' &&
    echo "^ There is bad punctuation: whitespace before comma. You should write it like this: 'Hello, world!'"
# Cyrillic characters hiding inside Latin: a Cyrillic letter directly adjacent to a Latin letter.
find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' |
    xargs grep -P --line-number '[a-zA-Z][а-яА-ЯёЁ]|[а-яА-ЯёЁ][a-zA-Z]' &&
    echo "^ Cyrillic characters found in unexpected place."
# # If a user is doing dynamic or typeid cast with a pointer, and immediately dereferencing it, it is unsafe.
# find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | xargs grep --line-number -P '(dynamic|typeid)_cast<[^>]+\*>\([^\(\)]+\)->' | grep -P '.' && echo "It's suspicious when you are doing a dynamic_cast or typeid_cast with a pointer and immediately dereferencing it. Use references instead of pointers or check a pointer to nullptr."
#
# # The stateful directory should only contain the tests that depend on the test dataset (hits or visits).
# find $ROOT_PATH/tests/queries/1_stateful -name '*.sql' -or -name '*.sh' | grep -v '00076_system_columns_bytes' | xargs -I{} bash -c 'grep -q -P "hits|visits" "{}" || echo "The test {} does not depend on the test dataset (hits or visits table) and should be located in the 0_stateless directory. You can also add an exception to the check-style script."'
#
# # Check for bad punctuation: whitespace before comma.
# find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | xargs grep -P --line-number '\w ,' | grep -v 'bad punctuation is ok here' && echo "^ There is bad punctuation: whitespace before comma. You should write it like this: 'Hello, world!'"
#
# # Cyrillic characters hiding inside Latin.
# find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | xargs grep -P --line-number '[a-zA-Z][а-яА-ЯёЁ]|[а-яА-ЯёЁ][a-zA-Z]' && echo "^ Cyrillic characters found in unexpected place."