Merge branch 'master' into no-keep-context-lock-while-calculating-access

Vitaly Baranov 2023-07-18 07:35:39 +02:00 committed by GitHub
commit 4963cfba39
33 changed files with 386 additions and 159 deletions

View File

@ -1,43 +1,38 @@
# Usage:
# set (MAX_COMPILER_MEMORY 2000 CACHE INTERNAL "") # In megabytes
# set (MAX_LINKER_MEMORY 3500 CACHE INTERNAL "")
# include (cmake/limit_jobs.cmake)
# Limit compiler/linker job concurrency to avoid OOMs on subtrees where compilation/linking is memory-intensive.
#
# Usage from CMake:
# set (MAX_COMPILER_MEMORY 2000 CACHE INTERNAL "") # megabytes
# set (MAX_LINKER_MEMORY 3500 CACHE INTERNAL "") # megabytes
# include (cmake/limit_jobs.cmake)
#
# (bigger values mean fewer jobs)
cmake_host_system_information(RESULT TOTAL_PHYSICAL_MEMORY QUERY TOTAL_PHYSICAL_MEMORY) # Not available under FreeBSD
cmake_host_system_information(RESULT TOTAL_PHYSICAL_MEMORY QUERY TOTAL_PHYSICAL_MEMORY)
cmake_host_system_information(RESULT NUMBER_OF_LOGICAL_CORES QUERY NUMBER_OF_LOGICAL_CORES)
# 1 if not set
option(PARALLEL_COMPILE_JOBS "Maximum number of concurrent compilation jobs" "")
# Set to disable the automatic job-limiting
option(PARALLEL_COMPILE_JOBS "Maximum number of concurrent compilation jobs" OFF)
option(PARALLEL_LINK_JOBS "Maximum number of concurrent link jobs" OFF)
# 1 if not set
option(PARALLEL_LINK_JOBS "Maximum number of concurrent link jobs" "")
if (NOT PARALLEL_COMPILE_JOBS AND TOTAL_PHYSICAL_MEMORY AND MAX_COMPILER_MEMORY)
if (NOT PARALLEL_COMPILE_JOBS AND MAX_COMPILER_MEMORY)
math(EXPR PARALLEL_COMPILE_JOBS ${TOTAL_PHYSICAL_MEMORY}/${MAX_COMPILER_MEMORY})
if (NOT PARALLEL_COMPILE_JOBS)
set (PARALLEL_COMPILE_JOBS 1)
endif ()
if (NOT NUMBER_OF_LOGICAL_CORES OR PARALLEL_COMPILE_JOBS LESS NUMBER_OF_LOGICAL_CORES)
set (PARALLEL_COMPILE_JOBS_LESS TRUE)
if (PARALLEL_COMPILE_JOBS LESS NUMBER_OF_LOGICAL_CORES)
message(WARNING "The auto-calculated compile jobs limit (${PARALLEL_COMPILE_JOBS}) underutilizes CPU cores (${NUMBER_OF_LOGICAL_CORES}). Set PARALLEL_COMPILE_JOBS to override.")
endif()
endif ()
if (PARALLEL_COMPILE_JOBS AND (NOT NUMBER_OF_LOGICAL_CORES OR PARALLEL_COMPILE_JOBS LESS NUMBER_OF_LOGICAL_CORES))
set(CMAKE_JOB_POOL_COMPILE compile_job_pool${CMAKE_CURRENT_SOURCE_DIR})
string (REGEX REPLACE "[^a-zA-Z0-9]+" "_" CMAKE_JOB_POOL_COMPILE ${CMAKE_JOB_POOL_COMPILE})
set_property(GLOBAL APPEND PROPERTY JOB_POOLS ${CMAKE_JOB_POOL_COMPILE}=${PARALLEL_COMPILE_JOBS})
endif ()
if (NOT PARALLEL_LINK_JOBS AND TOTAL_PHYSICAL_MEMORY AND MAX_LINKER_MEMORY)
if (NOT PARALLEL_LINK_JOBS AND MAX_LINKER_MEMORY)
math(EXPR PARALLEL_LINK_JOBS ${TOTAL_PHYSICAL_MEMORY}/${MAX_LINKER_MEMORY})
if (NOT PARALLEL_LINK_JOBS)
set (PARALLEL_LINK_JOBS 1)
endif ()
if (NOT NUMBER_OF_LOGICAL_CORES OR PARALLEL_LINK_JOBS LESS NUMBER_OF_LOGICAL_CORES)
set (PARALLEL_LINK_JOBS_LESS TRUE)
if (PARALLEL_LINK_JOBS LESS NUMBER_OF_LOGICAL_CORES)
message(WARNING "The auto-calculated link jobs limit (${PARALLEL_LINK_JOBS}) underutilizes CPU cores (${NUMBER_OF_LOGICAL_CORES}). Set PARALLEL_LINK_JOBS to override.")
endif()
endif ()
@ -52,20 +47,16 @@ if (CMAKE_BUILD_TYPE_UC STREQUAL "RELWITHDEBINFO" AND ENABLE_THINLTO AND PARALLE
set (PARALLEL_LINK_JOBS 2)
endif()
if (PARALLEL_LINK_JOBS AND (NOT NUMBER_OF_LOGICAL_CORES OR PARALLEL_LINK_JOBS LESS NUMBER_OF_LOGICAL_CORES))
message(STATUS "Building sub-tree with ${PARALLEL_COMPILE_JOBS} compile jobs and ${PARALLEL_LINK_JOBS} linker jobs (system: ${NUMBER_OF_LOGICAL_CORES} cores, ${TOTAL_PHYSICAL_MEMORY} MB DRAM, 'OFF' means the native core count).")
if (PARALLEL_COMPILE_JOBS LESS NUMBER_OF_LOGICAL_CORES)
set(CMAKE_JOB_POOL_COMPILE compile_job_pool${CMAKE_CURRENT_SOURCE_DIR})
string (REGEX REPLACE "[^a-zA-Z0-9]+" "_" CMAKE_JOB_POOL_COMPILE ${CMAKE_JOB_POOL_COMPILE})
set_property(GLOBAL APPEND PROPERTY JOB_POOLS ${CMAKE_JOB_POOL_COMPILE}=${PARALLEL_COMPILE_JOBS})
endif ()
if (PARALLEL_LINK_JOBS LESS NUMBER_OF_LOGICAL_CORES)
set(CMAKE_JOB_POOL_LINK link_job_pool${CMAKE_CURRENT_SOURCE_DIR})
string (REGEX REPLACE "[^a-zA-Z0-9]+" "_" CMAKE_JOB_POOL_LINK ${CMAKE_JOB_POOL_LINK})
set_property(GLOBAL APPEND PROPERTY JOB_POOLS ${CMAKE_JOB_POOL_LINK}=${PARALLEL_LINK_JOBS})
endif ()
if (PARALLEL_COMPILE_JOBS OR PARALLEL_LINK_JOBS)
message(STATUS
"${CMAKE_CURRENT_SOURCE_DIR}: Have ${TOTAL_PHYSICAL_MEMORY} megabytes of memory.
Limiting concurrent linker jobs to ${PARALLEL_LINK_JOBS} and compiler jobs to ${PARALLEL_COMPILE_JOBS} (system has ${NUMBER_OF_LOGICAL_CORES} logical cores)")
if (PARALLEL_COMPILE_JOBS_LESS)
message(WARNING "The autocalculated compile jobs limit (${PARALLEL_COMPILE_JOBS}) underutilizes CPU cores (${NUMBER_OF_LOGICAL_CORES}). Set PARALLEL_COMPILE_JOBS to override.")
endif()
if (PARALLEL_LINK_JOBS_LESS)
message(WARNING "The autocalculated link jobs limit (${PARALLEL_LINK_JOBS}) underutilizes CPU cores (${NUMBER_OF_LOGICAL_CORES}). Set PARALLEL_LINK_JOBS to override.")
endif()
endif ()

contrib/cctz vendored

@ -1 +1 @@
Subproject commit 5e05432420f9692418e2e12aff09859e420b14a2
Subproject commit 8529bcef5cd996b7c0f4d7475286b76b5d126c4c

View File

@ -0,0 +1,32 @@
---
slug: /en/sql-reference/aggregate-functions/reference/array_concat_agg
sidebar_position: 110
---
# array_concat_agg
Alias of `groupArrayArray`. The function is case-insensitive.
**Example**
```text
SELECT *
FROM t

┌─a───────┐
│ [1,2,3] │
│ [4,5]   │
│ [6]     │
└─────────┘
```
Query:
```sql
SELECT array_concat_agg(a) AS a
FROM t
```
Result:
```text
┌─a─────────────┐
│ [1,2,3,4,5,6] │
└───────────────┘
```
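Because the alias is registered case-insensitively, any spelling resolves to the same function; the following query mirrors the tests added later in this commit:
```sql
SELECT ArrAy_cOncAt_aGg(a) AS a
FROM t
-- [1,2,3,4,5,6]
```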

View File

@ -1255,3 +1255,15 @@ Result:
│ A240 │
└──────────────────┘
```
## initcap
Converts the first letter of each word to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters.
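**Example** (inputs and expected outputs mirror the tests added later in this commit):
```sql
SELECT initcap('yeah, well, i`m gonna go build my own theme park');
-- Yeah, Well, I`M Gonna Go Build My Own Theme Park
SELECT initcap('42oK');
-- 42ok
```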
## initcapUTF8
Like [initcap](#initcap), assuming that the string contains valid UTF-8 encoded text. If this assumption is violated, no exception is thrown and the result is undefined.
Does not detect the language, e.g. for Turkish the result might not be exactly correct (i/İ vs. i/I).
If the length of the UTF-8 byte sequence is different for upper and lower case of a code point, the result may be incorrect for this code point.
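**Example** (taken from the tests added later in this commit):
```sql
SELECT initcapUTF8('привет, как дела?');
-- Привет, Как Дела?
SELECT initcapUTF8('ätsch, bätsch');
-- Ätsch, Bätsch
```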

View File

@ -1113,3 +1113,14 @@ A text with tags .
The content within <b>CDATA</b>
Do Nothing for 2 Minutes 2:00 &nbsp;
```
## initcap {#initcap}
Converts the first letter of each word in the string to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by any other characters.
## initcapUTF8 {#initcapUTF8}
Like [initcap](#initcap), assuming that the string contains bytes representing UTF-8 encoded text.
Does not take the language into account, e.g. for Turkish the result might not be exactly correct.
If the length of the UTF-8 byte sequence differs between the upper and lower case of a code point, the result may be incorrect for that code point.
If the string contains a byte sequence that is not valid UTF-8, the behavior is undefined.

View File

@ -222,7 +222,6 @@ AggregateFunctionPtr AggregateFunctionFactory::tryGet(
: nullptr;
}
std::optional<AggregateFunctionProperties> AggregateFunctionFactory::tryGetProperties(String name) const
{
if (name.size() > MAX_AGGREGATE_FUNCTION_NAME_LENGTH)

View File

@ -126,6 +126,7 @@ void registerAggregateFunctionGroupArray(AggregateFunctionFactory & factory)
factory.registerFunction("groupArray", { createAggregateFunctionGroupArray<false>, properties });
factory.registerAlias("array_agg", "groupArray", AggregateFunctionFactory::CaseInsensitive);
factory.registerAliasUnchecked("array_concat_agg", "groupArrayArray", AggregateFunctionFactory::CaseInsensitive);
factory.registerFunction("groupArraySample", { createAggregateFunctionGroupArraySample, properties });
factory.registerFunction("groupArrayLast", { createAggregateFunctionGroupArray<true>, properties });
}
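`registerAliasUnchecked` is used here presumably because `groupArrayArray` is not a directly registered name: it is resolved at lookup time as `groupArray` plus the `-Array` combinator, which the checked `registerAlias` cannot find in its creator maps. A small SQL sketch of the resulting equivalence, assuming the table `t` from the tests added in this commit:
```sql
-- Both sides flatten the grouped arrays into a single array.
SELECT groupArrayArray(a) = array_concat_agg(a) FROM t;
-- 1
```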

View File

@ -8,7 +8,6 @@
#include <Common/MemoryTracker.h>
#include <Common/CurrentThread.h>
#include <Common/Arena.h>
#include <Interpreters/Context.h>

View File

@ -6223,7 +6223,11 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node,
const auto & insertion_table = scope_context->getInsertionTable();
if (!insertion_table.empty())
{
const auto & insert_structure = DatabaseCatalog::instance().getTable(insertion_table, scope_context)->getInMemoryMetadataPtr()->getColumns();
const auto & insert_structure = DatabaseCatalog::instance()
.getTable(insertion_table, scope_context)
->getInMemoryMetadataPtr()
->getColumns()
.getInsertable();
DB::ColumnsDescription structure_hint;
bool use_columns_from_insert_query = true;

View File

@ -52,35 +52,38 @@ public:
{
const auto & creator_map = getMap();
const auto & case_insensitive_creator_map = getCaseInsensitiveMap();
const String factory_name = getFactoryName();
String real_dict_name;
if (creator_map.count(real_name))
real_dict_name = real_name;
else if (auto real_name_lowercase = Poco::toLower(real_name); case_insensitive_creator_map.count(real_name_lowercase))
real_dict_name = real_name_lowercase;
else
throw Exception(ErrorCodes::LOGICAL_ERROR, "{}: can't create alias '{}', the real name '{}' is not registered",
factory_name, alias_name, real_name);
auto real_name_lowercase = Poco::toLower(real_name);
if (!creator_map.contains(real_name) && !case_insensitive_creator_map.contains(real_name_lowercase))
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"{}: can't create alias '{}', the real name '{}' is not registered",
getFactoryName(),
alias_name,
real_name);
registerAliasUnchecked(alias_name, real_name, case_sensitiveness);
}
/// The caller must ensure that real_name actually exists when calling this function directly.
void registerAliasUnchecked(const String & alias_name, const String & real_name, CaseSensitiveness case_sensitiveness = CaseSensitive)
{
String alias_name_lowercase = Poco::toLower(alias_name);
if (creator_map.count(alias_name) || case_insensitive_creator_map.count(alias_name_lowercase))
throw Exception(ErrorCodes::LOGICAL_ERROR, "{}: the alias name '{}' is already registered as real name",
factory_name, alias_name);
String real_name_lowercase = Poco::toLower(real_name);
const String factory_name = getFactoryName();
if (case_sensitiveness == CaseInsensitive)
{
if (!case_insensitive_aliases.emplace(alias_name_lowercase, real_dict_name).second)
throw Exception(ErrorCodes::LOGICAL_ERROR, "{}: case insensitive alias name '{}' is not unique",
factory_name, alias_name);
if (!case_insensitive_aliases.emplace(alias_name_lowercase, real_name).second)
throw Exception(ErrorCodes::LOGICAL_ERROR, "{}: case insensitive alias name '{}' is not unique", factory_name, alias_name);
case_insensitive_name_mapping[alias_name_lowercase] = real_name;
}
if (!aliases.emplace(alias_name, real_dict_name).second)
if (!aliases.emplace(alias_name, real_name).second)
throw Exception(ErrorCodes::LOGICAL_ERROR, "{}: alias name '{}' is not unique", factory_name, alias_name);
}
std::vector<String> getAllRegisteredNames() const override
{
std::vector<String> result;
@ -93,7 +96,7 @@ public:
bool isCaseInsensitive(const String & name) const
{
String name_lowercase = Poco::toLower(name);
return getCaseInsensitiveMap().count(name_lowercase) || case_insensitive_aliases.count(name_lowercase);
return getCaseInsensitiveMap().contains(name_lowercase) || case_insensitive_aliases.contains(name_lowercase);
}
const String & aliasTo(const String & name) const
@ -106,14 +109,11 @@ public:
throw Exception(ErrorCodes::LOGICAL_ERROR, "{}: name '{}' is not alias", getFactoryName(), name);
}
bool isAlias(const String & name) const
{
return aliases.count(name) || case_insensitive_aliases.contains(name);
}
bool isAlias(const String & name) const { return aliases.contains(name) || case_insensitive_aliases.contains(name); }
bool hasNameOrAlias(const String & name) const
{
return getMap().count(name) || getCaseInsensitiveMap().count(name) || isAlias(name);
return getMap().contains(name) || getCaseInsensitiveMap().contains(name) || isAlias(name);
}
/// Return the canonical name (the name used in registration) if it's different from `name`.
@ -129,7 +129,7 @@ public:
private:
using InnerMap = std::unordered_map<String, Value>; // name -> creator
using AliasMap = std::unordered_map<String, String>; // alias -> original type
using AliasMap = std::unordered_map<String, String>; // alias -> original name
virtual const InnerMap & getMap() const = 0;
virtual const InnerMap & getCaseInsensitiveMap() const = 0;

View File

@ -25,8 +25,6 @@ void Pool::Entry::incrementRefCount()
/// First reference, initialize thread
if (data->ref_count.fetch_add(1) == 0)
mysql_thread_init();
chassert(!data->removed_from_pool);
}
@ -43,7 +41,10 @@ void Pool::Entry::decrementRefCount()
/// In Pool::Entry::disconnect() we remove the connection from the pool's list of connections.
/// So now we must deallocate the memory.
if (data->removed_from_pool)
{
data->conn.disconnect();
::delete data;
}
}
}
@ -230,8 +231,6 @@ void Pool::removeConnection(Connection* connection)
std::lock_guard lock(mutex);
if (connection)
{
if (!connection->removed_from_pool)
connection->conn.disconnect();
connections.remove(connection);
connection->removed_from_pool = true;
}
@ -240,6 +239,7 @@ void Pool::removeConnection(Connection* connection)
void Pool::Entry::disconnect()
{
// Remove the Entry from the Pool. Actual disconnection is delayed until refcount == 0.
pool->removeConnection(data);
}

View File

@ -34,7 +34,7 @@ try
DB::Memory<> memory;
memory.resize(output_buffer_size + codec->getAdditionalSizeAtTheEndOfBuffer());
codec->doDecompressData(reinterpret_cast<const char *>(data), static_cast<UInt32>(size), memory.data(), static_cast<UInt32>(output_buffer_size));
codec->doDecompressData(reinterpret_cast<const char *>(data), size, memory.data(), output_buffer_size);
return 0;
}

View File

@ -34,7 +34,7 @@ try
DB::Memory<> memory;
memory.resize(output_buffer_size + codec->getAdditionalSizeAtTheEndOfBuffer());
codec->doDecompressData(reinterpret_cast<const char *>(data), static_cast<UInt32>(size), memory.data(), static_cast<UInt32>(output_buffer_size));
codec->doDecompressData(reinterpret_cast<const char *>(data), size, memory.data(), output_buffer_size);
return 0;
}

View File

@ -292,10 +292,10 @@ try
DB::Memory<> memory;
memory.resize(input.size() + codec_128->getAdditionalSizeAtTheEndOfBuffer());
codec_128->doDecompressData(input.data(), static_cast<UInt32>(input.size()), memory.data(), static_cast<UInt32>(input.size() - 31));
codec_128->doDecompressData(input.data(), input.size(), memory.data(), input.size() - 31);
memory.resize(input.size() + codec_128->getAdditionalSizeAtTheEndOfBuffer());
codec_256->doDecompressData(input.data(), static_cast<UInt32>(input.size()), memory.data(), static_cast<UInt32>(input.size() - 31));
codec_256->doDecompressData(input.data(), input.size(), memory.data(), input.size() - 31);
return 0;
}
catch (...)

View File

@ -24,7 +24,7 @@ try
return 0;
const auto * p = reinterpret_cast<const AuxiliaryRandomData *>(data);
auto codec = DB::getCompressionCodecLZ4(static_cast<int>(p->level));
auto codec = DB::getCompressionCodecLZ4(p->level);
size_t output_buffer_size = p->decompressed_size % 65536;
size -= sizeof(AuxiliaryRandomData);
@ -37,7 +37,7 @@ try
DB::Memory<> memory;
memory.resize(output_buffer_size + LZ4::ADDITIONAL_BYTES_AT_END_OF_BUFFER);
codec->doDecompressData(reinterpret_cast<const char *>(data), static_cast<UInt32>(size), memory.data(), static_cast<UInt32>(output_buffer_size));
codec->doDecompressData(reinterpret_cast<const char *>(data), size, memory.data(), output_buffer_size);
return 0;
}

View File

@ -28,7 +28,6 @@ namespace ErrorCodes
extern const int NOT_IMPLEMENTED;
extern const int LOGICAL_ERROR;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int TOO_DEEP_RECURSION;
}
constexpr Null NEGATIVE_INFINITY{Null::Value::NegativeInfinity};
@ -42,13 +41,10 @@ using FieldVector = std::vector<Field, AllocatorWithMemoryTracking<Field>>;
/// construct a Field of Array or a Tuple type. An alternative approach would be
/// to construct both of these types from FieldVector, and have the caller
/// specify the desired Field type explicitly.
/// As a result, stack overflow on destruction is possible;
/// to avoid it we count the nesting depth and enforce a threshold.
#define DEFINE_FIELD_VECTOR(X) \
struct X : public FieldVector \
{ \
using FieldVector::FieldVector; \
uint8_t nested_field_depth = 0; \
}
DEFINE_FIELD_VECTOR(Array);
@ -65,7 +61,6 @@ using FieldMap = std::map<String, Field, std::less<>, AllocatorWithMemoryTrackin
struct X : public FieldMap \
{ \
using FieldMap::FieldMap; \
uint8_t nested_field_depth = 0; \
}
DEFINE_FIELD_MAP(Object);
@ -296,12 +291,6 @@ decltype(auto) castToNearestFieldType(T && x)
*/
#define DBMS_MIN_FIELD_SIZE 32
/// Note: uint8_t is used for storing depth value.
#if defined(SANITIZER) || !defined(NDEBUG)
#define DBMS_MAX_NESTED_FIELD_DEPTH 64
#else
#define DBMS_MAX_NESTED_FIELD_DEPTH 255
#endif
/** Discriminated union of several types.
* Made as a replacement for `boost::variant`
@ -682,49 +671,6 @@ private:
Types::Which which;
/// StorageType and Original are the same for Array, Tuple, Map, Object
template <typename StorageType, typename Original>
uint8_t calculateAndCheckFieldDepth(Original && x)
{
uint8_t result = 0;
if constexpr (std::is_same_v<StorageType, Array>
|| std::is_same_v<StorageType, Tuple>
|| std::is_same_v<StorageType, Map>
|| std::is_same_v<StorageType, Object>)
{
result = x.nested_field_depth;
auto get_depth = [](const Field & elem)
{
switch (elem.which)
{
case Types::Array:
return elem.template get<Array>().nested_field_depth;
case Types::Tuple:
return elem.template get<Tuple>().nested_field_depth;
case Types::Map:
return elem.template get<Map>().nested_field_depth;
case Types::Object:
return elem.template get<Object>().nested_field_depth;
default:
return static_cast<uint8_t>(0);
}
};
if constexpr (std::is_same_v<StorageType, Object>)
for (auto & [_, value] : x)
result = std::max(get_depth(value), result);
else
for (auto & value : x)
result = std::max(get_depth(value), result);
}
if (result >= DBMS_MAX_NESTED_FIELD_DEPTH)
throw Exception(ErrorCodes::TOO_DEEP_RECURSION, "Too deep Field");
return result;
}
/// Assuming there was no allocated state or it was deallocated (see destroy).
template <typename T>
@ -738,17 +684,7 @@ private:
// we must initialize the entire wide stored type, and not just the
// nominal type.
using StorageType = NearestFieldType<UnqualifiedType>;
/// Incrementing the depth since we create a new Field.
auto depth = calculateAndCheckFieldDepth<StorageType>(x);
new (&storage) StorageType(std::forward<T>(x));
if constexpr (std::is_same_v<StorageType, Array>
|| std::is_same_v<StorageType, Tuple>
|| std::is_same_v<StorageType, Map>
|| std::is_same_v<StorageType, Object>)
reinterpret_cast<StorageType *>(&storage)->nested_field_depth = depth + 1;
which = TypeToEnum<UnqualifiedType>::value;
}
@ -845,7 +781,7 @@ private:
}
template <typename T>
ALWAYS_INLINE void destroy()
void destroy()
{
T * MAY_ALIAS ptr = reinterpret_cast<T*>(&storage);
ptr->~T();

View File

@ -62,7 +62,7 @@ DataTypePtr DataTypeFactory::getImpl(const String & full_name) const
}
else
{
ast = parseQuery(parser, full_name.data(), full_name.data() + full_name.size(), "data type", DBMS_DEFAULT_MAX_QUERY_SIZE, data_type_max_parse_depth);
ast = parseQuery(parser, full_name.data(), full_name.data() + full_name.size(), "data type", false, data_type_max_parse_depth);
}
return getImpl<nullptr_on_error>(ast);

View File

@ -1521,10 +1521,8 @@ struct Transformer
if constexpr (std::is_same_v<Additions, DateTimeAccurateConvertStrategyAdditions>
|| std::is_same_v<Additions, DateTimeAccurateOrNullConvertStrategyAdditions>)
{
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wimplicit-const-int-float-conversion"
bool is_valid_input = vec_from[i] >= 0 && vec_from[i] <= 0xFFFFFFFFL;
# pragma clang diagnostic pop
if (!is_valid_input)
{
if constexpr (std::is_same_v<Additions, DateTimeAccurateOrNullConvertStrategyAdditions>)

View File

@ -133,8 +133,6 @@ struct LowerUpperUTF8Impl
}
else
{
static const Poco::UTF8Encoding utf8;
size_t src_sequence_length = UTF8::seqLength(*src);
/// In case partial buffer was passed (due to SSE optimization)
/// we cannot convert it with current src_end, but we may have more

src/Functions/initcap.cpp Normal file
View File

@ -0,0 +1,66 @@
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringToString.h>
#include <Common/StringUtils/StringUtils.h>
namespace DB
{
namespace
{
struct InitcapImpl
{
static void vector(const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
if (data.empty())
return;
res_data.resize(data.size());
res_offsets.assign(offsets);
array(data.data(), data.data() + data.size(), res_data.data());
}
static void vectorFixed(const ColumnString::Chars & data, size_t /*n*/, ColumnString::Chars & res_data)
{
res_data.resize(data.size());
array(data.data(), data.data() + data.size(), res_data.data());
}
private:
static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst)
{
bool prev_alphanum = false;
for (; src < src_end; ++src, ++dst)
{
char c = *src;
bool alphanum = isAlphaNumericASCII(c);
if (alphanum && !prev_alphanum)
if (isAlphaASCII(c))
*dst = toUpperIfAlphaASCII(c);
else
*dst = c;
else if (isAlphaASCII(c))
*dst = toLowerIfAlphaASCII(c);
else
*dst = c;
prev_alphanum = alphanum;
}
}
};
struct NameInitcap
{
static constexpr auto name = "initcap";
};
using FunctionInitcap = FunctionStringToString<InitcapImpl, NameInitcap>;
}
REGISTER_FUNCTION(Initcap)
{
factory.registerFunction<FunctionInitcap>({}, FunctionFactory::CaseInsensitive);
}
}

View File

@ -0,0 +1,114 @@
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionStringToString.h>
#include <Functions/LowerUpperUTF8Impl.h>
#include <Functions/FunctionFactory.h>
#include <Poco/Unicode.h>
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
namespace
{
struct InitcapUTF8Impl
{
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
if (data.empty())
return;
res_data.resize(data.size());
res_offsets.assign(offsets);
array(data.data(), data.data() + data.size(), offsets, res_data.data());
}
[[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function initcapUTF8 cannot work with FixedString argument");
}
static void processCodePoint(const UInt8 *& src, const UInt8 * src_end, UInt8 *& dst, bool& prev_alphanum)
{
size_t src_sequence_length = UTF8::seqLength(*src);
auto src_code_point = UTF8::convertUTF8ToCodePoint(src, src_end - src);
if (src_code_point)
{
bool alpha = Poco::Unicode::isAlpha(*src_code_point);
bool alphanum = alpha || Poco::Unicode::isDigit(*src_code_point);
int dst_code_point = *src_code_point;
if (alphanum && !prev_alphanum)
{
if (alpha)
dst_code_point = Poco::Unicode::toUpper(*src_code_point);
}
else if (alpha)
{
dst_code_point = Poco::Unicode::toLower(*src_code_point);
}
prev_alphanum = alphanum;
if (dst_code_point > 0)
{
size_t dst_sequence_length = UTF8::convertCodePointToUTF8(dst_code_point, dst, src_end - src);
assert(dst_sequence_length <= 4);
if (dst_sequence_length == src_sequence_length)
{
src += dst_sequence_length;
dst += dst_sequence_length;
return;
}
}
}
*dst = *src;
++dst;
++src;
prev_alphanum = false;
}
private:
static void array(const UInt8 * src, const UInt8 * src_end, const ColumnString::Offsets & offsets, UInt8 * dst)
{
const auto * offset_it = offsets.begin();
const UInt8 * begin = src;
/// Process the input row by row, so that invalid UTF-8 symbols in one row cannot affect another row.
while (src < src_end)
{
const UInt8 * row_end = begin + *offset_it;
chassert(row_end >= src);
bool prev_alphanum = false;
while (src < row_end)
processCodePoint(src, row_end, dst, prev_alphanum);
++offset_it;
}
}
};
struct NameInitcapUTF8
{
static constexpr auto name = "initcapUTF8";
};
using FunctionInitcapUTF8 = FunctionStringToString<InitcapUTF8Impl, NameInitcapUTF8>;
}
REGISTER_FUNCTION(InitcapUTF8)
{
factory.registerFunction<FunctionInitcapUTF8>();
}
}

View File

@ -1574,7 +1574,11 @@ StoragePtr Context::executeTableFunction(const ASTPtr & table_expression, const
uint64_t use_structure_from_insertion_table_in_table_functions = getSettingsRef().use_structure_from_insertion_table_in_table_functions;
if (use_structure_from_insertion_table_in_table_functions && table_function_ptr->needStructureHint() && hasInsertionTable())
{
const auto & insert_structure = DatabaseCatalog::instance().getTable(getInsertionTable(), shared_from_this())->getInMemoryMetadataPtr()->getColumns();
const auto & insert_structure = DatabaseCatalog::instance()
.getTable(getInsertionTable(), shared_from_this())
->getInMemoryMetadataPtr()
->getColumns()
.getInsertable();
DB::ColumnsDescription structure_hint;
bool use_columns_from_insert_query = true;

View File

@ -42,4 +42,4 @@ clickhouse_add_executable(codegen_select_fuzzer ${FUZZER_SRCS})
set_source_files_properties("${PROTO_SRCS}" "out.cpp" PROPERTIES COMPILE_FLAGS "-Wno-reserved-identifier")
target_include_directories(codegen_select_fuzzer SYSTEM BEFORE PRIVATE "${CMAKE_CURRENT_BINARY_DIR}")
target_link_libraries(codegen_select_fuzzer PRIVATE ch_contrib::protobuf ch_contrib::protobuf_mutator ch_contrib::protoc dbms ${LIB_FUZZING_ENGINE})
target_link_libraries(codegen_select_fuzzer PRIVATE ch_contrib::protobuf_mutator ch_contrib::protoc dbms ${LIB_FUZZING_ENGINE})

View File

@ -818,9 +818,10 @@ def test_start_stop_moves(start_cluster, name, engine):
node1.query(f"SYSTEM STOP MOVES {name}")
node1.query(f"SYSTEM STOP MERGES {name}")
first_part = None
for i in range(5):
data = [] # 5MB in total
for i in range(5):
for _ in range(5):
data.append(get_random_string(1024 * 1024)) # 1MB row
# JBOD size is 40MB, so let's insert a 5MB batch 7 times
node1.query_with_retry(
@ -829,7 +830,13 @@ def test_start_stop_moves(start_cluster, name, engine):
)
)
first_part = get_oldest_part(node1, name)
# we cannot rely simply on the modification time of the part because it can be changed
# by different background operations, so we explicitly remember the oldest part right
# after the first insert
if i == 0:
first_part = get_oldest_part(node1, name)
assert first_part is not None
used_disks = get_used_disks_for_table(node1, name)

View File

@ -18,7 +18,7 @@ select distinct a from distinct_in_order settings max_block_size=10, max_threads
select '-- create table with not only primary key columns';
drop table if exists distinct_in_order sync;
create table distinct_in_order (a int, b int, c int) engine=MergeTree() order by (a, b);
create table distinct_in_order (a int, b int, c int) engine=MergeTree() order by (a, b) SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi';
insert into distinct_in_order select number % number, number % 5, number % 10 from numbers(1,1000000);
select '-- distinct with primary key prefix only';
@ -59,16 +59,16 @@ drop table if exists distinct_in_order sync;
select '-- check that distinct in order returns the same result as ordinary distinct';
drop table if exists distinct_cardinality_low sync;
CREATE TABLE distinct_cardinality_low (low UInt64, medium UInt64, high UInt64) ENGINE MergeTree() ORDER BY (low, medium);
CREATE TABLE distinct_cardinality_low (low UInt64, medium UInt64, high UInt64) ENGINE MergeTree() ORDER BY (low, medium) SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi';
INSERT INTO distinct_cardinality_low SELECT number % 1e1, number % 1e2, number % 1e3 FROM numbers_mt(1e4);
drop table if exists distinct_in_order sync;
drop table if exists ordinary_distinct sync;
select '-- check that distinct in order WITH order by returns the same result as ordinary distinct';
create table distinct_in_order (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium);
create table distinct_in_order (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium) SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi';
insert into distinct_in_order select distinct * from distinct_cardinality_low order by high settings optimize_distinct_in_order=1;
create table ordinary_distinct (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium);
create table ordinary_distinct (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium) SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi';
insert into ordinary_distinct select distinct * from distinct_cardinality_low order by high settings optimize_distinct_in_order=0;
select count() as diff from (select distinct * from distinct_in_order except select * from ordinary_distinct);
@ -76,9 +76,9 @@ drop table if exists distinct_in_order sync;
drop table if exists ordinary_distinct sync;
select '-- check that distinct in order WITHOUT order by returns the same result as ordinary distinct';
create table distinct_in_order (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium);
create table distinct_in_order (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium) SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi';
insert into distinct_in_order select distinct * from distinct_cardinality_low settings optimize_distinct_in_order=1;
create table ordinary_distinct (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium);
create table ordinary_distinct (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium) SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi';
insert into ordinary_distinct select distinct * from distinct_cardinality_low settings optimize_distinct_in_order=0;
select count() as diff from (select distinct * from distinct_in_order except select * from ordinary_distinct);
@ -86,9 +86,9 @@ drop table if exists distinct_in_order;
drop table if exists ordinary_distinct;
select '-- check that distinct in order WITHOUT order by and WITH filter returns the same result as ordinary distinct';
create table distinct_in_order (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium);
create table distinct_in_order (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium) SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi';
insert into distinct_in_order select distinct * from distinct_cardinality_low where low > 0 settings optimize_distinct_in_order=1;
create table ordinary_distinct (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium);
create table ordinary_distinct (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium) SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi';
insert into ordinary_distinct select distinct * from distinct_cardinality_low where low > 0 settings optimize_distinct_in_order=0;
select count() as diff from (select distinct * from distinct_in_order except select * from ordinary_distinct);
@ -102,12 +102,12 @@ drop table if exists sorting_key_contain_function;
select '-- bug 42185, distinct in order and empty sort description';
select '-- distinct in order, sorting key tuple()';
create table sorting_key_empty_tuple (a int, b int) engine=MergeTree() order by tuple();
create table sorting_key_empty_tuple (a int, b int) engine=MergeTree() order by tuple() SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi';
insert into sorting_key_empty_tuple select number % 2, number % 5 from numbers(1,10);
select distinct a from sorting_key_empty_tuple;
select '-- distinct in order, sorting key contains function';
create table sorting_key_contain_function (datetime DateTime, a int) engine=MergeTree() order by (toDate(datetime));
create table sorting_key_contain_function (datetime DateTime, a int) engine=MergeTree() order by (toDate(datetime)) SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi';
insert into sorting_key_contain_function values ('2000-01-01', 1);
insert into sorting_key_contain_function values ('2000-01-01', 2);
select distinct datetime from sorting_key_contain_function;

View File

@ -364,6 +364,8 @@ in
inIgnoreSet
indexHint
indexOf
initcap
initcapUTF8
initialQueryID
initializeAggregation
intDiv

View File

@ -0,0 +1,13 @@
Hello
Hello
Hello World
Yeah, Well, I`M Gonna Go Build My Own Theme Park
Crc32ieee Is The Best Function
42ok
Hello
Yeah, Well, I`M Gonna Go Build My Own Theme Park
Привет, Как Дела?
Ätsch, Bätsch
We Dont Support Cases When Lowercase And Uppercase Characters Occupy Different Number Of Bytes In Utf-8. As An Example, This Happens For ß And ẞ.

View File

@ -0,0 +1,14 @@
select initcap('');
select initcap('Hello');
select initcap('hello');
select initcap('hello world');
select initcap('yeah, well, i`m gonna go build my own theme park');
select initcap('CRC32IEEE is the best function');
select initcap('42oK');
select initcapUTF8('');
select initcapUTF8('Hello');
select initcapUTF8('yeah, well, i`m gonna go build my own theme park');
select initcapUTF8('привет, как дела?');
select initcapUTF8('ätsch, bätsch');
select initcapUTF8('We dont support cases when lowercase and uppercase characters occupy different number of bytes in UTF-8. As an example, this happens for ß and ẞ.');

View File

@ -0,0 +1,9 @@
drop table if exists test;
create table test
(
n1 UInt32,
n2 UInt32 alias murmurHash3_32(n1),
n3 UInt32 materialized n2 + 1
) engine=MergeTree order by n1;
insert into test select * from generateRandom() limit 10;
drop table test;
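A hedged note on what this test guards, tying it to the `getInsertable()` changes earlier in this commit: the structure hint passed to `generateRandom()` should now contain only the insertable column `n1`, not the alias column `n2` or the materialized column `n3`.
```sql
-- Only ordinary columns are insertable, so the structure hint asks
-- generateRandom() for a single UInt32 column (n1) rather than all three.
insert into test select * from generateRandom() limit 10;
```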

View File

@ -0,0 +1,5 @@
[1,2,3,4,5,6]
[1,2,3,4,5,6]
1 [1,2,3]
2 [4,5]
3 [6]

View File

@ -0,0 +1,9 @@
drop table if exists t;
create table t (n UInt32, a Array(Int32)) engine=Memory;
insert into t values (1, [1,2,3]), (2, [4,5]), (3, [6]);
select array_concat_agg(a) from t;
select ArrAy_cOncAt_aGg(a) from t;
select n, array_concat_agg(a) from t group by n order by n;
drop table t;

View File

@ -991,6 +991,7 @@ addressToLine
addressToLineWithInlines
addressToSymbol
adviced
agg
aggregatefunction
aggregatingmergetree
aggregatio
@ -1582,6 +1583,8 @@ indexOf
infi
initialQueryID
initializeAggregation
initcap
initcapUTF
injective
innogames
inodes