Merge remote-tracking branch 'origin/master' into fill_with_by_sorting_prefix_2

This commit is contained in:
Igor Nikonov 2023-05-15 14:40:46 +00:00
commit 4d168400ae
38 changed files with 224 additions and 49 deletions

View File

@ -1218,12 +1218,16 @@ Rounds the time to the half hour.
Converts a date or date with time to a UInt32 number containing the year and month number (YYYY \* 100 + MM). Accepts a second optional timezone argument. If provided, the timezone must be a string constant.
### example
```sql
**Example**
``` sql
SELECT
toYYYYMM(now(), 'US/Eastern')
```
```response
Result:
``` text
┌─toYYYYMM(now(), 'US/Eastern')─┐
│ 202303 │
└───────────────────────────────┘
@ -1233,11 +1237,15 @@ SELECT
Converts a date or date with time to a UInt32 number containing the year, month and day number (YYYY \* 10000 + MM \* 100 + DD). Accepts a second optional timezone argument. If provided, the timezone must be a string constant.
### example
**Example**
```sql
SELECT
toYYYYMMDD(now(), 'US/Eastern')
```
Result:
```response
┌─toYYYYMMDD(now(), 'US/Eastern')─┐
│ 20230302 │
@ -1248,11 +1256,15 @@ SELECT
Converts a date or date with time to a UInt64 number containing the year, month, day, hour, minute and second (YYYY \* 10000000000 + MM \* 100000000 + DD \* 1000000 + hh \* 10000 + mm \* 100 + ss). Accepts a second optional timezone argument. If provided, the timezone must be a string constant.
### example
**Example**
```sql
SELECT
toYYYYMMDDhhmmss(now(), 'US/Eastern')
```
Result:
```response
┌─toYYYYMMDDhhmmss(now(), 'US/Eastern')─┐
│ 20230302112209 │

View File

@ -10,6 +10,7 @@
#include <Common/OpenSSLHelpers.h>
#include <Poco/SHA1Engine.h>
#include <base/types.h>
#include <base/hex.h>
#include <boost/algorithm/hex.hpp>
#include <boost/algorithm/string/case_conv.hpp>

View File

@ -2,6 +2,7 @@
#include <Columns/ColumnArray.h>
#include <Common/assert_cast.h>
#include <Common/Arena.h>
#include <base/arithmeticOverflow.h>
#include <DataTypes/DataTypeArray.h>
#include <AggregateFunctions/IAggregateFunction.h>

View File

@ -10,6 +10,7 @@
#include <DataTypes/IDataType.h>
#include <DataTypes/DataTypesNumber.h>
#include <base/StringRef.h>
#include <Common/Arena.h>
#include <Common/assert_cast.h>
#include <DataTypes/DataTypeNullable.h>
#include <AggregateFunctions/IAggregateFunction.h>

View File

@ -211,6 +211,7 @@ endif()
if (TARGET ch_contrib::jemalloc)
target_link_libraries (clickhouse_common_io PRIVATE ch_contrib::jemalloc)
endif()
target_link_libraries (clickhouse_common_io PUBLIC ch_contrib::sparsehash)
add_subdirectory(Access/Common)
add_subdirectory(Common/ZooKeeper)
@ -463,7 +464,7 @@ endif ()
if (TARGET ch_contrib::ldap)
dbms_target_link_libraries (PRIVATE ch_contrib::ldap ch_contrib::lber)
endif ()
dbms_target_link_libraries (PRIVATE ch_contrib::sparsehash)
dbms_target_link_libraries (PUBLIC ch_contrib::sparsehash)
if (TARGET ch_contrib::protobuf)
dbms_target_link_libraries (PRIVATE ch_contrib::protobuf)

View File

@ -663,12 +663,10 @@ Names Block::getDataTypeNames() const
Block::NameMap Block::getNamesToIndexesMap() const
{
NameMap res;
res.reserve(index_by_name.size());
NameMap res(index_by_name.size());
res.set_empty_key(StringRef{});
for (const auto & [name, index] : index_by_name)
res[name] = index;
return res;
}

View File

@ -5,13 +5,11 @@
#include <Core/ColumnsWithTypeAndName.h>
#include <Core/NamesAndTypes.h>
#include <Common/HashTable/HashMap.h>
#include <initializer_list>
#include <list>
#include <map>
#include <set>
#include <vector>
#include <sparsehash/dense_hash_map>
namespace DB
@ -97,7 +95,7 @@ public:
Names getDataTypeNames() const;
/// Hash table match `column name -> position in the block`.
using NameMap = HashMap<StringRef, size_t, StringRefHash>;
using NameMap = ::google::dense_hash_map<StringRef, size_t, StringRefHash>;
NameMap getNamesToIndexesMap() const;
Serializations getSerializations() const;

View File

@ -749,6 +749,7 @@ class IColumn;
M(Bool, optimize_distinct_in_order, true, "Enable DISTINCT optimization if some columns in DISTINCT form a prefix of sorting. For example, prefix of sorting key in merge tree or ORDER BY statement", 0) \
M(Bool, allow_experimental_undrop_table_query, false, "Allow to use undrop query to restore dropped table in a limited time", 0) \
M(Bool, keeper_map_strict_mode, false, "Enforce additional checks during operations on KeeperMap. E.g. throw an exception on an insert for already existing key", 0) \
M(UInt64, extract_kvp_max_pairs_per_row, 1000, "Max number pairs that can be produced by extractKeyValuePairs function. Used to safeguard against consuming too much memory.", 0) \
// End of COMMON_SETTINGS
// Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS.

View File

@ -7,6 +7,7 @@
#include <Common/typeid_cast.h>
#include <Common/assert_cast.h>
#include <Common/AlignedBuffer.h>
#include <Common/Arena.h>
#include <Formats/FormatSettings.h>
#include <Formats/ProtobufReader.h>

View File

@ -26,8 +26,8 @@ void ColumnMapping::addColumns(
{
names_of_columns.push_back(name);
const auto * column_it = column_indexes_by_names.find(name);
if (!column_it)
const auto column_it = column_indexes_by_names.find(name);
if (column_it == column_indexes_by_names.end())
{
if (settings.skip_unknown_fields)
{
@ -43,7 +43,7 @@ void ColumnMapping::addColumns(
name, column_indexes_for_input_fields.size());
}
const auto column_index = column_it->getMapped();
const auto column_index = column_it->second;
if (read_columns[column_index])
throw Exception(ErrorCodes::INCORRECT_DATA, "Duplicate field found while parsing format header: {}", name);

View File

@ -41,6 +41,7 @@
#include <Common/FieldVisitorsAccurateComparison.h>
#include <Common/assert_cast.h>
#include <Common/typeid_cast.h>
#include <Common/Arena.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <Interpreters/Context.h>

View File

@ -9,7 +9,7 @@ namespace DB
template <typename A>
struct BitCountImpl
{
using ResultType = UInt8;
using ResultType = std::conditional_t<(sizeof(A) * 8 >= 256), UInt16, UInt8>;
static constexpr bool allow_string_or_fixed_string = true;
static inline ResultType apply(A a)
@ -17,6 +17,13 @@ struct BitCountImpl
/// We count bits in the value representation in memory. For example, we support floats.
/// We need to avoid sign-extension when converting signed numbers to larger type. So, uint8_t(-1) has 8 bits.
if constexpr (is_big_int_v<A>)
{
ResultType res = 0;
for (auto item : a.items)
res += __builtin_popcountll(item);
return res;
}
if constexpr (std::is_same_v<A, UInt64> || std::is_same_v<A, Int64>)
return __builtin_popcountll(a);
if constexpr (std::is_same_v<A, UInt32> || std::is_same_v<A, Int32> || std::is_unsigned_v<A>)

View File

@ -7,6 +7,8 @@
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeString.h>
#include <Interpreters/Context.h>
#include <Functions/keyvaluepair/impl/KeyValuePairExtractor.h>
#include <Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.h>
#include <Functions/keyvaluepair/ArgumentExtractor.h>
@ -41,6 +43,13 @@ class ExtractKeyValuePairs : public IFunction
builder.withQuotingCharacter(parsed_arguments.quoting_character.value());
}
bool is_number_of_pairs_unlimited = context->getSettingsRef().extract_kvp_max_pairs_per_row == 0;
if (!is_number_of_pairs_unlimited)
{
builder.withMaxNumberOfPairs(context->getSettingsRef().extract_kvp_max_pairs_per_row);
}
return builder.build();
}
@ -73,7 +82,7 @@ class ExtractKeyValuePairs : public IFunction
}
public:
ExtractKeyValuePairs() = default;
explicit ExtractKeyValuePairs(ContextPtr context_) : context(context_) {}
static constexpr auto name = Name::name;
@ -82,9 +91,9 @@ public:
return name;
}
static FunctionPtr create(ContextPtr)
static FunctionPtr create(ContextPtr context)
{
return std::make_shared<ExtractKeyValuePairs>();
return std::make_shared<ExtractKeyValuePairs>(context);
}
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override
@ -120,6 +129,9 @@ public:
{
return {1, 2, 3, 4};
}
private:
ContextPtr context;
};
struct NameExtractKeyValuePairs

View File

@ -13,6 +13,7 @@ namespace DB
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int LIMIT_EXCEEDED;
}
/*
@ -25,8 +26,8 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor
using NextState = DB::extractKV::StateHandler::NextState;
public:
explicit CHKeyValuePairExtractor(StateHandler state_handler_)
: state_handler(std::move(state_handler_))
explicit CHKeyValuePairExtractor(StateHandler state_handler_, uint64_t max_number_of_pairs_)
: state_handler(std::move(state_handler_)), max_number_of_pairs(max_number_of_pairs_)
{}
uint64_t extract(const std::string & data, ColumnString::MutablePtr & keys, ColumnString::MutablePtr & values) override
@ -113,11 +114,16 @@ private:
NextState flushPair(const std::string_view & file, auto & key,
auto & value, uint64_t & row_offset)
{
row_offset++;
if (row_offset > max_number_of_pairs)
{
throw Exception(ErrorCodes::LIMIT_EXCEEDED, "Number of pairs produced exceeded the limit of {}", max_number_of_pairs);
}
key.commit();
value.commit();
row_offset++;
return {0, file.empty() ? State::END : State::WAITING_KEY};
}
@ -128,6 +134,7 @@ private:
}
StateHandler state_handler;
uint64_t max_number_of_pairs;
};
}

View File

@ -31,6 +31,12 @@ KeyValuePairExtractorBuilder & KeyValuePairExtractorBuilder::withEscaping()
return *this;
}
/// Records the upper bound on the number of pairs; the value is later handed to the
/// extractor constructed by the build*() methods. Returns the builder itself so that
/// calls can be chained.
KeyValuePairExtractorBuilder & KeyValuePairExtractorBuilder::withMaxNumberOfPairs(uint64_t max_number_of_pairs_)
{
    this->max_number_of_pairs = max_number_of_pairs_;

    return *this;
}
std::shared_ptr<KeyValuePairExtractor> KeyValuePairExtractorBuilder::build() const
{
if (with_escaping)
@ -46,9 +52,9 @@ namespace
using namespace extractKV;
template <typename T>
auto makeStateHandler(const T && handler)
auto makeStateHandler(const T && handler, uint64_t max_number_of_pairs)
{
return std::make_shared<CHKeyValuePairExtractor<T>>(handler);
return std::make_shared<CHKeyValuePairExtractor<T>>(handler, max_number_of_pairs);
}
}
@ -57,14 +63,14 @@ std::shared_ptr<KeyValuePairExtractor> KeyValuePairExtractorBuilder::buildWithou
{
auto configuration = ConfigurationFactory::createWithoutEscaping(key_value_delimiter, quoting_character, item_delimiters);
return makeStateHandler(NoEscapingStateHandler(configuration));
return makeStateHandler(NoEscapingStateHandler(configuration), max_number_of_pairs);
}
std::shared_ptr<KeyValuePairExtractor> KeyValuePairExtractorBuilder::buildWithEscaping() const
{
auto configuration = ConfigurationFactory::createWithEscaping(key_value_delimiter, quoting_character, item_delimiters);
return makeStateHandler(InlineEscapingStateHandler(configuration));
return makeStateHandler(InlineEscapingStateHandler(configuration), max_number_of_pairs);
}
}

View File

@ -20,6 +20,8 @@ public:
KeyValuePairExtractorBuilder & withEscaping();
KeyValuePairExtractorBuilder & withMaxNumberOfPairs(uint64_t max_number_of_pairs_);
std::shared_ptr<KeyValuePairExtractor> build() const;
private:
@ -27,6 +29,7 @@ private:
char key_value_delimiter = ':';
char quoting_character = '"';
std::vector<char> item_delimiters = {' ', ',', ';'};
uint64_t max_number_of_pairs = std::numeric_limits<uint64_t>::max();
std::shared_ptr<KeyValuePairExtractor> buildWithEscaping() const;

View File

@ -1,5 +1,6 @@
#pragma once
#include <Common/ProfileEvents.h>
#include <Core/NamesAndTypes.h>
#include <Core/NamesAndAliases.h>
#include <Core/Settings.h>

View File

@ -13,6 +13,7 @@
#include <Core/NamesAndAliases.h>
#include <Interpreters/SystemLog.h>
#include <base/types.h>
#include <Common/ProfileEvents.h>
namespace ProfileEvents
{

View File

@ -1,6 +1,7 @@
#include <Interpreters/Context.h>
#include <Common/tests/gtest_global_context.h>
#include <gtest/gtest.h>
#include <thread>
using namespace DB;

View File

@ -64,20 +64,22 @@ inline size_t BSONEachRowRowInputFormat::columnIndex(const StringRef & name, siz
/// Optimization by caching the order of fields (which is almost always the same)
/// and a quick check to match the next expected field, instead of searching the hash table.
if (prev_positions.size() > key_index && prev_positions[key_index] && name == prev_positions[key_index]->getKey())
if (prev_positions.size() > key_index
&& prev_positions[key_index] != Block::NameMap::const_iterator{}
&& name == prev_positions[key_index]->first)
{
return prev_positions[key_index]->getMapped();
return prev_positions[key_index]->second;
}
else
{
auto * it = name_map.find(name);
const auto it = name_map.find(name);
if (it)
if (it != name_map.end())
{
if (key_index < prev_positions.size())
prev_positions[key_index] = it;
return it->getMapped();
return it->second;
}
else
return UNKNOWN_FIELD;

View File

@ -91,7 +91,7 @@ private:
Block::NameMap name_map;
/// Cached search results for previous row (keyed as index in JSON object) - used as a hint.
std::vector<Block::NameMap::LookupResult> prev_positions;
std::vector<Block::NameMap::const_iterator> prev_positions;
DataTypes types;

View File

@ -128,7 +128,7 @@ Chunk JSONColumnsBlockInputFormatBase::generate()
{
/// Check if this name appears in header. If no, skip this column or throw
/// an exception according to setting input_format_skip_unknown_fields
if (!name_to_index.has(*column_name))
if (name_to_index.find(*column_name) == name_to_index.end())
{
if (!format_settings.skip_unknown_fields)
throw Exception(ErrorCodes::INCORRECT_DATA, "Unknown column found in input data: {}", *column_name);

View File

@ -71,21 +71,20 @@ inline size_t JSONEachRowRowInputFormat::columnIndex(StringRef name, size_t key_
/// and a quick check to match the next expected field, instead of searching the hash table.
if (prev_positions.size() > key_index
&& prev_positions[key_index]
&& name == prev_positions[key_index]->getKey())
&& prev_positions[key_index] != Block::NameMap::const_iterator{}
&& name == prev_positions[key_index]->first)
{
return prev_positions[key_index]->getMapped();
return prev_positions[key_index]->second;
}
else
{
auto * it = name_map.find(name);
if (it)
const auto it = name_map.find(name);
if (it != name_map.end())
{
if (key_index < prev_positions.size())
prev_positions[key_index] = it;
return it->getMapped();
return it->second;
}
else
return UNKNOWN_FIELD;

View File

@ -71,11 +71,10 @@ private:
/// for row like {..., "non-nullable column name" : null, ...}
/// Hash table match `field name -> position in the block`. NOTE You can use perfect hash map.
using NameMap = HashMap<StringRef, size_t, StringRefHash>;
NameMap name_map;
Block::NameMap name_map;
/// Cached search results for previous row (keyed as index in JSON object) - used as a hint.
std::vector<NameMap::LookupResult> prev_positions;
std::vector<Block::NameMap::const_iterator> prev_positions;
bool allow_new_rows = true;

View File

@ -1,6 +1,7 @@
#pragma once
#include <Core/QueryProcessingStage.h>
#include <Core/UUID.h>
#include <Parsers/IAST_fwd.h>
#include <Processors/QueryPlan/QueryPlan.h>
#include <Processors/ResizeProcessor.h>

View File

@ -42,6 +42,10 @@ bool canUseProjectionForReadingStep(ReadFromMergeTree * reading)
if (reading->getContext()->getSettingsRef().allow_experimental_query_deduplication)
return false;
// Currently projections don't support settings that implicitly modify aggregate functions.
if (reading->getContext()->getSettingsRef().aggregate_functions_null_for_empty)
return false;
return true;
}

View File

@ -11,10 +11,11 @@
#include <Processors/Formats/Impl/ParquetBlockInputFormat.h>
#include <Processors/Formats/Impl/ArrowColumnToCHColumn.h>
#include <Formats/FormatFactory.h>
#include <boost/algorithm/string/case_conv.hpp>
#include <parquet/arrow/reader.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnNullable.h>
#include <IO/ReadHelpers.h>
#include <boost/algorithm/string/case_conv.hpp>
#include <parquet/arrow/reader.h>
#include <ranges>
namespace fs = std::filesystem;

View File

@ -1,6 +1,7 @@
#include <Storages/DataLakes/HudiMetadataParser.h>
#include <Common/logger_useful.h>
#include <ranges>
#include <base/find_symbols.h>
#include <Poco/String.h>
#include "config.h"
#include <filesystem>

View File

@ -12,6 +12,7 @@
#include <Storages/StorageS3.h>
#include <Processors/Formats/Impl/AvroRowInputFormat.h>
#include <Formats/FormatFactory.h>
#include <IO/ReadHelpers.h>
#include <Poco/JSON/Array.h>
#include <Poco/JSON/Object.h>

View File

@ -7,7 +7,7 @@
namespace DB
{
/* Implements storage in the MongoDB database.
* Use ENGINE = mysql(host_port, database_name, table_name, user_name, password)
* Use ENGINE = MongoDB(host:port, database, collection, user, password [, options]);
* Read only.
*/

View File

@ -16,6 +16,7 @@
#include <Poco/URI.h>
#include <IO/S3/getObjectInfo.h>
#include <IO/CompressionMethod.h>
#include <IO/SeekableReadBuffer.h>
#include <Interpreters/Context.h>
#include <Interpreters/threadPoolCallbackRunner.h>
#include <Storages/Cache/SchemaCache.h>

View File

@ -530,6 +530,7 @@ class SettingsRandomizer:
"max_threads": lambda: random.randint(1, 64),
"optimize_or_like_chain": lambda: random.randint(0, 1),
"optimize_read_in_order": lambda: random.randint(0, 1),
"enable_multiple_prewhere_read_steps": lambda: random.randint(0, 1),
"read_in_order_two_level_merge_threshold": lambda: random.randint(0, 100),
"optimize_aggregation_in_order": lambda: random.randint(0, 1),
"aggregation_in_order_max_block_bytes": lambda: random.randint(0, 50000000),

View File

@ -0,0 +1,8 @@
DROP TABLE IF EXISTS t1;
CREATE TABLE t1 (c0 Int32, PRIMARY KEY (c0)) ENGINE=MergeTree;
INSERT INTO t1 VALUES (1554690688);
SELECT MIN(t1.c0) FROM t1 SETTINGS aggregate_functions_null_for_empty = 1;
DROP TABLE IF EXISTS t1;

View File

@ -292,6 +292,35 @@ SELECT
x;
{'age':'31','last_key':'last_value','name':'neymar','nationality':'brazil','team':'psg'}
-- { echoOn }
SET extract_kvp_max_pairs_per_row = 2;
-- Should be allowed because it no longer exceeds the max number of pairs
-- expected output: {'key1':'value1','key2':'value2'}
WITH
extractKeyValuePairs('key1:value1,key2:value2') AS s_map,
CAST(
arrayMap(
(x) -> (x, s_map[x]), arraySort(mapKeys(s_map))
),
'Map(String,String)'
) AS x
SELECT
x;
{'key1':'value1','key2':'value2'}
SET extract_kvp_max_pairs_per_row = 0;
-- Should be allowed because max pairs per row is set to 0 (unlimited)
-- expected output: {'key1':'value1','key2':'value2'}
WITH
extractKeyValuePairs('key1:value1,key2:value2') AS s_map,
CAST(
arrayMap(
(x) -> (x, s_map[x]), arraySort(mapKeys(s_map))
),
'Map(String,String)'
) AS x
SELECT
x;
{'key1':'value1','key2':'value2'}
-- should not fail because pair delimiters contains 8 characters, which is within the limit
WITH
extractKeyValuePairs('not_important', ':', '12345678', '\'') AS s_map,

View File

@ -414,7 +414,49 @@ WITH
SELECT
x; -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH}
-- Should fail because it exceeds the max number of pairs
SET extract_kvp_max_pairs_per_row = 1;
WITH
extractKeyValuePairs('key1:value1,key2:value2') AS s_map,
CAST(
arrayMap(
(x) -> (x, s_map[x]), arraySort(mapKeys(s_map))
),
'Map(String,String)'
) AS x
SELECT
x; -- {serverError LIMIT_EXCEEDED}
-- { echoOn }
SET extract_kvp_max_pairs_per_row = 2;
-- Should be allowed because it no longer exceeds the max number of pairs
-- expected output: {'key1':'value1','key2':'value2'}
WITH
extractKeyValuePairs('key1:value1,key2:value2') AS s_map,
CAST(
arrayMap(
(x) -> (x, s_map[x]), arraySort(mapKeys(s_map))
),
'Map(String,String)'
) AS x
SELECT
x;
SET extract_kvp_max_pairs_per_row = 0;
-- Should be allowed because max pairs per row is set to 0 (unlimited)
-- expected output: {'key1':'value1','key2':'value2'}
WITH
extractKeyValuePairs('key1:value1,key2:value2') AS s_map,
CAST(
arrayMap(
(x) -> (x, s_map[x]), arraySort(mapKeys(s_map))
),
'Map(String,String)'
) AS x
SELECT
x;
-- should not fail because pair delimiters contains 8 characters, which is within the limit
WITH
extractKeyValuePairs('not_important', ':', '12345678', '\'') AS s_map,

View File

@ -0,0 +1,13 @@
128
256
128
256
127
255
126
255
64
UInt8
UInt16
UInt8
UInt16

View File

@ -0,0 +1,19 @@
SELECT bitCount(CAST(-1 AS UInt128));
SELECT bitCount(CAST(-1 AS UInt256));
SELECT bitCount(CAST(-1 AS Int128));
SELECT bitCount(CAST(-1 AS Int256));
SELECT bitCount(CAST(-1 AS UInt128) - 1);
SELECT bitCount(CAST(-1 AS UInt256) - 2);
SELECT bitCount(CAST(-1 AS Int128) - 3);
SELECT bitCount(CAST(-1 AS Int256) - 4);
SELECT bitCount(CAST(0xFFFFFFFFFFFFFFFF AS Int256));
SELECT toTypeName(bitCount(1::UInt128));
SELECT toTypeName(bitCount(1::UInt256));
SELECT toTypeName(bitCount(1::Int128));
SELECT toTypeName(bitCount(1::Int256));