Merge remote-tracking branch 'origin/master' into pr-local-plan

Igor Nikonov 2024-07-24 19:51:47 +00:00
commit 420075ada0
140 changed files with 2672 additions and 1820 deletions


@@ -269,7 +269,7 @@ jobs:
 - name: Check Workflow results
 run: |
 export WORKFLOW_RESULT_FILE="/tmp/workflow_results.json"
-cat >> "$WORKFLOW_RESULT_FILE" << 'EOF'
+cat > "$WORKFLOW_RESULT_FILE" << 'EOF'
 ${{ toJson(needs) }}
 EOF
 python3 ./tests/ci/ci_buddy.py --check-wf-status


@@ -135,7 +135,7 @@ jobs:
 - name: Check Workflow results
 run: |
 export WORKFLOW_RESULT_FILE="/tmp/workflow_results.json"
-cat >> "$WORKFLOW_RESULT_FILE" << 'EOF'
+cat > "$WORKFLOW_RESULT_FILE" << 'EOF'
 ${{ toJson(needs) }}
 EOF
 python3 ./tests/ci/ci_buddy.py --check-wf-status


@@ -108,7 +108,7 @@ jobs:
 - name: Check Workflow results
 run: |
 export WORKFLOW_RESULT_FILE="/tmp/workflow_results.json"
-cat >> "$WORKFLOW_RESULT_FILE" << 'EOF'
+cat > "$WORKFLOW_RESULT_FILE" << 'EOF'
 ${{ toJson(needs) }}
 EOF
 python3 ./tests/ci/ci_buddy.py --check-wf-status


@@ -54,7 +54,7 @@ jobs:
 - name: Check Workflow results
 run: |
 export WORKFLOW_RESULT_FILE="/tmp/workflow_results.json"
-cat >> "$WORKFLOW_RESULT_FILE" << 'EOF'
+cat > "$WORKFLOW_RESULT_FILE" << 'EOF'
 ${{ toJson(needs) }}
 EOF
 python3 ./tests/ci/ci_buddy.py --check-wf-status


@@ -152,8 +152,9 @@ jobs:
 CheckReadyForMerge:
 if: ${{ !cancelled() }}
-# Test_2 or Test_3 must not have jobs required for Mergeable check
-needs: [RunConfig, BuildDockers, StyleCheck, FastTest, Builds_1, Builds_2, Builds_Report, Tests_1]
+# Test_2 or Test_3 do not have the jobs required for Mergeable check,
+# however, set them as "needs" to get all checks results before the automatic merge occurs.
+needs: [RunConfig, BuildDockers, StyleCheck, FastTest, Builds_1, Builds_2, Builds_Report, Tests_1, Tests_2, Tests_3]
 runs-on: [self-hosted, style-checker-aarch64]
 steps:
 - name: Check out repository code
@@ -168,7 +169,7 @@ jobs:
 - name: Check Workflow results
 run: |
 export WORKFLOW_RESULT_FILE="/tmp/workflow_results.json"
-cat >> "$WORKFLOW_RESULT_FILE" << 'EOF'
+cat > "$WORKFLOW_RESULT_FILE" << 'EOF'
 ${{ toJson(needs) }}
 EOF
 python3 ./tests/ci/ci_buddy.py --check-wf-status


@@ -489,7 +489,7 @@ jobs:
 - name: Check Workflow results
 run: |
 export WORKFLOW_RESULT_FILE="/tmp/workflow_results.json"
-cat >> "$WORKFLOW_RESULT_FILE" << 'EOF'
+cat > "$WORKFLOW_RESULT_FILE" << 'EOF'
 ${{ toJson(needs) }}
 EOF


@@ -9,6 +9,7 @@ set(DATASKETCHES_LIBRARY theta)
 add_library(_datasketches INTERFACE)
 target_include_directories(_datasketches SYSTEM BEFORE INTERFACE
 "${ClickHouse_SOURCE_DIR}/contrib/datasketches-cpp/common/include"
+"${ClickHouse_SOURCE_DIR}/contrib/datasketches-cpp/count/include"
 "${ClickHouse_SOURCE_DIR}/contrib/datasketches-cpp/theta/include")
 add_library(ch_contrib::datasketches ALIAS _datasketches)


@@ -6,7 +6,7 @@ ARG apt_archive="http://archive.ubuntu.com"
 RUN sed -i "s|http://archive.ubuntu.com|$apt_archive|g" /etc/apt/sources.list
 RUN apt-get update --yes \
-&& env DEBIAN_FRONTEND=noninteractive apt-get install wget git default-jdk maven python3 --yes --no-install-recommends \
+&& env DEBIAN_FRONTEND=noninteractive apt-get install wget git python3 default-jdk maven --yes --no-install-recommends \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/*


@@ -999,6 +999,10 @@ They can be used for prewhere optimization only if we enable `set allow_statisti
 [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog) sketches which provide an estimation how many distinct values a column contains.
+- `count_min`
+[Count-min](https://en.wikipedia.org/wiki/Count%E2%80%93min_sketch) sketches which provide an approximate count of the frequency of each value in a column.
 ## Column-level Settings {#column-level-settings}
 Certain MergeTree settings can be override at column level:
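For illustration, a minimal sketch of how a `count_min` statistic would be attached to a column (assumed SQL syntax for this version; the table and column names are hypothetical):

```sql
SET allow_experimental_statistics = 1;
SET allow_statistics_optimize = 1;

CREATE TABLE example_tab
(
    key UInt64,
    city String STATISTICS(count_min)  -- approximate per-value frequencies
)
ENGINE = MergeTree
ORDER BY key;
```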


@@ -55,7 +55,7 @@ CMPLNT_FR_TM Nullable(String)
 ```
 :::tip
-Most of the time the above command will let you know which fields in the input data are numeric, and which are strings, and which are tuples. This is not always the case. Because ClickHouse is routineley used with datasets containing billions of records there is a default number (100) of rows examined to [infer the schema](/docs/en/integrations/data-ingestion/data-formats/json.md#relying-on-schema-inference) in order to avoid parsing billions of rows to infer the schema. The response below may not match what you see, as the dataset is updated several times each year. Looking at the Data Dictionary you can see that CMPLNT_NUM is specified as text, and not numeric. By overriding the default of 100 rows for inference with the setting `SETTINGS input_format_max_rows_to_read_for_schema_inference=2000`
+Most of the time the above command will let you know which fields in the input data are numeric, and which are strings, and which are tuples. This is not always the case. Because ClickHouse is routineley used with datasets containing billions of records there is a default number (100) of rows examined to [infer the schema](/en/integrations/data-formats/json/inference) in order to avoid parsing billions of rows to infer the schema. The response below may not match what you see, as the dataset is updated several times each year. Looking at the Data Dictionary you can see that CMPLNT_NUM is specified as text, and not numeric. By overriding the default of 100 rows for inference with the setting `SETTINGS input_format_max_rows_to_read_for_schema_inference=2000`
 you can get a better idea of the content.
 Note: as of version 22.5 the default is now 25,000 rows for inferring the schema, so only change the setting if you are on an older version or if you need more than 25,000 rows to be sampled.
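A hedged sketch of the kind of command this tip refers to (the input file name is hypothetical):

```sql
DESCRIBE file('complaints.tsv')
SETTINGS input_format_max_rows_to_read_for_schema_inference = 2000;
```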


@@ -7,7 +7,7 @@ keywords: [object, data type]
 # Object Data Type (deprecated)
-**This feature is not production-ready and is now deprecated.** If you need to work with JSON documents, consider using [this guide](/docs/en/integrations/data-ingestion/data-formats/json) instead. A new implementation to support JSON object is in progress and can be tracked [here](https://github.com/ClickHouse/ClickHouse/issues/54864).
+**This feature is not production-ready and is now deprecated.** If you need to work with JSON documents, consider using [this guide](/docs/en/integrations/data-formats/json/overview) instead. A new implementation to support JSON object is in progress and can be tracked [here](https://github.com/ClickHouse/ClickHouse/issues/54864).
 <hr />


@@ -49,7 +49,7 @@ enum class QueryTreeNodeType : uint8_t
 /// Convert query tree node type to string
 const char * toString(QueryTreeNodeType type);
-/** Query tree is semantical representation of query.
+/** Query tree is a semantic representation of query.
  * Query tree node represent node in query tree.
  * IQueryTreeNode is base class for all query tree nodes.
  *


@@ -543,7 +543,7 @@ if (TARGET ch_contrib::libpqxx)
 endif()
 if (TARGET ch_contrib::datasketches)
-target_link_libraries (clickhouse_aggregate_functions PRIVATE ch_contrib::datasketches)
+dbms_target_link_libraries(PUBLIC ch_contrib::datasketches)
 endif ()
 target_link_libraries (clickhouse_common_io PRIVATE ch_contrib::lz4)


@@ -33,7 +33,7 @@ size_t toMilliseconds(auto duration)
 return std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
 }
-const auto epsilon = 500us;
+const auto epsilon = 1ms;
 class ResolvePoolMock : public DB::HostResolver
 {
@@ -358,53 +358,59 @@ void check_no_failed_address(size_t iteration, auto & resolver, auto & addresses
 TEST_F(ResolvePoolTest, BannedForConsiquenceFail)
 {
-auto history = 5ms;
+auto history = 10ms;
 auto resolver = make_resolver(toMilliseconds(history));
 auto failed_addr = resolver->resolve();
 ASSERT_TRUE(addresses.contains(*failed_addr));
-auto start_at = now();
 failed_addr.setFail();
+auto start_at = now();
 ASSERT_EQ(3, CurrentMetrics::get(metrics.active_count));
 ASSERT_EQ(1, CurrentMetrics::get(metrics.banned_count));
 check_no_failed_address(1, resolver, addresses, failed_addr, metrics, start_at + history - epsilon);
 sleep_until(start_at + history + epsilon);
+start_at = now();
 resolver->update();
 ASSERT_EQ(3, CurrentMetrics::get(metrics.active_count));
 ASSERT_EQ(0, CurrentMetrics::get(metrics.banned_count));
 failed_addr.setFail();
+start_at = now();
 check_no_failed_address(2, resolver, addresses, failed_addr, metrics, start_at + history - epsilon);
 sleep_until(start_at + history + epsilon);
+start_at = now();
 resolver->update();
+// too much time has passed
+if (now() > start_at + 2*history - epsilon)
+return;
 ASSERT_EQ(3, CurrentMetrics::get(metrics.active_count));
 ASSERT_EQ(1, CurrentMetrics::get(metrics.banned_count));
 // ip still banned adter history_ms + update, because it was his second consiquent fail
-check_no_failed_address(2, resolver, addresses, failed_addr, metrics, start_at + history - epsilon);
+check_no_failed_address(2, resolver, addresses, failed_addr, metrics, start_at + 2*history - epsilon);
 }
 TEST_F(ResolvePoolTest, NoAditionalBannForConcurrentFail)
 {
-auto history = 5ms;
+auto history = 10ms;
 auto resolver = make_resolver(toMilliseconds(history));
 auto failed_addr = resolver->resolve();
 ASSERT_TRUE(addresses.contains(*failed_addr));
-auto start_at = now();
 failed_addr.setFail();
 failed_addr.setFail();
 failed_addr.setFail();
+auto start_at = now();
 ASSERT_EQ(3, CurrentMetrics::get(metrics.active_count));
 ASSERT_EQ(1, CurrentMetrics::get(metrics.banned_count));
@@ -413,6 +419,7 @@ TEST_F(ResolvePoolTest, NoAditionalBannForConcurrentFail)
 sleep_until(start_at + history + epsilon);
 resolver->update();
 // ip is cleared after just 1 history_ms interval.
 ASSERT_EQ(3, CurrentMetrics::get(metrics.active_count));
 ASSERT_EQ(0, CurrentMetrics::get(metrics.banned_count));


@@ -383,7 +383,10 @@ void KeeperServer::launchRaftServer(const Poco::Util::AbstractConfiguration & co
 LockMemoryExceptionInThread::removeUniqueLock();
 };
-asio_opts.thread_pool_size_ = getNumberOfPhysicalCPUCores();
+/// At least 16 threads for network communication in asio.
+/// asio is async framework, so even with 1 thread it should be ok, but
+/// still as safeguard it's better to have some redundant capacity here
+asio_opts.thread_pool_size_ = std::max(16U, getNumberOfPhysicalCPUCores());
 if (state_manager->isSecure())
 {


@@ -125,23 +125,6 @@ DataTypePtr DataTypeFactory::getImpl(const String & family_name_param, const AST
 {
 String family_name = getAliasToOrName(family_name_param);
-if (endsWith(family_name, "WithDictionary"))
-{
-ASTPtr low_cardinality_params = std::make_shared<ASTExpressionList>();
-String param_name = family_name.substr(0, family_name.size() - strlen("WithDictionary"));
-if (parameters)
-{
-auto func = std::make_shared<ASTFunction>();
-func->name = param_name;
-func->arguments = parameters;
-low_cardinality_params->children.push_back(func);
-}
-else
-low_cardinality_params->children.push_back(std::make_shared<ASTIdentifier>(param_name));
-return getImpl<nullptr_on_error>("LowCardinality", low_cardinality_params);
-}
 const auto * creator = findCreatorByName<nullptr_on_error>(family_name);
 if constexpr (nullptr_on_error)
 {


@@ -739,7 +739,8 @@ public:
 {
 NumberType value;
-tryGetNumericValueFromJSONElement<JSONParser, NumberType>(value, element, convert_bool_to_integer, error);
+if (!tryGetNumericValueFromJSONElement<JSONParser, NumberType>(value, element, convert_bool_to_integer, error))
+return false;
 auto & col_vec = assert_cast<ColumnVector<NumberType> &>(dest);
 col_vec.insertValue(value);
 return true;


@@ -17,13 +17,19 @@
 namespace DB
 {
-IInterpreterUnionOrSelectQuery::IInterpreterUnionOrSelectQuery(const DB::ASTPtr& query_ptr_,
-const DB::ContextMutablePtr& context_, const DB::SelectQueryOptions& options_)
+IInterpreterUnionOrSelectQuery::IInterpreterUnionOrSelectQuery(const ASTPtr & query_ptr_,
+const ContextMutablePtr & context_, const SelectQueryOptions & options_)
 : query_ptr(query_ptr_)
 , context(context_)
 , options(options_)
 , max_streams(context->getSettingsRef().max_threads)
 {
+/// FIXME All code here will work with the old analyzer, however for views over Distributed tables
+/// it's possible that new analyzer will be enabled in ::getQueryProcessingStage method
+/// of the underlying storage when all other parts of infrastructure are not ready for it
+/// (built with old analyzer).
+context->setSetting("allow_experimental_analyzer", false);
 if (options.shard_num)
 context->addSpecialScalar(
 "_shard_num",


@@ -75,7 +75,6 @@
 #include <Storages/MergeTree/MergeTreeWhereOptimizer.h>
 #include <Storages/StorageDistributed.h>
-#include <Storages/StorageDummy.h>
 #include <Storages/StorageMerge.h>
 #include <Storages/StorageValues.h>
 #include <Storages/StorageView.h>
@@ -214,11 +213,11 @@ InterpreterSelectQuery::InterpreterSelectQuery(
 {}
 InterpreterSelectQuery::InterpreterSelectQuery(
 const ASTPtr & query_ptr_,
 const ContextPtr & context_,
 Pipe input_pipe_,
 const SelectQueryOptions & options_)
 : InterpreterSelectQuery(query_ptr_, context_, std::move(input_pipe_), nullptr, options_.copy().noSubquery())
 {}
 InterpreterSelectQuery::InterpreterSelectQuery(
@@ -227,18 +226,15 @@ InterpreterSelectQuery::InterpreterSelectQuery(
 const StoragePtr & storage_,
 const StorageMetadataPtr & metadata_snapshot_,
 const SelectQueryOptions & options_)
-: InterpreterSelectQuery(
-query_ptr_, context_, std::nullopt, storage_, options_.copy().noSubquery(), {}, metadata_snapshot_)
-{
-}
+: InterpreterSelectQuery(query_ptr_, context_, std::nullopt, storage_, options_.copy().noSubquery(), {}, metadata_snapshot_)
+{}
 InterpreterSelectQuery::InterpreterSelectQuery(
 const ASTPtr & query_ptr_,
 const ContextPtr & context_,
 const SelectQueryOptions & options_,
 PreparedSetsPtr prepared_sets_)
-: InterpreterSelectQuery(
-query_ptr_, context_, std::nullopt, nullptr, options_, {}, {}, prepared_sets_)
+: InterpreterSelectQuery(query_ptr_, context_, std::nullopt, nullptr, options_, {}, {}, prepared_sets_)
 {}
 InterpreterSelectQuery::~InterpreterSelectQuery() = default;


@@ -26,7 +26,6 @@ class Logger;
 namespace DB
 {
-class SubqueryForSet;
 class InterpreterSelectWithUnionQuery;
 class Context;
 class QueryPlan;


@@ -545,7 +545,7 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID
 catch (Exception & e)
 {
 if (e.code() == ErrorCodes::UNEXPECTED_DATA_AFTER_PARSED_VALUE)
-throw Exception(ErrorCodes::TYPE_MISMATCH, "Cannot convert string {} to type {}", src.get<String>(), type.getName());
+throw Exception(ErrorCodes::TYPE_MISMATCH, "Cannot convert string '{}' to type {}", src.get<String>(), type.getName());
 e.addMessage(fmt::format("while converting '{}' to {}", src.get<String>(), type.getName()));
 throw;


@@ -147,7 +147,7 @@ INSTANTIATE_TEST_SUITE_P(
 DecimalField(DateTime64(123 * Day * 1'000'000), 6)
 }
 })
 );
 INSTANTIATE_TEST_SUITE_P(
 DateTimeToDateTime64,
@@ -179,3 +179,84 @@ INSTANTIATE_TEST_SUITE_P(
 },
 })
 );
+INSTANTIATE_TEST_SUITE_P(
+StringToNumber,
+ConvertFieldToTypeTest,
+::testing::ValuesIn(std::initializer_list<ConvertFieldToTypeTestParams>{
+{
+"String",
+Field("1"),
+"Int8",
+Field(1)
+},
+{
+"String",
+Field("256"),
+"Int8",
+Field()
+},
+{
+"String",
+Field("not a number"),
+"Int8",
+{}
+},
+{
+"String",
+Field("1.1"),
+"Int8",
+{} /// we can not convert '1.1' to Int8
+},
+{
+"String",
+Field("1.1"),
+"Float64",
+Field(1.1)
+},
+})
+);
+INSTANTIATE_TEST_SUITE_P(
+NumberToString,
+ConvertFieldToTypeTest,
+::testing::ValuesIn(std::initializer_list<ConvertFieldToTypeTestParams>{
+{
+"Int8",
+Field(1),
+"String",
+Field("1")
+},
+{
+"Int8",
+Field(-1),
+"String",
+Field("-1")
+},
+{
+"Float64",
+Field(1.1),
+"String",
+Field("1.1")
+},
+})
+);
+INSTANTIATE_TEST_SUITE_P(
+StringToDate,
+ConvertFieldToTypeTest,
+::testing::ValuesIn(std::initializer_list<ConvertFieldToTypeTestParams>{
+{
+"String",
+Field("2024-07-12"),
+"Date",
+Field(static_cast<UInt16>(19916))
+},
+{
+"String",
+Field("not a date"),
+"Date",
+{}
+},
+})
+);


@@ -9,7 +9,7 @@ namespace DB
 {
-/** The SELECT subquery is in parenthesis.
+/** The SELECT subquery, in parentheses.
 */
 class ParserSubquery : public IParserBase
 {


@@ -11,15 +11,12 @@
 namespace DB
 {
 bool ParserDescribeTableQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
 {
 ParserKeyword s_describe(Keyword::DESCRIBE);
 ParserKeyword s_desc(Keyword::DESC);
 ParserKeyword s_table(Keyword::TABLE);
 ParserKeyword s_settings(Keyword::SETTINGS);
-ParserToken s_dot(TokenType::Dot);
-ParserIdentifier name_p;
 ParserSetQuery parser_settings(true);
 ASTPtr database;
@@ -53,5 +50,4 @@ bool ParserDescribeTableQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & ex
 return true;
 }
 }


@@ -1,48 +1,48 @@
-#include <Interpreters/AsynchronousInsertQueue.h>
-#include <Interpreters/Squashing.h>
-#include <Parsers/ASTInsertQuery.h>
 #include <algorithm>
 #include <exception>
 #include <memory>
 #include <mutex>
-#include <vector>
 #include <string_view>
+#include <vector>
-#include <Poco/Net/NetException.h>
-#include <Poco/Net/SocketAddress.h>
-#include <Poco/Util/LayeredConfiguration.h>
-#include <Common/CurrentThread.h>
-#include <Common/Stopwatch.h>
-#include <Common/NetException.h>
-#include <Common/setThreadName.h>
-#include <Common/OpenSSLHelpers.h>
-#include <IO/Progress.h>
+#include <Access/AccessControl.h>
+#include <Access/Credentials.h>
 #include <Compression/CompressedReadBuffer.h>
 #include <Compression/CompressedWriteBuffer.h>
-#include <IO/ReadBufferFromPocoSocket.h>
-#include <IO/WriteBufferFromPocoSocket.h>
-#include <IO/LimitReadBuffer.h>
-#include <IO/ReadHelpers.h>
-#include <IO/WriteHelpers.h>
+#include <Compression/CompressionFactory.h>
+#include <Core/ExternalTable.h>
+#include <Core/ServerSettings.h>
 #include <Formats/NativeReader.h>
 #include <Formats/NativeWriter.h>
-#include <Interpreters/executeQuery.h>
-#include <Interpreters/TablesStatus.h>
+#include <IO/LimitReadBuffer.h>
+#include <IO/Progress.h>
+#include <IO/ReadBufferFromPocoSocket.h>
+#include <IO/ReadHelpers.h>
+#include <IO/WriteBufferFromPocoSocket.h>
+#include <IO/WriteHelpers.h>
+#include <Interpreters/AsynchronousInsertQueue.h>
 #include <Interpreters/InternalTextLogsQueue.h>
 #include <Interpreters/OpenTelemetrySpanLog.h>
 #include <Interpreters/Session.h>
+#include <Interpreters/Squashing.h>
+#include <Interpreters/TablesStatus.h>
+#include <Interpreters/executeQuery.h>
+#include <Parsers/ASTInsertQuery.h>
 #include <Server/TCPServer.h>
-#include <Storages/StorageReplicatedMergeTree.h>
 #include <Storages/MergeTree/MergeTreeDataPartUUID.h>
 #include <Storages/ObjectStorage/StorageObjectStorageCluster.h>
-#include <Core/ExternalTable.h>
-#include <Core/ServerSettings.h>
-#include <Access/AccessControl.h>
-#include <Access/Credentials.h>
-#include <Compression/CompressionFactory.h>
-#include <Common/logger_useful.h>
+#include <Storages/StorageReplicatedMergeTree.h>
+#include <Poco/Net/NetException.h>
+#include <Poco/Net/SocketAddress.h>
+#include <Poco/Util/LayeredConfiguration.h>
 #include <Common/CurrentMetrics.h>
+#include <Common/CurrentThread.h>
+#include <Common/NetException.h>
+#include <Common/OpenSSLHelpers.h>
+#include <Common/Stopwatch.h>
+#include <Common/logger_useful.h>
+#include <Common/scope_guard_safe.h>
+#include <Common/setThreadName.h>
 #include <Common/thread_local_rng.h>
-#include <fmt/format.h>
 #include <Processors/Executors/PullingAsyncPipelineExecutor.h>
 #include <Processors/Executors/PushingPipelineExecutor.h>
@@ -61,6 +61,8 @@
 #include <Common/config_version.h>
+#include <fmt/format.h>
 using namespace std::literals;
 using namespace DB;
@@ -1036,6 +1038,17 @@ void TCPHandler::processOrdinaryQuery()
 PullingAsyncPipelineExecutor executor(pipeline);
 CurrentMetrics::Increment query_thread_metric_increment{CurrentMetrics::QueryThread};
+/// The following may happen:
+/// * current thread is holding the lock
+/// * because of the exception we unwind the stack and call the destructor of `executor`
+/// * the destructor calls cancel() and waits for all query threads to finish
+/// * at the same time one of the query threads is trying to acquire the lock, e.g. inside `merge_tree_read_task_callback`
+/// * deadlock
+SCOPE_EXIT({
+if (out_lock.owns_lock())
+out_lock.unlock();
+});
 Block block;
 while (executor.pull(block, interactive_delay / 1000))
 {
@@ -1079,8 +1092,7 @@ void TCPHandler::processOrdinaryQuery()
 }
 /// This lock wasn't acquired before and we make .lock() call here
-/// so everything under this line is covered even together
-/// with sendProgress() out of the scope
+/// so everything under this line is covered.
 out_lock.lock();
 /** If data has run out, we will send the profiling data and total values to
@@ -1107,6 +1119,7 @@ void TCPHandler::processOrdinaryQuery()
 last_sent_snapshots.clear();
 }
+out_lock.lock();
 sendProgress();
 }


@@ -304,7 +304,7 @@ void RefreshTask::refreshTask()
 {
 PreformattedMessage message = getCurrentExceptionMessageAndPattern(true);
 auto text = message.text;
-message.text = fmt::format("Refresh failed: {}", message.text);
+message.text = fmt::format("Refresh view {} failed: {}", view->getStorageID().getFullTableName(), message.text);
 LOG_ERROR(log, message);
 exception = text;
 }


@@ -16,6 +16,7 @@
 #include <Storages/MergeTree/DataPartStorageOnDiskFull.h>
 #include <Storages/MergeTree/MergeTreeData.h>
 #include <Storages/MergeTree/MergeTreeSettings.h>
+#include <Storages/MergeTree/checkDataPart.h>
 #include <Common/CurrentMetrics.h>
 #include <Common/NetException.h>
 #include <Common/randomDelay.h>
@@ -224,14 +225,18 @@ void Service::processQuery(const HTMLForm & params, ReadBuffer & /*body*/, Write
 }
 catch (const Exception & e)
 {
-if (e.code() != ErrorCodes::ABORTED && e.code() != ErrorCodes::CANNOT_WRITE_TO_OSTREAM)
+if (e.code() != ErrorCodes::CANNOT_WRITE_TO_OSTREAM
+&& !isRetryableException(std::current_exception()))
+{
 report_broken_part();
+}
 throw;
 }
 catch (...)
 {
-report_broken_part();
+if (!isRetryableException(std::current_exception()))
+report_broken_part();
 throw;
 }
 }


@@ -499,8 +499,9 @@ ConditionSelectivityEstimator MergeTreeData::getConditionSelectivityEstimatorByP
 {
 auto stats = part->loadStatistics();
 /// TODO: We only have one stats file for every part.
+result.addRows(part->rows_count);
 for (const auto & stat : stats)
-result.merge(part->info.getPartNameV1(), part->rows_count, stat);
+result.merge(part->info.getPartNameV1(), stat);
 }
 catch (...)
 {
@@ -515,8 +516,9 @@ ConditionSelectivityEstimator MergeTreeData::getConditionSelectivityEstimatorByP
 if (!partition_pruner.canBePruned(*part))
 {
 auto stats = part->loadStatistics();
+result.addRows(part->rows_count);
 for (const auto & stat : stats)
-result.merge(part->info.getPartNameV1(), part->rows_count, stat);
+result.merge(part->info.getPartNameV1(), stat);
 }
 }
 catch (...)


@@ -15,16 +15,11 @@
 #include <Processors/QueryPlan/FilterStep.h>
 #include <Common/logger_useful.h>
 #include <Processors/Merges/Algorithms/MergeTreePartLevelInfo.h>
+#include <Storages/MergeTree/checkDataPart.h>
 namespace DB
 {
-namespace ErrorCodes
-{
-extern const int MEMORY_LIMIT_EXCEEDED;
-}
 /// Lightweight (in terms of logic) stream for reading single part from
 /// MergeTree, used for merges and mutations.
 ///
@@ -281,7 +276,7 @@ try
 catch (...)
 {
 /// Suspicion of the broken part. A part is added to the queue for verification.
-if (getCurrentExceptionCode() != ErrorCodes::MEMORY_LIMIT_EXCEEDED)
+if (!isRetryableException(std::current_exception()))
 storage.reportBrokenPart(data_part);
 throw;
 }


@@ -36,11 +36,13 @@ namespace ErrorCodes
 extern const int CANNOT_ALLOCATE_MEMORY;
 extern const int CANNOT_MUNMAP;
 extern const int CANNOT_MREMAP;
+extern const int CANNOT_SCHEDULE_TASK;
 extern const int UNEXPECTED_FILE_IN_DATA_PART;
 extern const int NO_FILE_IN_DATA_PART;
 extern const int NETWORK_ERROR;
 extern const int SOCKET_TIMEOUT;
 extern const int BROKEN_PROJECTION;
+extern const int ABORTED;
 }
@@ -85,7 +87,9 @@ bool isRetryableException(std::exception_ptr exception_ptr)
 {
 return isNotEnoughMemoryErrorCode(e.code())
 || e.code() == ErrorCodes::NETWORK_ERROR
-|| e.code() == ErrorCodes::SOCKET_TIMEOUT;
+|| e.code() == ErrorCodes::SOCKET_TIMEOUT
+|| e.code() == ErrorCodes::CANNOT_SCHEDULE_TASK
+|| e.code() == ErrorCodes::ABORTED;
 }
 catch (const Poco::Net::NetException &)
 {
@@ -329,16 +333,21 @@ static IMergeTreeDataPart::Checksums checkDataPart(
 projections_on_disk.erase(projection_file);
 }
-if (throw_on_broken_projection && !broken_projections_message.empty())
+if (throw_on_broken_projection)
 {
-throw Exception(ErrorCodes::BROKEN_PROJECTION, "{}", broken_projections_message);
-}
-if (require_checksums && !projections_on_disk.empty())
-{
-throw Exception(ErrorCodes::UNEXPECTED_FILE_IN_DATA_PART,
-"Found unexpected projection directories: {}",
-fmt::join(projections_on_disk, ","));
+if (!broken_projections_message.empty())
+{
+throw Exception(ErrorCodes::BROKEN_PROJECTION, "{}", broken_projections_message);
+}
+/// This one is actually not broken, just redundant files on disk which
+/// MergeTree will never use.
+if (require_checksums && !projections_on_disk.empty())
+{
+throw Exception(ErrorCodes::UNEXPECTED_FILE_IN_DATA_PART,
+"Found unexpected projection directories: {}",
+fmt::join(projections_on_disk, ","));
+}
 }
 if (is_cancelled())


@@ -163,7 +163,9 @@ ReadBufferIterator::Data ReadBufferIterator::next()
 {
 for (const auto & object_info : read_keys)
 {
-if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName(object_info->getFileName()))
+auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName(object_info->getFileName());
+/// Use this format only if we have a schema reader for it.
+if (format_from_file_name && FormatFactory::instance().checkIfFormatHasAnySchemaReader(*format_from_file_name))
 {
 format = format_from_file_name;
 break;
@@ -221,7 +223,9 @@ ReadBufferIterator::Data ReadBufferIterator::next()
 {
 for (auto it = read_keys.begin() + prev_read_keys_size; it != read_keys.end(); ++it)
 {
-if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName((*it)->getFileName()))
+auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName((*it)->getFileName());
+/// Use this format only if we have a schema reader for it.
+if (format_from_file_name && FormatFactory::instance().checkIfFormatHasAnySchemaReader(*format_from_file_name))
 {
 format = format_from_file_name;
 break;


@@ -16,7 +16,7 @@ void ConditionSelectivityEstimator::ColumnSelectivityEstimator::merge(String par
 part_statistics[part_name] = stats;
 }
-Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateLess(Float64 val, Float64 rows) const
+Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateLess(const Field & val, Float64 rows) const
 {
 if (part_statistics.empty())
 return default_normal_cond_factor * rows;
@@ -30,16 +30,19 @@ Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateLess(
 return result * rows / part_rows;
 }
-Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateGreater(Float64 val, Float64 rows) const
+Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateGreater(const Field & val, Float64 rows) const
 {
 return rows - estimateLess(val, rows);
 }
-Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateEqual(Float64 val, Float64 rows) const
+Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateEqual(const Field & val, Float64 rows) const
 {
 if (part_statistics.empty())
 {
-if (val < - threshold || val > threshold)
+auto float_val = StatisticsUtils::tryConvertToFloat64(val);
+if (!float_val)
+return default_unknown_cond_factor * rows;
+else if (float_val.value() < - threshold || float_val.value() > threshold)
 return default_normal_cond_factor * rows;
 else
 return default_good_cond_factor * rows;
@@ -87,7 +90,7 @@ static std::pair<String, Int32> tryToExtractSingleColumn(const RPNBuilderTreeNod
 return result;
 }
-std::pair<String, Float64> ConditionSelectivityEstimator::extractBinaryOp(const RPNBuilderTreeNode & node, const String & column_name) const
+std::pair<String, Field> ConditionSelectivityEstimator::extractBinaryOp(const RPNBuilderTreeNode & node, const String & column_name) const
 {
 if (!node.isFunction())
 return {};
@@ -123,48 +126,35 @@ std::pair<String, Float64> ConditionSelectivityEstimator::extractBinaryOp(const
 DataTypePtr output_type;
 if (!constant_node->tryGetConstant(output_value, output_type))
 return {};
-const auto type = output_value.getType();
-Float64 value;
-if (type == Field::Types::Int64)
-value = output_value.get<Int64>();
-else if (type == Field::Types::UInt64)
-value = output_value.get<UInt64>();
-else if (type == Field::Types::Float64)
-value = output_value.get<Float64>();
-else
-return {};
-return std::make_pair(function_name, value);
+return std::make_pair(function_name, output_value);
 }
 Float64 ConditionSelectivityEstimator::estimateRowCount(const RPNBuilderTreeNode & node) const
 {
 auto result = tryToExtractSingleColumn(node);
 if (result.second != 1)
-{
-return default_unknown_cond_factor;
-}
+return default_unknown_cond_factor * total_rows;
 String col = result.first;
 auto it = column_estimators.find(col);
 /// If there the estimator of the column is not found or there are no data at all,
 /// we use dummy estimation.
-bool dummy = total_rows == 0;
+bool dummy = false;
 ColumnSelectivityEstimator estimator;
 if (it != column_estimators.end())
-{
 estimator = it->second;
-}
 else
-{
 dummy = true;
-}
 auto [op, val] = extractBinaryOp(node, col);
 if (op == "equals")
 {
 if (dummy)
 {
-if (val < - threshold || val > threshold)
+auto float_val = StatisticsUtils::tryConvertToFloat64(val);
+if (!float_val || (float_val < - threshold || float_val > threshold))
 return default_normal_cond_factor * total_rows;
 else
 return default_good_cond_factor * total_rows;
@@ -187,13 +177,8 @@ Float64 ConditionSelectivityEstimator::estimateRowCount(const RPNBuilderTreeNode
 return default_unknown_cond_factor * total_rows;
 }
-void ConditionSelectivityEstimator::merge(String part_name, UInt64 part_rows, ColumnStatisticsPtr column_stat)
+void ConditionSelectivityEstimator::merge(String part_name, ColumnStatisticsPtr column_stat)
 {
-if (!part_names.contains(part_name))
-{
-total_rows += part_rows;
-part_names.insert(part_name);
-}
 if (column_stat != nullptr)
 column_estimators[column_stat->columnName()].merge(part_name, column_stat);
 }


@@ -1,6 +1,7 @@
 #pragma once
 #include <Storages/Statistics/Statistics.h>
+#include <Core/Field.h>
 namespace DB
 {
@@ -10,6 +11,14 @@ class RPNBuilderTreeNode;
 /// It estimates the selectivity of a condition.
 class ConditionSelectivityEstimator
 {
+public:
+/// TODO: Support the condition consists of CNF/DNF like (cond1 and cond2) or (cond3) ...
+/// Right now we only support simple condition like col = val / col < val
+Float64 estimateRowCount(const RPNBuilderTreeNode & node) const;
+void merge(String part_name, ColumnStatisticsPtr column_stat);
+void addRows(UInt64 part_rows) { total_rows += part_rows; }
 private:
 friend class ColumnStatistics;
 struct ColumnSelectivityEstimator
@@ -20,13 +29,15 @@ private:
 void merge(String part_name, ColumnStatisticsPtr stats);
-Float64 estimateLess(Float64 val, Float64 rows) const;
-Float64 estimateGreater(Float64 val, Float64 rows) const;
-Float64 estimateEqual(Float64 val, Float64 rows) const;
+Float64 estimateLess(const Field & val, Float64 rows) const;
+Float64 estimateGreater(const Field & val, Float64 rows) const;
+Float64 estimateEqual(const Field & val, Float64 rows) const;
 };
+std::pair<String, Field> extractBinaryOp(const RPNBuilderTreeNode & node, const String & column_name) const;
 static constexpr auto default_good_cond_factor = 0.1;
 static constexpr auto default_normal_cond_factor = 0.5;
 static constexpr auto default_unknown_cond_factor = 1.0;
@@ -35,16 +46,7 @@ private:
 static constexpr auto threshold = 2;
 UInt64 total_rows = 0;
-std::set<String> part_names;
 std::map<String, ColumnSelectivityEstimator> column_estimators;
-std::pair<String, Float64> extractBinaryOp(const RPNBuilderTreeNode & node, const String & column_name) const;
-public:
-/// TODO: Support the condition consists of CNF/DNF like (cond1 and cond2) or (cond3) ...
-/// Right now we only support simple condition like col = val / col < val
-Float64 estimateRowCount(const RPNBuilderTreeNode & node) const;
-void merge(String part_name, UInt64 part_rows, ColumnStatisticsPtr column_stat);
 };
 }


@@ -1,15 +1,18 @@
 #include <Storages/Statistics/Statistics.h>
+#include <IO/ReadHelpers.h>
+#include <IO/WriteHelpers.h>
+#include <Storages/ColumnsDescription.h>
 #include <Storages/Statistics/ConditionSelectivityEstimator.h>
+#include <Storages/Statistics/StatisticsCountMinSketch.h>
 #include <Storages/Statistics/StatisticsTDigest.h>
 #include <Storages/Statistics/StatisticsUniq.h>
 #include <Storages/StatisticsDescription.h>
-#include <Storages/ColumnsDescription.h>
-#include <IO/ReadHelpers.h>
-#include <IO/WriteHelpers.h>
 #include <Common/Exception.h>
 #include <Common/logger_useful.h>
+#include "config.h" /// USE_DATASKETCHES
 namespace DB
 {
@@ -24,6 +27,36 @@ enum StatisticsFileVersion : UInt16
 V0 = 0,
 };
+std::optional<Float64> StatisticsUtils::tryConvertToFloat64(const Field & field)
+{
+switch (field.getType())
+{
+case Field::Types::Int64:
+return field.get<Int64>();
+case Field::Types::UInt64:
+return field.get<UInt64>();
+case Field::Types::Float64:
+return field.get<Float64>();
+case Field::Types::Int128:
+return field.get<Int128>();
+case Field::Types::UInt128:
+return field.get<UInt128>();
+case Field::Types::Int256:
+return field.get<Int256>();
+case Field::Types::UInt256:
+return field.get<UInt256>();
+default:
+return {};
+}
+}
+std::optional<String> StatisticsUtils::tryConvertToString(const DB::Field & field)
+{
+if (field.getType() == Field::Types::String)
+return field.get<String>();
+return {};
+}
 IStatistics::IStatistics(const SingleStatisticsDescription & stat_)
 : stat(stat_)
 {
@@ -46,12 +79,12 @@ UInt64 IStatistics::estimateCardinality() const
 throw Exception(ErrorCodes::LOGICAL_ERROR, "Cardinality estimation is not implemented for this type of statistics");
 }
-Float64 IStatistics::estimateEqual(Float64 /*val*/) const
+Float64 IStatistics::estimateEqual(const Field & /*val*/) const
 {
 throw Exception(ErrorCodes::LOGICAL_ERROR, "Equality estimation is not implemented for this type of statistics");
 }
-Float64 IStatistics::estimateLess(Float64 /*val*/) const
+Float64 IStatistics::estimateLess(const Field & /*val*/) const
 {
 throw Exception(ErrorCodes::LOGICAL_ERROR, "Less-than estimation is not implemented for this type of statistics");
 }
@@ -66,27 +99,32 @@ Float64 IStatistics::estimateLess(Float64 /*val*/) const
 /// For that reason, all estimation are performed in a central place (here), and we don't simply pass the predicate to the first statistics
 /// object that supports it natively.
-Float64 ColumnStatistics::estimateLess(Float64 val) const
+Float64 ColumnStatistics::estimateLess(const Field & val) const
 {
 if (stats.contains(StatisticsType::TDigest))
 return stats.at(StatisticsType::TDigest)->estimateLess(val);
 return rows * ConditionSelectivityEstimator::default_normal_cond_factor;
 }
-Float64 ColumnStatistics::estimateGreater(Float64 val) const
+Float64 ColumnStatistics::estimateGreater(const Field & val) const
 {
 return rows - estimateLess(val);
 }
-Float64 ColumnStatistics::estimateEqual(Float64 val) const
+Float64 ColumnStatistics::estimateEqual(const Field & val) const
 {
-if (stats.contains(StatisticsType::Uniq) && stats.contains(StatisticsType::TDigest))
+auto float_val = StatisticsUtils::tryConvertToFloat64(val);
+if (float_val.has_value() && stats.contains(StatisticsType::Uniq) && stats.contains(StatisticsType::TDigest))
 {
 /// 2048 is the default number of buckets in TDigest. In this case, TDigest stores exactly one value (with many rows) for every bucket.
 if (stats.at(StatisticsType::Uniq)->estimateCardinality() < 2048)
 return stats.at(StatisticsType::TDigest)->estimateEqual(val);
 }
-if (val < - ConditionSelectivityEstimator::threshold || val > ConditionSelectivityEstimator::threshold)
+#if USE_DATASKETCHES
+if (stats.contains(StatisticsType::CountMinSketch))
+return stats.at(StatisticsType::CountMinSketch)->estimateEqual(val);
+#endif
+if (!float_val.has_value() && (float_val < - ConditionSelectivityEstimator::threshold || float_val > ConditionSelectivityEstimator::threshold))
 return rows * ConditionSelectivityEstimator::default_normal_cond_factor;
 else
 return rows * ConditionSelectivityEstimator::default_good_cond_factor;
@@ -166,11 +204,16 @@ void MergeTreeStatisticsFactory::registerValidator(StatisticsType stats_type, Va
 MergeTreeStatisticsFactory::MergeTreeStatisticsFactory()
 {
-registerValidator(StatisticsType::TDigest, TDigestValidator);
-registerCreator(StatisticsType::TDigest, TDigestCreator);
-registerValidator(StatisticsType::Uniq, UniqValidator);
-registerCreator(StatisticsType::Uniq, UniqCreator);
+registerValidator(StatisticsType::TDigest, tdigestValidator);
+registerCreator(StatisticsType::TDigest, tdigestCreator);
+registerValidator(StatisticsType::Uniq, uniqValidator);
+registerCreator(StatisticsType::Uniq, uniqCreator);
+#if USE_DATASKETCHES
+registerValidator(StatisticsType::CountMinSketch, countMinSketchValidator);
+registerCreator(StatisticsType::CountMinSketch, countMinSketchCreator);
+#endif
 }
 MergeTreeStatisticsFactory & MergeTreeStatisticsFactory::instance()
@@ -197,7 +240,7 @@ ColumnStatisticsPtr MergeTreeStatisticsFactory::get(const ColumnStatisticsDescri
 {
 auto it = creators.find(type);
 if (it == creators.end())
-throw Exception(ErrorCodes::INCORRECT_QUERY, "Unknown statistic type '{}'. Available types: 'tdigest' 'uniq'", type);
+throw Exception(ErrorCodes::INCORRECT_QUERY, "Unknown statistic type '{}'. Available types: 'tdigest' 'uniq' and 'count_min'", type);
 auto stat_ptr = (it->second)(desc, stats.data_type);
 column_stat->stats[type] = stat_ptr;
 }


@@ -1,6 +1,7 @@
 #pragma once
 #include <Core/Block.h>
+#include <Core/Field.h>
 #include <IO/ReadBuffer.h>
 #include <IO/WriteBuffer.h>
 #include <Storages/StatisticsDescription.h>
@@ -13,6 +14,14 @@ namespace DB
 constexpr auto STATS_FILE_PREFIX = "statistics_";
 constexpr auto STATS_FILE_SUFFIX = ".stats";
+struct StatisticsUtils
+{
+/// Returns std::nullopt if input Field cannot be converted to a concrete value
+static std::optional<Float64> tryConvertToFloat64(const Field & field);
+static std::optional<String> tryConvertToString(const Field & field);
+};
 /// Statistics describe properties of the values in the column,
 /// e.g. how many unique values exist,
 /// what are the N most frequent values,
@@ -34,8 +43,8 @@ public:
 /// Per-value estimations.
 /// Throws if the statistics object is not able to do a meaningful estimation.
-virtual Float64 estimateEqual(Float64 val) const; /// cardinality of val in the column
-virtual Float64 estimateLess(Float64 val) const; /// summarized cardinality of values < val in the column
+virtual Float64 estimateEqual(const Field & val) const; /// cardinality of val in the column
+virtual Float64 estimateLess(const Field & val) const; /// summarized cardinality of values < val in the column
 protected:
 SingleStatisticsDescription stat;
@@ -58,9 +67,9 @@ public:
 void update(const ColumnPtr & column);
-Float64 estimateLess(Float64 val) const;
-Float64 estimateGreater(Float64 val) const;
-Float64 estimateEqual(Float64 val) const;
+Float64 estimateLess(const Field & val) const;
+Float64 estimateGreater(const Field & val) const;
+Float64 estimateEqual(const Field & val) const;
 private:
 friend class MergeTreeStatisticsFactory;


@@ -0,0 +1,102 @@
+#include <Storages/Statistics/StatisticsCountMinSketch.h>
+#include <DataTypes/DataTypeLowCardinality.h>
+#include <DataTypes/DataTypeNullable.h>
+#include <IO/ReadHelpers.h>
+#include <IO/WriteHelpers.h>
+#include <Interpreters/convertFieldToType.h>
+#if USE_DATASKETCHES
+namespace DB
+{
+namespace ErrorCodes
+{
+extern const int LOGICAL_ERROR;
+extern const int ILLEGAL_STATISTICS;
+}
+/// Constants chosen based on rolling dices.
+/// The values provides:
+/// 1. an error tolerance of 0.1% (ε = 0.001)
+/// 2. a confidence level of 99.9% (δ = 0.001).
+/// And sketch the size is 152kb.
+static constexpr auto num_hashes = 7uz;
+static constexpr auto num_buckets = 2718uz;
+StatisticsCountMinSketch::StatisticsCountMinSketch(const SingleStatisticsDescription & stat_, DataTypePtr data_type_)
+: IStatistics(stat_)
+, sketch(num_hashes, num_buckets)
+, data_type(data_type_)
+{
+}
+Float64 StatisticsCountMinSketch::estimateEqual(const Field & val) const
+{
+/// Try to convert field to data_type. Converting string to proper data types such as: number, date, datetime, IPv4, Decimal etc.
+/// Return null if val larger than the range of data_type
+///
+/// For example: if data_type is Int32:
+/// 1. For 1.0, 1, '1', return Field(1)
+/// 2. For 1.1, max_value_int64, return null
+Field val_converted = convertFieldToType(val, *data_type);
+if (val_converted.isNull())
+return 0;
+if (data_type->isValueRepresentedByNumber())
+return sketch.get_estimate(&val_converted, data_type->getSizeOfValueInMemory());
+if (isStringOrFixedString(data_type))
+return sketch.get_estimate(val.get<String>());
+throw Exception(ErrorCodes::LOGICAL_ERROR, "Statistics 'count_min' does not support estimate data type of {}", data_type->getName());
+}
+void StatisticsCountMinSketch::update(const ColumnPtr & column)
+{
+for (size_t row = 0; row < column->size(); ++row)
+{
+if (column->isNullAt(row))
+continue;
+auto data = column->getDataAt(row);
+sketch.update(data.data, data.size, 1);
+}
+}
+void StatisticsCountMinSketch::serialize(WriteBuffer & buf)
+{
+Sketch::vector_bytes bytes = sketch.serialize();
+writeIntBinary(static_cast<UInt64>(bytes.size()), buf);
+buf.write(reinterpret_cast<const char *>(bytes.data()), bytes.size());
+}
+void StatisticsCountMinSketch::deserialize(ReadBuffer & buf)
+{
+UInt64 size;
+readIntBinary(size, buf);
+Sketch::vector_bytes bytes;
+bytes.resize(size); /// To avoid 'container-overflow' in AddressSanitizer checking
+buf.readStrict(reinterpret_cast<char *>(bytes.data()), size);
+sketch = Sketch::deserialize(bytes.data(), size);
+}
+void countMinSketchValidator(const SingleStatisticsDescription &, DataTypePtr data_type)
+{
+data_type = removeNullable(data_type);
+data_type = removeLowCardinalityAndNullable(data_type);
+if (!data_type->isValueRepresentedByNumber() && !isStringOrFixedString(data_type))
+throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'count_min' does not support type {}", data_type->getName());
+}
+StatisticsPtr countMinSketchCreator(const SingleStatisticsDescription & stat, DataTypePtr data_type)
+{
+return std::make_shared<StatisticsCountMinSketch>(stat, data_type);
+}
+}
+#endif
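As a quick check on the constants chosen in this new file, under the standard Count-Min sizing rules (width w ≈ e/ε and depth d = ⌈ln(1/δ)⌉, assumed here rather than stated in the diff): ε = δ = 0.001 gives w ≈ 2718 buckets and d = 7 hash functions, and with 64-bit counters the table occupies roughly 2718 × 7 × 8 bytes ≈ 152 KB, matching the size mentioned in the comment.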


@@ -0,0 +1,39 @@
+#pragma once
+#include <Storages/Statistics/Statistics.h>
+#include "config.h"
+#if USE_DATASKETCHES
+#include <count_min.hpp>
+namespace DB
+{
+class StatisticsCountMinSketch : public IStatistics
+{
+public:
+StatisticsCountMinSketch(const SingleStatisticsDescription & stat_, DataTypePtr data_type_);
+Float64 estimateEqual(const Field & val) const override;
+void update(const ColumnPtr & column) override;
+void serialize(WriteBuffer & buf) override;
+void deserialize(ReadBuffer & buf) override;
+private:
+using Sketch = datasketches::count_min_sketch<UInt64>;
+Sketch sketch;
+DataTypePtr data_type;
+};
+void countMinSketchValidator(const SingleStatisticsDescription &, DataTypePtr data_type);
+StatisticsPtr countMinSketchCreator(const SingleStatisticsDescription & stat, DataTypePtr);
+}
+#endif

View File

@ -1,11 +1,13 @@
#include <Storages/Statistics/StatisticsTDigest.h> #include <Storages/Statistics/StatisticsTDigest.h>
#include <DataTypes/DataTypeNullable.h> #include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeLowCardinality.h>
namespace DB namespace DB
{ {
namespace ErrorCodes namespace ErrorCodes
{ {
extern const int ILLEGAL_STATISTICS; extern const int ILLEGAL_STATISTICS;
extern const int LOGICAL_ERROR;
} }
StatisticsTDigest::StatisticsTDigest(const SingleStatisticsDescription & stat_) StatisticsTDigest::StatisticsTDigest(const SingleStatisticsDescription & stat_)
@ -16,12 +18,16 @@ StatisticsTDigest::StatisticsTDigest(const SingleStatisticsDescription & stat_)
void StatisticsTDigest::update(const ColumnPtr & column) void StatisticsTDigest::update(const ColumnPtr & column)
{ {
size_t rows = column->size(); size_t rows = column->size();
for (size_t row = 0; row < rows; ++row) for (size_t row = 0; row < rows; ++row)
{ {
/// TODO: support more types. Field field;
Float64 value = column->getFloat64(row); column->get(row, field);
t_digest.add(value, 1);
if (field.isNull())
continue;
if (auto field_as_float = StatisticsUtils::tryConvertToFloat64(field))
t_digest.add(*field_as_float, 1);
} }
} }
@ -35,24 +41,31 @@ void StatisticsTDigest::deserialize(ReadBuffer & buf)
t_digest.deserialize(buf); t_digest.deserialize(buf);
} }
Float64 StatisticsTDigest::estimateLess(Float64 val) const Float64 StatisticsTDigest::estimateLess(const Field & val) const
{ {
return t_digest.getCountLessThan(val); auto val_as_float = StatisticsUtils::tryConvertToFloat64(val);
if (val_as_float)
return t_digest.getCountLessThan(*val_as_float);
throw Exception(ErrorCodes::LOGICAL_ERROR, "Statistics 'tdigest' does not support estimating value of type {}", val.getTypeName());
} }
Float64 StatisticsTDigest::estimateEqual(Float64 val) const Float64 StatisticsTDigest::estimateEqual(const Field & val) const
{ {
return t_digest.getCountEqual(val); auto val_as_float = StatisticsUtils::tryConvertToFloat64(val);
if (val_as_float)
return t_digest.getCountEqual(*val_as_float);
throw Exception(ErrorCodes::LOGICAL_ERROR, "Statistics 'tdigest' does not support estimating value of type {}", val.getTypeName());
} }
void TDigestValidator(const SingleStatisticsDescription &, DataTypePtr data_type) void tdigestValidator(const SingleStatisticsDescription &, DataTypePtr data_type)
{ {
data_type = removeNullable(data_type); data_type = removeNullable(data_type);
data_type = removeLowCardinalityAndNullable(data_type);
if (!data_type->isValueRepresentedByNumber()) if (!data_type->isValueRepresentedByNumber())
throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'tdigest' do not support type {}", data_type->getName()); throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'tdigest' do not support type {}", data_type->getName());
} }
StatisticsPtr TDigestCreator(const SingleStatisticsDescription & stat, DataTypePtr) StatisticsPtr tdigestCreator(const SingleStatisticsDescription & stat, DataTypePtr)
{ {
return std::make_shared<StatisticsTDigest>(stat); return std::make_shared<StatisticsTDigest>(stat);
} }

View File

@ -16,14 +16,14 @@ public:
void serialize(WriteBuffer & buf) override; void serialize(WriteBuffer & buf) override;
void deserialize(ReadBuffer & buf) override; void deserialize(ReadBuffer & buf) override;
Float64 estimateLess(Float64 val) const override; Float64 estimateLess(const Field & val) const override;
Float64 estimateEqual(Float64 val) const override; Float64 estimateEqual(const Field & val) const override;
private: private:
QuantileTDigest<Float64> t_digest; QuantileTDigest<Float64> t_digest;
}; };
void TDigestValidator(const SingleStatisticsDescription &, DataTypePtr data_type); void tdigestValidator(const SingleStatisticsDescription &, DataTypePtr data_type);
StatisticsPtr TDigestCreator(const SingleStatisticsDescription & stat, DataTypePtr); StatisticsPtr tdigestCreator(const SingleStatisticsDescription & stat, DataTypePtr);
} }

View File

@ -1,6 +1,7 @@
#include <Storages/Statistics/StatisticsUniq.h> #include <Storages/Statistics/StatisticsUniq.h>
#include <DataTypes/DataTypesNumber.h> #include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeNullable.h> #include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeLowCardinality.h>
namespace DB namespace DB
{ {
@ -51,14 +52,15 @@ UInt64 StatisticsUniq::estimateCardinality() const
return column->getUInt(0); return column->getUInt(0);
} }
void UniqValidator(const SingleStatisticsDescription &, DataTypePtr data_type) void uniqValidator(const SingleStatisticsDescription &, DataTypePtr data_type)
{ {
data_type = removeNullable(data_type); data_type = removeNullable(data_type);
data_type = removeLowCardinalityAndNullable(data_type);
if (!data_type->isValueRepresentedByNumber()) if (!data_type->isValueRepresentedByNumber())
throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'uniq' do not support type {}", data_type->getName()); throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'uniq' do not support type {}", data_type->getName());
} }
StatisticsPtr UniqCreator(const SingleStatisticsDescription & stat, DataTypePtr data_type) StatisticsPtr uniqCreator(const SingleStatisticsDescription & stat, DataTypePtr data_type)
{ {
return std::make_shared<StatisticsUniq>(stat, data_type); return std::make_shared<StatisticsUniq>(stat, data_type);
} }

View File

@ -27,7 +27,7 @@ private:
}; };
void UniqValidator(const SingleStatisticsDescription &, DataTypePtr data_type); void uniqValidator(const SingleStatisticsDescription &, DataTypePtr data_type);
StatisticsPtr UniqCreator(const SingleStatisticsDescription & stat, DataTypePtr data_type); StatisticsPtr uniqCreator(const SingleStatisticsDescription & stat, DataTypePtr data_type);
} }

View File

@ -1,6 +1,10 @@
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <Storages/Statistics/StatisticsTDigest.h> #include <Storages/Statistics/StatisticsTDigest.h>
#include <Interpreters/convertFieldToType.h>
#include <DataTypes/DataTypeFactory.h>
using namespace DB;
TEST(Statistics, TDigestLessThan) TEST(Statistics, TDigestLessThan)
{ {
@ -39,6 +43,4 @@ TEST(Statistics, TDigestLessThan)
std::reverse(data.begin(), data.end()); std::reverse(data.begin(), data.end());
test_less_than(data, {-1, 1e9, 50000.0, 3000.0, 30.0}, {0, 100000, 50000, 3000, 30}, {0, 0, 0.001, 0.001, 0.001}); test_less_than(data, {-1, 1e9, 50000.0, 3000.0, 30.0}, {0, 100000, 50000, 3000, 30}, {0, 0, 0.001, 0.001, 0.001});
} }

View File

@ -1,19 +1,14 @@
#include <Storages/StatisticsDescription.h> #include <Storages/StatisticsDescription.h>
#include <base/defines.h>
#include <Parsers/ASTExpressionList.h> #include <Parsers/ASTExpressionList.h>
#include <Parsers/ASTFunction.h> #include <Parsers/ASTFunction.h>
#include <Parsers/ASTIdentifier.h> #include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTStatisticsDeclaration.h> #include <Parsers/ASTStatisticsDeclaration.h>
#include <Parsers/formatAST.h>
#include <Parsers/parseQuery.h>
#include <Parsers/queryToString.h> #include <Parsers/queryToString.h>
#include <Parsers/ParserCreateQuery.h> #include <Parsers/ParserCreateQuery.h>
#include <Poco/Logger.h> #include <Poco/Logger.h>
#include <Storages/extractKeyExpressionList.h>
#include <Storages/ColumnsDescription.h> #include <Storages/ColumnsDescription.h>
#include <Common/logger_useful.h>
namespace DB namespace DB
{ {
@ -54,7 +49,9 @@ static StatisticsType stringToStatisticsType(String type)
return StatisticsType::TDigest; return StatisticsType::TDigest;
if (type == "uniq") if (type == "uniq")
return StatisticsType::Uniq; return StatisticsType::Uniq;
throw Exception(ErrorCodes::INCORRECT_QUERY, "Unknown statistics type: {}. Supported statistics types are `tdigest` and `uniq`.", type); if (type == "count_min")
return StatisticsType::CountMinSketch;
throw Exception(ErrorCodes::INCORRECT_QUERY, "Unknown statistics type: {}. Supported statistics types are 'tdigest', 'uniq' and 'count_min'.", type);
} }
String SingleStatisticsDescription::getTypeName() const String SingleStatisticsDescription::getTypeName() const
@ -65,8 +62,10 @@ String SingleStatisticsDescription::getTypeName() const
return "TDigest"; return "TDigest";
case StatisticsType::Uniq: case StatisticsType::Uniq:
return "Uniq"; return "Uniq";
case StatisticsType::CountMinSketch:
return "count_min";
default: default:
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown statistics type: {}. Supported statistics types are `tdigest` and `uniq`.", type); throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown statistics type: {}. Supported statistics types are 'tdigest', 'uniq' and 'count_min'.", type);
} }
} }
@ -99,10 +98,9 @@ void ColumnStatisticsDescription::merge(const ColumnStatisticsDescription & othe
chassert(merging_column_type); chassert(merging_column_type);
if (column_name.empty()) if (column_name.empty())
{
column_name = merging_column_name; column_name = merging_column_name;
data_type = merging_column_type;
} data_type = merging_column_type;
for (const auto & [stats_type, stats_desc]: other.types_to_desc) for (const auto & [stats_type, stats_desc]: other.types_to_desc)
{ {
@ -121,6 +119,7 @@ void ColumnStatisticsDescription::assign(const ColumnStatisticsDescription & oth
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot assign statistics from column {} to {}", column_name, other.column_name); throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot assign statistics from column {} to {}", column_name, other.column_name);
types_to_desc = other.types_to_desc; types_to_desc = other.types_to_desc;
data_type = other.data_type;
} }
void ColumnStatisticsDescription::clear() void ColumnStatisticsDescription::clear()
@ -159,6 +158,7 @@ std::vector<ColumnStatisticsDescription> ColumnStatisticsDescription::fromAST(co
const auto & column = columns.getPhysical(physical_column_name); const auto & column = columns.getPhysical(physical_column_name);
stats.column_name = column.name; stats.column_name = column.name;
stats.data_type = column.type;
stats.types_to_desc = statistics_types; stats.types_to_desc = statistics_types;
result.push_back(stats); result.push_back(stats);
} }
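
With 'count_min' now recognized here, the new statistics type can be declared in DDL. A rough usage sketch in the style of the integration tests; the setting names and the settings argument of node.query are assumptions from memory, not taken from this commit:

node.query(
    "CREATE TABLE tab (key UInt64, value String STATISTICS(count_min)) "
    "ENGINE = MergeTree ORDER BY key",
    settings={"allow_experimental_statistics": 1},  # assumed experimental gate
)
node.query("INSERT INTO tab SELECT number, toString(number % 10) FROM numbers(1000)")
# Equality predicates on `value` can now be estimated with the count_min sketch.
node.query(
    "SELECT count() FROM tab WHERE value = '3'",
    settings={"allow_statistics_optimize": 1},  # assumed optimizer switch
)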

View File

@ -13,6 +13,7 @@ enum class StatisticsType : UInt8
{ {
TDigest = 0, TDigest = 0,
Uniq = 1, Uniq = 1,
CountMinSketch = 2,
Max = 63, Max = 63,
}; };

View File

@ -43,7 +43,6 @@
#include <Parsers/parseQuery.h> #include <Parsers/parseQuery.h>
#include <Parsers/IAST.h> #include <Parsers/IAST.h>
#include <Analyzer/Utils.h>
#include <Analyzer/ColumnNode.h> #include <Analyzer/ColumnNode.h>
#include <Analyzer/FunctionNode.h> #include <Analyzer/FunctionNode.h>
#include <Analyzer/TableNode.h> #include <Analyzer/TableNode.h>
@ -61,26 +60,20 @@
#include <Interpreters/ClusterProxy/SelectStreamFactory.h> #include <Interpreters/ClusterProxy/SelectStreamFactory.h>
#include <Interpreters/ClusterProxy/executeQuery.h> #include <Interpreters/ClusterProxy/executeQuery.h>
#include <Interpreters/Cluster.h> #include <Interpreters/Cluster.h>
#include <Interpreters/DatabaseAndTableWithAlias.h>
#include <Interpreters/ExpressionAnalyzer.h> #include <Interpreters/ExpressionAnalyzer.h>
#include <Interpreters/InterpreterSelectQuery.h> #include <Interpreters/InterpreterSelectQuery.h>
#include <Interpreters/InterpreterSelectQueryAnalyzer.h> #include <Interpreters/InterpreterSelectQueryAnalyzer.h>
#include <Interpreters/InterpreterInsertQuery.h> #include <Interpreters/InterpreterInsertQuery.h>
#include <Interpreters/JoinedTables.h> #include <Interpreters/JoinedTables.h>
#include <Interpreters/TranslateQualifiedNamesVisitor.h>
#include <Interpreters/AddDefaultDatabaseVisitor.h> #include <Interpreters/AddDefaultDatabaseVisitor.h>
#include <Interpreters/TreeRewriter.h> #include <Interpreters/TreeRewriter.h>
#include <Interpreters/Context.h> #include <Interpreters/Context.h>
#include <Interpreters/createBlockSelector.h> #include <Interpreters/createBlockSelector.h>
#include <Interpreters/evaluateConstantExpression.h> #include <Interpreters/evaluateConstantExpression.h>
#include <Interpreters/getClusterName.h> #include <Interpreters/getClusterName.h>
#include <Interpreters/getTableExpressions.h>
#include <Interpreters/RequiredSourceColumnsVisitor.h> #include <Interpreters/RequiredSourceColumnsVisitor.h>
#include <Interpreters/getCustomKeyFilterForParallelReplicas.h>
#include <Interpreters/getHeaderForProcessingStage.h> #include <Interpreters/getHeaderForProcessingStage.h>
#include <Functions/IFunction.h>
#include <Functions/FunctionFactory.h>
#include <TableFunctions/TableFunctionView.h> #include <TableFunctions/TableFunctionView.h>
#include <TableFunctions/TableFunctionFactory.h> #include <TableFunctions/TableFunctionFactory.h>
@ -90,7 +83,6 @@
#include <Processors/Executors/PushingPipelineExecutor.h> #include <Processors/Executors/PushingPipelineExecutor.h>
#include <Processors/Executors/CompletedPipelineExecutor.h> #include <Processors/Executors/CompletedPipelineExecutor.h>
#include <Processors/QueryPlan/QueryPlan.h> #include <Processors/QueryPlan/QueryPlan.h>
#include <Processors/QueryPlan/BuildQueryPipelineSettings.h>
#include <Processors/QueryPlan/ReadFromPreparedSource.h> #include <Processors/QueryPlan/ReadFromPreparedSource.h>
#include <Processors/QueryPlan/ExpressionStep.h> #include <Processors/QueryPlan/ExpressionStep.h>
#include <Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h> #include <Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h>
@ -496,7 +488,7 @@ QueryProcessingStage::Enum StorageDistributed::getQueryProcessingStage(
} }
std::optional<QueryProcessingStage::Enum> optimized_stage; std::optional<QueryProcessingStage::Enum> optimized_stage;
if (settings.allow_experimental_analyzer) if (query_info.query_tree)
optimized_stage = getOptimizedQueryProcessingStageAnalyzer(query_info, settings); optimized_stage = getOptimizedQueryProcessingStageAnalyzer(query_info, settings);
else else
optimized_stage = getOptimizedQueryProcessingStage(query_info, settings); optimized_stage = getOptimizedQueryProcessingStage(query_info, settings);
@ -860,31 +852,28 @@ void StorageDistributed::read(
modified_query_info.query = queryNodeToDistributedSelectQuery(query_tree_distributed); modified_query_info.query = queryNodeToDistributedSelectQuery(query_tree_distributed);
modified_query_info.query_tree = std::move(query_tree_distributed); modified_query_info.query_tree = std::move(query_tree_distributed);
/// Return directly (with correct header) if no shard to query.
if (modified_query_info.getCluster()->getShardsInfo().empty())
return;
} }
else else
{ {
header = InterpreterSelectQuery(modified_query_info.query, local_context, SelectQueryOptions(processed_stage).analyze()).getSampleBlock(); header = InterpreterSelectQuery(modified_query_info.query, local_context, SelectQueryOptions(processed_stage).analyze()).getSampleBlock();
}
if (!settings.allow_experimental_analyzer)
{
modified_query_info.query = ClusterProxy::rewriteSelectQuery( modified_query_info.query = ClusterProxy::rewriteSelectQuery(
local_context, modified_query_info.query, local_context, modified_query_info.query,
remote_database, remote_table, remote_table_function_ptr); remote_database, remote_table, remote_table_function_ptr);
}
/// Return directly (with correct header) if no shard to query. if (modified_query_info.getCluster()->getShardsInfo().empty())
if (modified_query_info.getCluster()->getShardsInfo().empty()) {
{ Pipe pipe(std::make_shared<NullSource>(header));
if (settings.allow_experimental_analyzer) auto read_from_pipe = std::make_unique<ReadFromPreparedSource>(std::move(pipe));
read_from_pipe->setStepDescription("Read from NullSource (Distributed)");
query_plan.addStep(std::move(read_from_pipe));
return; return;
}
Pipe pipe(std::make_shared<NullSource>(header));
auto read_from_pipe = std::make_unique<ReadFromPreparedSource>(std::move(pipe));
read_from_pipe->setStepDescription("Read from NullSource (Distributed)");
query_plan.addStep(std::move(read_from_pipe));
return;
} }
const auto & snapshot_data = assert_cast<const SnapshotData &>(*storage_snapshot->data); const auto & snapshot_data = assert_cast<const SnapshotData &>(*storage_snapshot->data);

View File

@ -427,7 +427,9 @@ namespace
{ {
for (const auto & path : paths) for (const auto & path : paths)
{ {
if (auto format_from_path = FormatFactory::instance().tryGetFormatFromFileName(path)) auto format_from_path = FormatFactory::instance().tryGetFormatFromFileName(path);
/// Use this format only if we have a schema reader for it.
if (format_from_path && FormatFactory::instance().checkIfFormatHasAnySchemaReader(*format_from_path))
{ {
format = format_from_path; format = format_from_path;
break; break;
@ -716,7 +718,9 @@ namespace
/// If format is unknown we can try to determine it by the file name. /// If format is unknown we can try to determine it by the file name.
if (!format) if (!format)
{ {
if (auto format_from_file = FormatFactory::instance().tryGetFormatFromFileName(*filename)) auto format_from_file = FormatFactory::instance().tryGetFormatFromFileName(*filename);
/// Use this format only if we have a schema reader for it.
if (format_from_file && FormatFactory::instance().checkIfFormatHasAnySchemaReader(*format_from_file))
format = format_from_file; format = format_from_file;
} }

View File

@ -505,18 +505,18 @@ Int64 StorageMergeTree::startMutation(const MutationCommands & commands, Context
additional_info = fmt::format(" (TID: {}; TIDH: {})", current_tid, current_tid.getHash()); additional_info = fmt::format(" (TID: {}; TIDH: {})", current_tid, current_tid.getHash());
} }
Int64 version; MergeTreeMutationEntry entry(commands, disk, relative_data_path, insert_increment.get(), current_tid, getContext()->getWriteSettings());
Int64 version = increment.get();
entry.commit(version);
String mutation_id = entry.file_name;
if (txn)
txn->addMutation(shared_from_this(), mutation_id);
bool alter_conversions_mutations_updated = updateAlterConversionsMutations(entry.commands, alter_conversions_mutations, /* remove= */ false);
{ {
std::lock_guard lock(currently_processing_in_background_mutex); std::lock_guard lock(currently_processing_in_background_mutex);
MergeTreeMutationEntry entry(commands, disk, relative_data_path, insert_increment.get(), current_tid, getContext()->getWriteSettings());
version = increment.get();
entry.commit(version);
String mutation_id = entry.file_name;
if (txn)
txn->addMutation(shared_from_this(), mutation_id);
bool alter_conversions_mutations_updated = updateAlterConversionsMutations(entry.commands, alter_conversions_mutations, /* remove= */ false);
bool inserted = current_mutations_by_version.try_emplace(version, std::move(entry)).second; bool inserted = current_mutations_by_version.try_emplace(version, std::move(entry)).second;
if (!inserted) if (!inserted)
{ {
@ -527,9 +527,9 @@ Int64 StorageMergeTree::startMutation(const MutationCommands & commands, Context
} }
throw Exception(ErrorCodes::LOGICAL_ERROR, "Mutation {} already exists, it's a bug", version); throw Exception(ErrorCodes::LOGICAL_ERROR, "Mutation {} already exists, it's a bug", version);
} }
LOG_INFO(log, "Added mutation: {}{}", mutation_id, additional_info);
} }
LOG_INFO(log, "Added mutation: {}{}", mutation_id, additional_info);
background_operations_assignee.trigger(); background_operations_assignee.trigger();
return version; return version;
} }

View File

@ -737,7 +737,9 @@ namespace
{ {
for (const auto & url : options) for (const auto & url : options)
{ {
if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName(url)) auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName(url);
/// Use this format only if we have a schema reader for it.
if (format_from_file_name && FormatFactory::instance().checkIfFormatHasAnySchemaReader(*format_from_file_name))
{ {
format = format_from_file_name; format = format_from_file_name;
break; break;

View File

@ -172,7 +172,7 @@ static ExpressionAndSets buildExpressionAndSets(ASTPtr & ast, const NamesAndType
/// with subqueries it's possible that new analyzer will be enabled in ::read method /// with subqueries it's possible that new analyzer will be enabled in ::read method
/// of underlying storage when all other parts of infra are not ready for it /// of underlying storage when all other parts of infra are not ready for it
/// (built with old analyzer). /// (built with old analyzer).
context_copy->setSetting("allow_experimental_analyzer", Field{0}); context_copy->setSetting("allow_experimental_analyzer", false);
auto syntax_analyzer_result = TreeRewriter(context_copy).analyze(ast, columns); auto syntax_analyzer_result = TreeRewriter(context_copy).analyze(ast, columns);
ExpressionAnalyzer analyzer(ast, syntax_analyzer_result, context_copy); ExpressionAnalyzer analyzer(ast, syntax_analyzer_result, context_copy);
auto dag = analyzer.getActionsDAG(false); auto dag = analyzer.getActionsDAG(false);

View File

@ -104,6 +104,9 @@ class GHActions:
res = json.load(json_file) res = json.load(json_file)
except json.JSONDecodeError as e: except json.JSONDecodeError as e:
print(f"ERROR: json decoder exception {e}") print(f"ERROR: json decoder exception {e}")
json_file.seek(0)
print(" File content:")
print(json_file.read())
return {} return {}
return res return res

View File

@ -21,7 +21,7 @@ from env_helper import (
TEMP_PATH, TEMP_PATH,
) )
from git_helper import Git from git_helper import Git
from pr_info import PRInfo, EventType from pr_info import PRInfo
from report import FAILURE, SUCCESS, JobReport, TestResult, TestResults from report import FAILURE, SUCCESS, JobReport, TestResult, TestResults
from stopwatch import Stopwatch from stopwatch import Stopwatch
from tee_popen import TeePopen from tee_popen import TeePopen
@ -375,25 +375,23 @@ def main():
tags = gen_tags(args.version, args.release_type) tags = gen_tags(args.version, args.release_type)
repo_urls = {} repo_urls = {}
direct_urls: Dict[str, List[str]] = {} direct_urls: Dict[str, List[str]] = {}
if pr_info.event_type == EventType.PULL_REQUEST:
release_or_pr = str(pr_info.number)
sha = pr_info.sha
elif pr_info.event_type == EventType.PUSH and pr_info.is_master:
release_or_pr = str(0)
sha = pr_info.sha
else:
release_or_pr = f"{args.version.major}.{args.version.minor}"
sha = args.sha
assert sha
for arch, build_name in zip(ARCH, ("package_release", "package_aarch64")): for arch, build_name in zip(ARCH, ("package_release", "package_aarch64")):
if not args.bucket_prefix: if args.bucket_prefix:
assert not args.allow_build_reuse
repo_urls[arch] = f"{args.bucket_prefix}/{build_name}"
elif args.sha:
# CreateRelease workflow only. TODO
version = args.version
repo_urls[arch] = ( repo_urls[arch] = (
f"{S3_DOWNLOAD}/{S3_BUILDS_BUCKET}/" f"{S3_DOWNLOAD}/{S3_BUILDS_BUCKET}/"
f"{release_or_pr}/{sha}/{build_name}" f"{version.major}.{version.minor}/{args.sha}/{build_name}"
) )
else: else:
repo_urls[arch] = f"{args.bucket_prefix}/{build_name}" # In all other cases urls must be fetched from build reports. TODO: script needs refactoring
repo_urls[arch] = ""
assert args.allow_build_reuse
if args.allow_build_reuse: if args.allow_build_reuse:
# read s3 urls from pre-downloaded build reports # read s3 urls from pre-downloaded build reports
if "clickhouse-server" in image_repo: if "clickhouse-server" in image_repo:
@ -431,7 +429,6 @@ def main():
) )
if test_results[-1].status != "OK": if test_results[-1].status != "OK":
status = FAILURE status = FAILURE
pr_info = pr_info or PRInfo()
description = f"Processed tags: {', '.join(tags)}" description = f"Processed tags: {', '.join(tags)}"
JobReport( JobReport(

View File

@ -52,6 +52,7 @@ from helpers.client import QueryRuntimeException
import docker import docker
from .client import Client from .client import Client
from .retry_decorator import retry
from .config_cluster import * from .config_cluster import *
@ -2690,15 +2691,12 @@ class ClickHouseCluster:
images_pull_cmd = self.base_cmd + ["pull"] images_pull_cmd = self.base_cmd + ["pull"]
# sometimes dockerhub/proxy can be flaky # sometimes dockerhub/proxy can be flaky
for i in range(5):
try: retry(
run_and_check(images_pull_cmd) log_function=lambda exception: logging.info(
break "Got exception pulling images: %s", exception
except Exception as ex: ),
if i == 4: )(run_and_check)(images_pull_cmd)
raise ex
logging.info("Got exception pulling images: %s", ex)
time.sleep(i * 3)
if self.with_zookeeper_secure and self.base_zookeeper_cmd: if self.with_zookeeper_secure and self.base_zookeeper_cmd:
logging.debug("Setup ZooKeeper Secure") logging.debug("Setup ZooKeeper Secure")
@ -2971,7 +2969,11 @@ class ClickHouseCluster:
"Trying to create Azurite instance by command %s", "Trying to create Azurite instance by command %s",
" ".join(map(str, azurite_start_cmd)), " ".join(map(str, azurite_start_cmd)),
) )
run_and_check(azurite_start_cmd) retry(
log_function=lambda exception: logging.info(
f"Azurite initialization failed with error: {exception}"
),
)(run_and_check)(azurite_start_cmd)
self.up_called = True self.up_called = True
logging.info("Trying to connect to Azurite") logging.info("Trying to connect to Azurite")
self.wait_azurite_to_start() self.wait_azurite_to_start()

View File

@ -0,0 +1,36 @@
import time
import random
from typing import Type, List
def retry(
retries: int = 5,
delay: float = 1,
backoff: float = 1.5,
jitter: float = 2,
log_function=lambda *args, **kwargs: None,
retriable_expections_list: List[Type[BaseException]] = [Exception],
):
def inner(func):
def wrapper(*args, **kwargs):
current_delay = delay
for retry in range(retries):
try:
func(*args, **kwargs)
break
except Exception as e:
should_retry = False
for retriable_exception in retriable_expections_list:
if isinstance(e, retriable_exception):
should_retry = True
break
if not should_retry or (retry == retries - 1):
raise e
log_function(retry=retry, exception=e)
sleep_time = current_delay + random.uniform(0, jitter)
time.sleep(sleep_time)
current_delay *= backoff
return wrapper
return inner
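
A brief usage sketch for the helper above; flaky_operation is a made-up stand-in and the import path is assumed to be the module added in this commit:

from retry_decorator import retry  # import path assumed

attempts = {"n": 0}

def flaky_operation():
    # Fails on the first two calls, then succeeds, to exercise the retry loop.
    attempts["n"] += 1
    if attempts["n"] < 3:
        raise ConnectionError("transient failure")

# Decorator style: up to 3 attempts with backoff; the log callback receives
# the attempt index and the exception as keyword arguments.
@retry(
    retries=3,
    delay=0.5,
    log_function=lambda retry, exception: print(f"attempt {retry} failed: {exception}"),
    retriable_expections_list=[ConnectionError],
)
def do_work():
    flaky_operation()

do_work()

# Inline style, as cluster.py uses it for image pulls and Azurite startup:
retry(log_function=lambda retry, exception: print(f"attempt {retry}: {exception}"))(
    flaky_operation
)()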

View File

@ -1,7 +1,7 @@
<clickhouse> <clickhouse>
<query_cache> <query_cache>
<max_entries>1</max_entries> <max_entries>0</max_entries>
</query_cache> </query_cache>
</clickhouse> </clickhouse>

View File

@ -94,54 +94,61 @@ CONFIG_DIR = os.path.join(SCRIPT_DIR, "configs")
def test_query_cache_size_is_runtime_configurable(start_cluster): def test_query_cache_size_is_runtime_configurable(start_cluster):
# the initial config specifies the maximum query cache size as 2, run 3 queries, expect 2 cache entries
node.query("SYSTEM DROP QUERY CACHE") node.query("SYSTEM DROP QUERY CACHE")
# The initial config allows at most two query cache entries but we don't mind
node.query("SELECT 1 SETTINGS use_query_cache = 1, query_cache_ttl = 1") node.query("SELECT 1 SETTINGS use_query_cache = 1, query_cache_ttl = 1")
node.query("SELECT 2 SETTINGS use_query_cache = 1, query_cache_ttl = 1")
node.query("SELECT 3 SETTINGS use_query_cache = 1, query_cache_ttl = 1")
time.sleep(2) time.sleep(2)
node.query("SYSTEM RELOAD ASYNCHRONOUS METRICS") # At this point, the query cache contains one entry and it is stale
res = node.query(
"SELECT value FROM system.asynchronous_metrics WHERE metric = 'QueryCacheEntries'",
)
assert res == "2\n"
# switch to a config with a maximum query cache size of 1 res = node.query(
"SELECT count(*) FROM system.query_cache",
)
assert res == "1\n"
# switch to a config with a maximum query cache size of _0_
node.copy_file_to_container( node.copy_file_to_container(
os.path.join(CONFIG_DIR, "smaller_query_cache.xml"), os.path.join(CONFIG_DIR, "empty_query_cache.xml"),
"/etc/clickhouse-server/config.d/default.xml", "/etc/clickhouse-server/config.d/default.xml",
) )
node.query("SYSTEM RELOAD CONFIG") node.query("SYSTEM RELOAD CONFIG")
# check that eviction worked as expected
time.sleep(2)
node.query("SYSTEM RELOAD ASYNCHRONOUS METRICS")
res = node.query( res = node.query(
"SELECT value FROM system.asynchronous_metrics WHERE metric = 'QueryCacheEntries'", "SELECT count(*) FROM system.query_cache",
)
assert (
res == "2\n"
) # "Why not 1?", you think. Reason is that QC uses the TTLCachePolicy that evicts lazily only upon insert.
# Not a real issue, can be changed later, at least there's a test now.
# Also, you may also wonder "why query_cache_ttl = 1"? Reason is that TTLCachePolicy only removes *stale* entries. With the default TTL
# (60 sec), no entries would be removed at all. Again: not a real issue, can be changed later and there's at least a test now.
# check that the new query cache maximum size is respected when more queries run
node.query("SELECT 4 SETTINGS use_query_cache = 1, query_cache_ttl = 1")
node.query("SELECT 5 SETTINGS use_query_cache = 1, query_cache_ttl = 1")
time.sleep(2)
node.query("SYSTEM RELOAD ASYNCHRONOUS METRICS")
res = node.query(
"SELECT value FROM system.asynchronous_metrics WHERE metric = 'QueryCacheEntries'",
) )
assert res == "1\n" assert res == "1\n"
# "Why not 0?", I hear you say. Reason is that QC uses the TTLCachePolicy that evicts lazily only upon insert.
# Not a real issue, can be changed later, at least there's a test now.
# restore the original config # The next SELECT will find a single stale entry which is one entry too much according to the new config.
# This triggers the eviction of all stale entries, in this case the 'SELECT 1' result.
# Then, it tries to insert the 'SELECT 2' result but it also cannot be added according to the config.
node.query("SELECT 2 SETTINGS use_query_cache = 1, query_cache_ttl = 1")
res = node.query(
"SELECT count(*) FROM system.query_cache",
)
assert res == "0\n"
# The new maximum cache size is respected when more queries run
node.query("SELECT 3 SETTINGS use_query_cache = 1, query_cache_ttl = 1")
res = node.query(
"SELECT count(*) FROM system.query_cache",
)
assert res == "0\n"
# Restore the original config
node.copy_file_to_container( node.copy_file_to_container(
os.path.join(CONFIG_DIR, "default.xml"), os.path.join(CONFIG_DIR, "default.xml"),
"/etc/clickhouse-server/config.d/default.xml", "/etc/clickhouse-server/config.d/default.xml",
) )
node.query("SYSTEM RELOAD CONFIG")
# It is possible to insert entries again
node.query("SELECT 4 SETTINGS use_query_cache = 1, query_cache_ttl = 1")
res = node.query(
"SELECT count(*) FROM system.query_cache",
)
assert res == "1\n"

View File

@ -154,6 +154,7 @@ def test_put(started_cluster, maybe_auth, positive, compression):
def test_partition_by(started_cluster): def test_partition_by(started_cluster):
id = uuid.uuid4()
bucket = started_cluster.minio_bucket bucket = started_cluster.minio_bucket
instance = started_cluster.instances["dummy"] # type: ClickHouseInstance instance = started_cluster.instances["dummy"] # type: ClickHouseInstance
table_format = "column1 UInt32, column2 UInt32, column3 UInt32" table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
@ -161,26 +162,37 @@ def test_partition_by(started_cluster):
values = "(1, 2, 3), (3, 2, 1), (78, 43, 45)" values = "(1, 2, 3), (3, 2, 1), (78, 43, 45)"
filename = "test_{_partition_id}.csv" filename = "test_{_partition_id}.csv"
put_query = f"""INSERT INTO TABLE FUNCTION put_query = f"""INSERT INTO TABLE FUNCTION
s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'CSV', '{table_format}') s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{filename}', 'CSV', '{table_format}')
PARTITION BY {partition_by} VALUES {values}""" PARTITION BY {partition_by} VALUES {values}"""
run_query(instance, put_query) run_query(instance, put_query)
assert "1,2,3\n" == get_s3_file_content(started_cluster, bucket, "test_3.csv") assert "1,2,3\n" == get_s3_file_content(started_cluster, bucket, f"{id}/test_3.csv")
assert "3,2,1\n" == get_s3_file_content(started_cluster, bucket, "test_1.csv") assert "3,2,1\n" == get_s3_file_content(started_cluster, bucket, f"{id}/test_1.csv")
assert "78,43,45\n" == get_s3_file_content(started_cluster, bucket, "test_45.csv") assert "78,43,45\n" == get_s3_file_content(
started_cluster, bucket, f"{id}/test_45.csv"
)
filename = "test2_{_partition_id}.csv" filename = "test2_{_partition_id}.csv"
instance.query( instance.query(
f"create table p ({table_format}) engine=S3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'CSV') partition by column3" f"create table p ({table_format}) engine=S3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{filename}', 'CSV') partition by column3"
) )
instance.query(f"insert into p values {values}") instance.query(f"insert into p values {values}")
assert "1,2,3\n" == get_s3_file_content(started_cluster, bucket, "test2_3.csv") assert "1,2,3\n" == get_s3_file_content(
assert "3,2,1\n" == get_s3_file_content(started_cluster, bucket, "test2_1.csv") started_cluster, bucket, f"{id}/test2_3.csv"
assert "78,43,45\n" == get_s3_file_content(started_cluster, bucket, "test2_45.csv") )
assert "3,2,1\n" == get_s3_file_content(
started_cluster, bucket, f"{id}/test2_1.csv"
)
assert "78,43,45\n" == get_s3_file_content(
started_cluster, bucket, f"{id}/test2_45.csv"
)
instance.query("drop table p")
def test_partition_by_string_column(started_cluster): def test_partition_by_string_column(started_cluster):
id = uuid.uuid4()
bucket = started_cluster.minio_bucket bucket = started_cluster.minio_bucket
instance = started_cluster.instances["dummy"] # type: ClickHouseInstance instance = started_cluster.instances["dummy"] # type: ClickHouseInstance
table_format = "col_num UInt32, col_str String" table_format = "col_num UInt32, col_str String"
@ -188,21 +200,24 @@ def test_partition_by_string_column(started_cluster):
values = "(1, 'foo/bar'), (3, 'йцук'), (78, '你好')" values = "(1, 'foo/bar'), (3, 'йцук'), (78, '你好')"
filename = "test_{_partition_id}.csv" filename = "test_{_partition_id}.csv"
put_query = f"""INSERT INTO TABLE FUNCTION put_query = f"""INSERT INTO TABLE FUNCTION
s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'CSV', '{table_format}') s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{filename}', 'CSV', '{table_format}')
PARTITION BY {partition_by} VALUES {values}""" PARTITION BY {partition_by} VALUES {values}"""
run_query(instance, put_query) run_query(instance, put_query)
assert '1,"foo/bar"\n' == get_s3_file_content( assert '1,"foo/bar"\n' == get_s3_file_content(
started_cluster, bucket, "test_foo/bar.csv" started_cluster, bucket, f"{id}/test_foo/bar.csv"
)
assert '3,"йцук"\n' == get_s3_file_content(
started_cluster, bucket, f"{id}/test_йцук.csv"
) )
assert '3,"йцук"\n' == get_s3_file_content(started_cluster, bucket, "test_йцук.csv")
assert '78,"你好"\n' == get_s3_file_content( assert '78,"你好"\n' == get_s3_file_content(
started_cluster, bucket, "test_你好.csv" started_cluster, bucket, f"{id}/test_你好.csv"
) )
def test_partition_by_const_column(started_cluster): def test_partition_by_const_column(started_cluster):
id = uuid.uuid4()
bucket = started_cluster.minio_bucket bucket = started_cluster.minio_bucket
instance = started_cluster.instances["dummy"] # type: ClickHouseInstance instance = started_cluster.instances["dummy"] # type: ClickHouseInstance
table_format = "column1 UInt32, column2 UInt32, column3 UInt32" table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
@ -211,12 +226,14 @@ def test_partition_by_const_column(started_cluster):
values_csv = "1,2,3\n3,2,1\n78,43,45\n" values_csv = "1,2,3\n3,2,1\n78,43,45\n"
filename = "test_{_partition_id}.csv" filename = "test_{_partition_id}.csv"
put_query = f"""INSERT INTO TABLE FUNCTION put_query = f"""INSERT INTO TABLE FUNCTION
s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'CSV', '{table_format}') s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{filename}', 'CSV', '{table_format}')
PARTITION BY {partition_by} VALUES {values}""" PARTITION BY {partition_by} VALUES {values}"""
run_query(instance, put_query) run_query(instance, put_query)
assert values_csv == get_s3_file_content(started_cluster, bucket, "test_88.csv") assert values_csv == get_s3_file_content(
started_cluster, bucket, f"{id}/test_88.csv"
)
@pytest.mark.parametrize("special", ["space", "plus"]) @pytest.mark.parametrize("special", ["space", "plus"])
@ -276,46 +293,31 @@ def test_get_path_with_special(started_cluster, special):
@pytest.mark.parametrize("auth", [pytest.param("'minio','minio123',", id="minio")]) @pytest.mark.parametrize("auth", [pytest.param("'minio','minio123',", id="minio")])
def test_empty_put(started_cluster, auth): def test_empty_put(started_cluster, auth):
# type: (ClickHouseCluster, str) -> None # type: (ClickHouseCluster, str) -> None
id = uuid.uuid4()
bucket = started_cluster.minio_bucket bucket = started_cluster.minio_bucket
instance = started_cluster.instances["dummy"] # type: ClickHouseInstance instance = started_cluster.instances["dummy"] # type: ClickHouseInstance
table_format = "column1 UInt32, column2 UInt32, column3 UInt32" table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
drop_empty_table_query = "DROP TABLE IF EXISTS empty_table" drop_empty_table_query = "DROP TABLE IF EXISTS empty_table"
create_empty_table_query = """ create_empty_table_query = (
CREATE TABLE empty_table ( f"CREATE TABLE empty_table ({table_format}) ENGINE = Null()"
{}
) ENGINE = Null()
""".format(
table_format
) )
run_query(instance, drop_empty_table_query) run_query(instance, drop_empty_table_query)
run_query(instance, create_empty_table_query) run_query(instance, create_empty_table_query)
filename = "empty_put_test.csv" filename = "empty_put_test.csv"
put_query = "insert into table function s3('http://{}:{}/{}/{}', {}'CSV', '{}') select * from empty_table".format( put_query = f"""insert into table function
started_cluster.minio_ip, s3('http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{bucket}/{id}/{filename}', {auth} 'CSV', '{table_format}')
MINIO_INTERNAL_PORT, select * from empty_table"""
bucket,
filename,
auth,
table_format,
)
run_query(instance, put_query) run_query(instance, put_query)
assert ( assert (
run_query( run_query(
instance, instance,
"select count(*) from s3('http://{}:{}/{}/{}', {}'CSV', '{}')".format( f"""select count(*) from
started_cluster.minio_ip, s3('http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{bucket}/{id}/{filename}', {auth} 'CSV', '{table_format}')""",
MINIO_INTERNAL_PORT,
bucket,
filename,
auth,
table_format,
),
) )
== "0\n" == "0\n"
) )
@ -499,6 +501,7 @@ def test_put_get_with_globs(started_cluster):
def test_multipart(started_cluster, maybe_auth, positive): def test_multipart(started_cluster, maybe_auth, positive):
# type: (ClickHouseCluster, str, bool) -> None # type: (ClickHouseCluster, str, bool) -> None
id = uuid.uuid4()
bucket = ( bucket = (
started_cluster.minio_bucket started_cluster.minio_bucket
if not maybe_auth if not maybe_auth
@ -521,7 +524,7 @@ def test_multipart(started_cluster, maybe_auth, positive):
assert len(csv_data) > min_part_size_bytes assert len(csv_data) > min_part_size_bytes
filename = "test_multipart.csv" filename = f"{id}/test_multipart.csv"
put_query = "insert into table function s3('http://{}:{}/{}/{}', {}'CSV', '{}') format CSV".format( put_query = "insert into table function s3('http://{}:{}/{}/{}', {}'CSV', '{}') format CSV".format(
started_cluster.minio_redirect_host, started_cluster.minio_redirect_host,
started_cluster.minio_redirect_port, started_cluster.minio_redirect_port,
@ -693,7 +696,7 @@ def test_s3_glob_many_objects_under_selection(started_cluster):
def create_files(thread_num): def create_files(thread_num):
for f_num in range(thread_num * 63, thread_num * 63 + 63): for f_num in range(thread_num * 63, thread_num * 63 + 63):
path = f"folder1/file{f_num}.csv" path = f"folder1/file{f_num}.csv"
query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format( query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') settings s3_truncate_on_insert=1 values {}".format(
started_cluster.minio_ip, started_cluster.minio_ip,
MINIO_INTERNAL_PORT, MINIO_INTERNAL_PORT,
bucket, bucket,
@ -706,7 +709,7 @@ def test_s3_glob_many_objects_under_selection(started_cluster):
jobs.append(threading.Thread(target=create_files, args=(thread_num,))) jobs.append(threading.Thread(target=create_files, args=(thread_num,)))
jobs[-1].start() jobs[-1].start()
query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format( query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') settings s3_truncate_on_insert=1 values {}".format(
started_cluster.minio_ip, started_cluster.minio_ip,
MINIO_INTERNAL_PORT, MINIO_INTERNAL_PORT,
bucket, bucket,
@ -881,7 +884,7 @@ def test_storage_s3_get_unstable(started_cluster):
bucket = started_cluster.minio_bucket bucket = started_cluster.minio_bucket
instance = started_cluster.instances["dummy"] instance = started_cluster.instances["dummy"]
table_format = "column1 Int64, column2 Int64, column3 Int64, column4 Int64" table_format = "column1 Int64, column2 Int64, column3 Int64, column4 Int64"
get_query = f"SELECT count(), sum(column3), sum(column4) FROM s3('http://resolver:8081/{started_cluster.minio_bucket}/test.csv', 'CSV', '{table_format}') FORMAT CSV" get_query = f"SELECT count(), sum(column3), sum(column4) FROM s3('http://resolver:8081/{started_cluster.minio_bucket}/test.csv', 'CSV', '{table_format}') SETTINGS s3_max_single_read_retries=30 FORMAT CSV"
result = run_query(instance, get_query) result = run_query(instance, get_query)
assert result.splitlines() == ["500001,500000,0"] assert result.splitlines() == ["500001,500000,0"]
@ -896,9 +899,10 @@ def test_storage_s3_get_slow(started_cluster):
def test_storage_s3_put_uncompressed(started_cluster): def test_storage_s3_put_uncompressed(started_cluster):
id = uuid.uuid4()
bucket = started_cluster.minio_bucket bucket = started_cluster.minio_bucket
instance = started_cluster.instances["dummy"] instance = started_cluster.instances["dummy"]
filename = "test_put_uncompressed.bin" filename = f"{id}/test_put_uncompressed.bin"
name = "test_put_uncompressed" name = "test_put_uncompressed"
data = [ data = [
"'Gloria Thompson',99", "'Gloria Thompson',99",
@ -950,6 +954,7 @@ def test_storage_s3_put_uncompressed(started_cluster):
r = result.strip().split("\t") r = result.strip().split("\t")
assert int(r[0]) >= 1, blob_storage_log assert int(r[0]) >= 1, blob_storage_log
assert all(col == r[0] for col in r), blob_storage_log assert all(col == r[0] for col in r), blob_storage_log
run_query(instance, f"DROP TABLE {name}")
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -957,9 +962,10 @@ def test_storage_s3_put_uncompressed(started_cluster):
[pytest.param("bin", "gzip", id="bin"), pytest.param("gz", "auto", id="gz")], [pytest.param("bin", "gzip", id="bin"), pytest.param("gz", "auto", id="gz")],
) )
def test_storage_s3_put_gzip(started_cluster, extension, method): def test_storage_s3_put_gzip(started_cluster, extension, method):
id = uuid.uuid4()
bucket = started_cluster.minio_bucket bucket = started_cluster.minio_bucket
instance = started_cluster.instances["dummy"] instance = started_cluster.instances["dummy"]
filename = f"test_put_gzip.{extension}" filename = f"{id}/test_put_gzip.{extension}"
name = f"test_put_gzip_{extension}" name = f"test_put_gzip_{extension}"
data = [ data = [
"'Joseph Tomlinson',5", "'Joseph Tomlinson',5",
@ -996,6 +1002,7 @@ def test_storage_s3_put_gzip(started_cluster, extension, method):
f = gzip.GzipFile(fileobj=buf, mode="rb") f = gzip.GzipFile(fileobj=buf, mode="rb")
uncompressed_content = f.read().decode() uncompressed_content = f.read().decode()
assert sum([int(i.split(",")[1]) for i in uncompressed_content.splitlines()]) == 708 assert sum([int(i.split(",")[1]) for i in uncompressed_content.splitlines()]) == 708
run_query(instance, f"DROP TABLE {name}")
def test_truncate_table(started_cluster): def test_truncate_table(started_cluster):
@ -1021,14 +1028,24 @@ def test_truncate_table(started_cluster):
len(list(minio.list_objects(started_cluster.minio_bucket, "truncate/"))) len(list(minio.list_objects(started_cluster.minio_bucket, "truncate/")))
== 0 == 0
): ):
return break
timeout -= 1 timeout -= 1
time.sleep(1) time.sleep(1)
assert len(list(minio.list_objects(started_cluster.minio_bucket, "truncate/"))) == 0 assert len(list(minio.list_objects(started_cluster.minio_bucket, "truncate/"))) == 0
assert instance.query("SELECT * FROM {}".format(name)) == "" # FIXME: there was a bug in test and it was never checked.
# Currently read from truncated table fails with
# DB::Exception: Failed to get object info: No response body..
# HTTP response code: 404: while reading truncate: While executing S3Source
# assert instance.query("SELECT * FROM {}".format(name)) == ""
instance.query(f"DROP TABLE {name} SYNC")
assert (
instance.query(f"SELECT count() FROM system.tables where name='{name}'")
== "0\n"
)
def test_predefined_connection_configuration(started_cluster): def test_predefined_connection_configuration(started_cluster):
id = uuid.uuid4()
bucket = started_cluster.minio_bucket bucket = started_cluster.minio_bucket
instance = started_cluster.instances[ instance = started_cluster.instances[
"dummy_without_named_collections" "dummy_without_named_collections"
@ -1056,7 +1073,9 @@ def test_predefined_connection_configuration(started_cluster):
user="user", user="user",
) )
instance.query(f"INSERT INTO {name} SELECT number FROM numbers(10)") instance.query(
f"INSERT INTO {name} SELECT number FROM numbers(10) SETTINGS s3_truncate_on_insert=1"
)
result = instance.query(f"SELECT * FROM {name}") result = instance.query(f"SELECT * FROM {name}")
assert result == instance.query("SELECT number FROM numbers(10)") assert result == instance.query("SELECT number FROM numbers(10)")
@ -1070,9 +1089,11 @@ def test_predefined_connection_configuration(started_cluster):
"To execute this query, it's necessary to have the grant NAMED COLLECTION ON no_collection" "To execute this query, it's necessary to have the grant NAMED COLLECTION ON no_collection"
in error in error
) )
instance = started_cluster.instances["dummy"] # has named collection access instance2 = started_cluster.instances["dummy"] # has named collection access
error = instance.query_and_get_error("SELECT * FROM s3(no_collection)") error = instance2.query_and_get_error("SELECT * FROM s3(no_collection)")
assert "There is no named collection `no_collection`" in error assert "There is no named collection `no_collection`" in error
instance.query("DROP USER user")
instance.query(f"DROP TABLE {name}")
result = "" result = ""
@ -1222,7 +1243,7 @@ def test_s3_schema_inference(started_cluster):
instance = started_cluster.instances["dummy"] instance = started_cluster.instances["dummy"]
instance.query( instance.query(
f"insert into table function s3(s3_native, structure='a Int32, b String', format='Native') select number, randomString(100) from numbers(5000000)" f"insert into table function s3(s3_native, structure='a Int32, b String', format='Native') select number, randomString(100) from numbers(5000000) SETTINGS s3_truncate_on_insert=1"
) )
result = instance.query(f"desc s3(s3_native, format='Native')") result = instance.query(f"desc s3(s3_native, format='Native')")
assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n" assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n"
@ -1262,6 +1283,9 @@ def test_s3_schema_inference(started_cluster):
result = instance.query(f"select count(*) from {table_function}") result = instance.query(f"select count(*) from {table_function}")
assert int(result) == 5000000 assert int(result) == 5000000
instance.query("drop table schema_inference")
instance.query("drop table schema_inference_2")
def test_empty_file(started_cluster): def test_empty_file(started_cluster):
bucket = started_cluster.minio_bucket bucket = started_cluster.minio_bucket
@ -1297,6 +1321,7 @@ def test_overwrite(started_cluster):
result = instance.query(f"select count() from test_overwrite") result = instance.query(f"select count() from test_overwrite")
assert int(result) == 200 assert int(result) == 200
instance.query(f"drop table test_overwrite")
def test_create_new_files_on_insert(started_cluster): def test_create_new_files_on_insert(started_cluster):
@ -1338,6 +1363,7 @@ def test_create_new_files_on_insert(started_cluster):
result = instance.query(f"select count() from test_multiple_inserts") result = instance.query(f"select count() from test_multiple_inserts")
assert int(result) == 60 assert int(result) == 60
instance.query("drop table test_multiple_inserts")
def test_format_detection(started_cluster): def test_format_detection(started_cluster):
@ -1345,7 +1371,9 @@ def test_format_detection(started_cluster):
instance = started_cluster.instances["dummy"] instance = started_cluster.instances["dummy"]
instance.query(f"create table arrow_table_s3 (x UInt64) engine=S3(s3_arrow)") instance.query(f"create table arrow_table_s3 (x UInt64) engine=S3(s3_arrow)")
instance.query(f"insert into arrow_table_s3 select 1") instance.query(
f"insert into arrow_table_s3 select 1 settings s3_truncate_on_insert=1"
)
result = instance.query(f"select * from s3(s3_arrow)") result = instance.query(f"select * from s3(s3_arrow)")
assert int(result) == 1 assert int(result) == 1
@ -1360,7 +1388,9 @@ def test_format_detection(started_cluster):
assert int(result) == 1 assert int(result) == 1
instance.query(f"create table parquet_table_s3 (x UInt64) engine=S3(s3_parquet2)") instance.query(f"create table parquet_table_s3 (x UInt64) engine=S3(s3_parquet2)")
instance.query(f"insert into parquet_table_s3 select 1") instance.query(
f"insert into parquet_table_s3 select 1 settings s3_truncate_on_insert=1"
)
result = instance.query(f"select * from s3(s3_parquet2)") result = instance.query(f"select * from s3(s3_parquet2)")
assert int(result) == 1 assert int(result) == 1
@ -1373,64 +1403,67 @@ def test_format_detection(started_cluster):
f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test.parquet')" f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test.parquet')"
) )
assert int(result) == 1 assert int(result) == 1
instance.query(f"drop table arrow_table_s3")
instance.query(f"drop table parquet_table_s3")
def test_schema_inference_from_globs(started_cluster): def test_schema_inference_from_globs(started_cluster):
id = uuid.uuid4()
bucket = started_cluster.minio_bucket bucket = started_cluster.minio_bucket
instance = started_cluster.instances["dummy"] instance = started_cluster.instances["dummy"]
instance.query( instance.query(
f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test1.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select NULL" f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test1.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select NULL"
) )
instance.query( instance.query(
f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test2.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select 0" f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test2.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select 0"
) )
url_filename = "test{1,2}.jsoncompacteachrow" url_filename = "test{1,2}.jsoncompacteachrow"
result = instance.query( result = instance.query(
f"desc url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{url_filename}') settings input_format_json_infer_incomplete_types_as_strings=0" f"desc url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{url_filename}') settings input_format_json_infer_incomplete_types_as_strings=0"
) )
assert result.strip() == "c1\tNullable(Int64)" assert result.strip() == "c1\tNullable(Int64)"
result = instance.query( result = instance.query(
f"select * from url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{url_filename}') settings input_format_json_infer_incomplete_types_as_strings=0" f"select * from url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{url_filename}') settings input_format_json_infer_incomplete_types_as_strings=0"
) )
assert sorted(result.split()) == ["0", "\\N"] assert sorted(result.split()) == ["0", "\\N"]
result = instance.query( result = instance.query(
f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test*.jsoncompacteachrow') settings input_format_json_infer_incomplete_types_as_strings=0" f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test*.jsoncompacteachrow') settings input_format_json_infer_incomplete_types_as_strings=0"
) )
assert result.strip() == "c1\tNullable(Int64)" assert result.strip() == "c1\tNullable(Int64)"
result = instance.query( result = instance.query(
f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test*.jsoncompacteachrow') settings input_format_json_infer_incomplete_types_as_strings=0" f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test*.jsoncompacteachrow') settings input_format_json_infer_incomplete_types_as_strings=0"
) )
assert sorted(result.split()) == ["0", "\\N"] assert sorted(result.split()) == ["0", "\\N"]
instance.query( instance.query(
f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test3.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select NULL" f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test3.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select NULL"
) )
url_filename = "test{1,3}.jsoncompacteachrow" url_filename = "test{1,3}.jsoncompacteachrow"
result = instance.query_and_get_error( result = instance.query_and_get_error(
f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{url_filename}') settings schema_inference_use_cache_for_s3=0, input_format_json_infer_incomplete_types_as_strings=0" f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{url_filename}') settings schema_inference_use_cache_for_s3=0, input_format_json_infer_incomplete_types_as_strings=0"
) )
assert "All attempts to extract table structure from files failed" in result assert "All attempts to extract table structure from files failed" in result
result = instance.query_and_get_error( result = instance.query_and_get_error(
f"desc url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{url_filename}') settings schema_inference_use_cache_for_url=0, input_format_json_infer_incomplete_types_as_strings=0" f"desc url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{url_filename}') settings schema_inference_use_cache_for_url=0, input_format_json_infer_incomplete_types_as_strings=0"
) )
assert "All attempts to extract table structure from files failed" in result assert "All attempts to extract table structure from files failed" in result
instance.query( instance.query(
f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test0.jsoncompacteachrow', 'TSV', 'x String') select '[123;]'" f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test0.jsoncompacteachrow', 'TSV', 'x String') select '[123;]'"
) )
result = instance.query_and_get_error( result = instance.query_and_get_error(
f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test*.jsoncompacteachrow') settings schema_inference_use_cache_for_s3=0, input_format_json_infer_incomplete_types_as_strings=0" f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test*.jsoncompacteachrow') settings schema_inference_use_cache_for_s3=0, input_format_json_infer_incomplete_types_as_strings=0"
) )
assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in result assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in result
@ -1438,7 +1471,7 @@ def test_schema_inference_from_globs(started_cluster):
url_filename = "test{0,1,2,3}.jsoncompacteachrow" url_filename = "test{0,1,2,3}.jsoncompacteachrow"
result = instance.query_and_get_error( result = instance.query_and_get_error(
f"desc url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{url_filename}') settings schema_inference_use_cache_for_url=0, input_format_json_infer_incomplete_types_as_strings=0" f"desc url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{url_filename}') settings schema_inference_use_cache_for_url=0, input_format_json_infer_incomplete_types_as_strings=0"
) )
assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in result assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in result
@ -1498,9 +1531,12 @@ def test_signatures(started_cluster):
) )
assert "S3_ERROR" in error assert "S3_ERROR" in error
instance.query(f"drop table test_signatures")
def test_select_columns(started_cluster): def test_select_columns(started_cluster):
bucket = started_cluster.minio_bucket bucket = started_cluster.minio_bucket
id = uuid.uuid4()
instance = started_cluster.instances["dummy"] instance = started_cluster.instances["dummy"]
name = "test_table2" name = "test_table2"
structure = "id UInt32, value1 Int32, value2 Int32" structure = "id UInt32, value1 Int32, value2 Int32"
@ -1514,36 +1550,37 @@ def test_select_columns(started_cluster):
instance.query( instance.query(
f"INSERT INTO {name} SELECT * FROM generateRandom('{structure}') LIMIT {limit} SETTINGS s3_truncate_on_insert=1" f"INSERT INTO {name} SELECT * FROM generateRandom('{structure}') LIMIT {limit} SETTINGS s3_truncate_on_insert=1"
) )
instance.query(f"SELECT value2 FROM {name}") instance.query(f"SELECT value2, '{id}' FROM {name}")
instance.query("SYSTEM FLUSH LOGS") instance.query("SYSTEM FLUSH LOGS")
result1 = instance.query( result1 = instance.query(
f"SELECT ProfileEvents['ReadBufferFromS3Bytes'] FROM system.query_log WHERE type='QueryFinish' and query LIKE 'SELECT value2 FROM {name}'" f"SELECT ProfileEvents['ReadBufferFromS3Bytes'] FROM system.query_log WHERE type='QueryFinish' and query LIKE 'SELECT value2, ''{id}'' FROM {name}'"
) )
instance.query(f"SELECT * FROM {name}") instance.query(f"SELECT *, '{id}' FROM {name}")
instance.query("SYSTEM FLUSH LOGS") instance.query("SYSTEM FLUSH LOGS")
result2 = instance.query( result2 = instance.query(
f"SELECT ProfileEvents['ReadBufferFromS3Bytes'] FROM system.query_log WHERE type='QueryFinish' and query LIKE 'SELECT * FROM {name}'" f"SELECT ProfileEvents['ReadBufferFromS3Bytes'] FROM system.query_log WHERE type='QueryFinish' and query LIKE 'SELECT *, ''{id}'' FROM {name}'"
) )
assert round(int(result2) / int(result1)) == 3 assert round(int(result2) / int(result1)) == 3
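The change above tags each query with a per-run UUID constant so the later system.query_log lookup matches only this run and not a concurrent test. A minimal sketch of that pattern, assuming a client object with a query method standing in for the test's instance helper:

import uuid

def read_bytes_for_query(client, table):
    run_id = uuid.uuid4()
    # Embed the UUID as a constant so the query text itself becomes unique.
    client.query(f"SELECT value2, '{run_id}' FROM {table}")
    client.query("SYSTEM FLUSH LOGS")
    # Look the query up by its unique tag instead of a generic LIKE pattern.
    return client.query(
        "SELECT ProfileEvents['ReadBufferFromS3Bytes'] "
        "FROM system.query_log "
        f"WHERE type = 'QueryFinish' AND query LIKE '%{run_id}%'"
    )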
def test_insert_select_schema_inference(started_cluster): def test_insert_select_schema_inference(started_cluster):
id = uuid.uuid4()
bucket = started_cluster.minio_bucket bucket = started_cluster.minio_bucket
instance = started_cluster.instances["dummy"] instance = started_cluster.instances["dummy"]
instance.query( instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_insert_select.native') select toUInt64(1) as x" f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test_insert_select.native') select toUInt64(1) as x"
) )
result = instance.query( result = instance.query(
f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_insert_select.native')" f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test_insert_select.native')"
) )
assert result.strip() == "x\tUInt64" assert result.strip() == "x\tUInt64"
result = instance.query( result = instance.query(
f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_insert_select.native')" f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test_insert_select.native')"
) )
assert int(result) == 1 assert int(result) == 1
@ -1553,7 +1590,7 @@ def test_parallel_reading_with_memory_limit(started_cluster):
instance = started_cluster.instances["dummy"] instance = started_cluster.instances["dummy"]
instance.query( instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_memory_limit.native') select * from numbers(1000000)" f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_memory_limit.native') select * from numbers(1000000) SETTINGS s3_truncate_on_insert=1"
) )
result = instance.query_and_get_error( result = instance.query_and_get_error(
@ -1574,7 +1611,7 @@ def test_wrong_format_usage(started_cluster):
instance = started_cluster.instances["dummy"] instance = started_cluster.instances["dummy"]
instance.query( instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_wrong_format.native') select * from numbers(10e6)" f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_wrong_format.native') select * from numbers(10e6) SETTINGS s3_truncate_on_insert=1"
) )
# size(test_wrong_format.native) = 10e6*8+16(header) ~= 76MiB # size(test_wrong_format.native) = 10e6*8+16(header) ~= 76MiB
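The estimate in the comment above comes from 10e6 UInt64 values at 8 bytes each plus a small fixed header; a quick, purely illustrative check of that arithmetic:

rows = int(10e6)
approx_bytes = rows * 8 + 16           # 80,000,016 bytes
print(approx_bytes / (1024 * 1024))    # ~76.3 MiB, matching the comment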
@ -2097,11 +2134,11 @@ def test_read_subcolumns(started_cluster):
instance = started_cluster.instances["dummy"] instance = started_cluster.instances["dummy"]
instance.query( instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumns.tsv', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3)" f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumns.tsv', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3) SETTINGS s3_truncate_on_insert=1"
) )
instance.query( instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumns.jsonl', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3)" f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumns.jsonl', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3) SETTINGS s3_truncate_on_insert=1"
) )
res = instance.query( res = instance.query(
@ -2160,7 +2197,7 @@ def test_read_subcolumn_time(started_cluster):
instance = started_cluster.instances["dummy"] instance = started_cluster.instances["dummy"]
instance.query( instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumn_time.tsv', auto, 'a UInt32') select (42)" f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumn_time.tsv', auto, 'a UInt32') select (42) SETTINGS s3_truncate_on_insert=1"
) )
res = instance.query( res = instance.query(
@ -2171,29 +2208,30 @@ def test_read_subcolumn_time(started_cluster):
def test_filtering_by_file_or_path(started_cluster): def test_filtering_by_file_or_path(started_cluster):
id = uuid.uuid4()
bucket = started_cluster.minio_bucket bucket = started_cluster.minio_bucket
instance = started_cluster.instances["dummy"] instance = started_cluster.instances["dummy"]
instance.query( instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_filter1.tsv', auto, 'x UInt64') select 1" f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_filter1.tsv', auto, 'x UInt64') select 1 SETTINGS s3_truncate_on_insert=1"
) )
instance.query( instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_filter2.tsv', auto, 'x UInt64') select 2" f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_filter2.tsv', auto, 'x UInt64') select 2 SETTINGS s3_truncate_on_insert=1"
) )
instance.query( instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_filter3.tsv', auto, 'x UInt64') select 3" f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_filter3.tsv', auto, 'x UInt64') select 3 SETTINGS s3_truncate_on_insert=1"
) )
instance.query( instance.query(
f"select count() from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_filter*.tsv') where _file = 'test_filter1.tsv'" f"select count(), '{id}' from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_filter*.tsv') where _file = 'test_filter1.tsv'"
) )
instance.query("SYSTEM FLUSH LOGS") instance.query("SYSTEM FLUSH LOGS")
result = instance.query( result = instance.query(
f"SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query like '%select%s3%test_filter%' AND type='QueryFinish'" f"SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query like '%{id}%' AND type='QueryFinish'"
) )
assert int(result) == 1 assert int(result) == 1
@ -2206,54 +2244,56 @@ def test_filtering_by_file_or_path(started_cluster):
def test_union_schema_inference_mode(started_cluster): def test_union_schema_inference_mode(started_cluster):
id = uuid.uuid4()
bucket = started_cluster.minio_bucket bucket = started_cluster.minio_bucket
instance = started_cluster.instances["s3_non_default"] instance = started_cluster.instances["s3_non_default"]
file_name_prefix = f"test_union_schema_inference_{id}_"
instance.query( instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_union_schema_inference1.jsonl') select 1 as a" f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}1.jsonl') select 1 as a SETTINGS s3_truncate_on_insert=1"
) )
instance.query( instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_union_schema_inference2.jsonl') select 2 as b" f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}2.jsonl') select 2 as b SETTINGS s3_truncate_on_insert=1"
) )
instance.query( instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_union_schema_inference3.jsonl') select 2 as c" f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}3.jsonl') select 2 as c SETTINGS s3_truncate_on_insert=1"
) )
instance.query( instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_union_schema_inference4.jsonl', TSV) select 'Error'" f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}4.jsonl', TSV) select 'Error' SETTINGS s3_truncate_on_insert=1"
) )
for engine in ["s3", "url"]: for engine in ["s3", "url"]:
instance.query("system drop schema cache for s3") instance.query("system drop schema cache for s3")
result = instance.query( result = instance.query(
f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_union_schema_inference{{1,2,3}}.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV" f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}{{1,2,3}}.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
) )
assert result == "a\tNullable(Int64)\nb\tNullable(Int64)\nc\tNullable(Int64)\n" assert result == "a\tNullable(Int64)\nb\tNullable(Int64)\nc\tNullable(Int64)\n"
result = instance.query( result = instance.query(
"select schema_inference_mode, splitByChar('/', source)[-1] as file, schema from system.schema_inference_cache where source like '%test_union_schema_inference%' order by file format TSV" f"select schema_inference_mode, splitByChar('/', source)[-1] as file, schema from system.schema_inference_cache where source like '%{file_name_prefix}%' order by file format TSV"
) )
assert ( assert (
result == "UNION\ttest_union_schema_inference1.jsonl\ta Nullable(Int64)\n" result == f"UNION\t{file_name_prefix}1.jsonl\ta Nullable(Int64)\n"
"UNION\ttest_union_schema_inference2.jsonl\tb Nullable(Int64)\n" f"UNION\t{file_name_prefix}2.jsonl\tb Nullable(Int64)\n"
"UNION\ttest_union_schema_inference3.jsonl\tc Nullable(Int64)\n" f"UNION\t{file_name_prefix}3.jsonl\tc Nullable(Int64)\n"
) )
result = instance.query( result = instance.query(
f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_union_schema_inference{{1,2,3}}.jsonl') order by tuple(*) settings schema_inference_mode='union', describe_compact_output=1 format TSV" f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}{{1,2,3}}.jsonl') order by tuple(*) settings schema_inference_mode='union', describe_compact_output=1 format TSV"
) )
assert result == "1\t\\N\t\\N\n" "\\N\t2\t\\N\n" "\\N\t\\N\t2\n" assert result == "1\t\\N\t\\N\n" "\\N\t2\t\\N\n" "\\N\t\\N\t2\n"
instance.query(f"system drop schema cache for {engine}") instance.query(f"system drop schema cache for {engine}")
result = instance.query( result = instance.query(
f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_union_schema_inference2.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV" f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}2.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
) )
assert result == "b\tNullable(Int64)\n" assert result == "b\tNullable(Int64)\n"
result = instance.query( result = instance.query(
f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_union_schema_inference{{1,2,3}}.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV" f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}{{1,2,3}}.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
) )
assert ( assert (
result == "a\tNullable(Int64)\n" result == "a\tNullable(Int64)\n"
@ -2262,7 +2302,7 @@ def test_union_schema_inference_mode(started_cluster):
) )
error = instance.query_and_get_error( error = instance.query_and_get_error(
f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_union_schema_inference{{1,2,3,4}}.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV" f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}{{1,2,3,4}}.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
) )
assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in error assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in error


@ -7,7 +7,7 @@
<query>SELECT sum(NOT ignore(*)) FROM (SELECT * FROM generateRandom('d Date, dt DateTime, dtm DateTime(\'Asia/Istanbul\')', 0, 10, 10) LIMIT 1000000000);</query> <query>SELECT sum(NOT ignore(*)) FROM (SELECT * FROM generateRandom('d Date, dt DateTime, dtm DateTime(\'Asia/Istanbul\')', 0, 10, 10) LIMIT 1000000000);</query>
<query>SELECT sum(NOT ignore(*)) FROM (SELECT * FROM generateRandom('dt64 DateTime64, dts64 DateTime64(6), dtms64 DateTime64(6 ,\'Asia/Istanbul\')', 0, 10, 10) LIMIT 100000000);</query> <query>SELECT sum(NOT ignore(*)) FROM (SELECT * FROM generateRandom('dt64 DateTime64, dts64 DateTime64(6), dtms64 DateTime64(6 ,\'Asia/Istanbul\')', 0, 10, 10) LIMIT 100000000);</query>
<query>SELECT sum(NOT ignore(*)) FROM (SELECT * FROM generateRandom('f32 Float32, f64 Float64', 0, 10, 10) LIMIT 1000000000);</query> <query>SELECT sum(NOT ignore(*)) FROM (SELECT * FROM generateRandom('f32 Float32, f64 Float64', 0, 10, 10) LIMIT 1000000000);</query>
<query>SELECT sum(NOT ignore(*)) FROM (SELECT * FROM generateRandom('d32 Decimal32(4), d64 Decimal64(8), d128 Decimal128(16)', 0, 10, 10) LIMIT 1000000000);</query> <query>SELECT sum(NOT ignore(*)) FROM (SELECT * FROM generateRandom('d32 Decimal32(4), d64 Decimal64(8), d128 Decimal128(16)', 0, 10, 10) LIMIT 100000000);</query>
<query>SELECT sum(NOT ignore(*)) FROM (SELECT * FROM generateRandom('i Tuple(Int32, Int64)', 0, 10, 10) LIMIT 1000000000);</query> <query>SELECT sum(NOT ignore(*)) FROM (SELECT * FROM generateRandom('i Tuple(Int32, Int64)', 0, 10, 10) LIMIT 1000000000);</query>
<query>SELECT sum(NOT ignore(*)) FROM (SELECT * FROM generateRandom('i Array(Int8)', 0, 10, 10) LIMIT 100000000);</query> <query>SELECT sum(NOT ignore(*)) FROM (SELECT * FROM generateRandom('i Array(Int8)', 0, 10, 10) LIMIT 100000000);</query>
<query>SELECT sum(NOT ignore(*)) FROM (SELECT * FROM generateRandom('i Array(Nullable(Int32))', 0, 10, 10) LIMIT 100000000);</query> <query>SELECT sum(NOT ignore(*)) FROM (SELECT * FROM generateRandom('i Array(Nullable(Int32))', 0, 10, 10) LIMIT 100000000);</query>


@ -1,5 +1,5 @@
drop table if exists lc_dict_reading; drop table if exists lc_dict_reading;
create table lc_dict_reading (val UInt64, str StringWithDictionary, pat String) engine = MergeTree order by val SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi'; create table lc_dict_reading (val UInt64, str LowCardinality(String), pat String) engine = MergeTree order by val SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi';
insert into lc_dict_reading select number, if(number < 8192 * 4, number % 100, number) as s, s from system.numbers limit 1000000; insert into lc_dict_reading select number, if(number < 8192 * 4, number % 100, number) as s, s from system.numbers limit 1000000;
select sum(toUInt64(str)), sum(toUInt64(pat)) from lc_dict_reading where val < 8129 or val > 8192 * 4; select sum(toUInt64(str)), sum(toUInt64(pat)) from lc_dict_reading where val < 8129 or val > 8192 * 4;
drop table if exists lc_dict_reading; drop table if exists lc_dict_reading;


@ -1,6 +1,6 @@
set allow_suspicious_low_cardinality_types = 1; set allow_suspicious_low_cardinality_types = 1;
drop table if exists lc_00688; drop table if exists lc_00688;
create table lc_00688 (str StringWithDictionary, val UInt8WithDictionary) engine = MergeTree order by tuple(); create table lc_00688 (str LowCardinality(String), val LowCardinality(UInt8)) engine = MergeTree order by tuple();
insert into lc_00688 values ('a', 1), ('b', 2); insert into lc_00688 values ('a', 1), ('b', 2);
select str, str in ('a', 'd') from lc_00688; select str, str in ('a', 'd') from lc_00688;
select val, val in (1, 3) from lc_00688; select val, val in (1, 3) from lc_00688;


@ -1,5 +1,5 @@
drop table if exists lc_prewhere; drop table if exists lc_prewhere;
create table lc_prewhere (key UInt64, val UInt64, str StringWithDictionary, s String) engine = MergeTree order by key settings index_granularity = 8192; create table lc_prewhere (key UInt64, val UInt64, str LowCardinality(String), s String) engine = MergeTree order by key settings index_granularity = 8192;
insert into lc_prewhere select number, if(number < 10 or number > 8192 * 9, 1, 0), toString(number) as s, s from system.numbers limit 100000; insert into lc_prewhere select number, if(number < 10 or number > 8192 * 9, 1, 0), toString(number) as s, s from system.numbers limit 100000;
select sum(toUInt64(str)), sum(toUInt64(s)) from lc_prewhere prewhere val == 1; select sum(toUInt64(str)), sum(toUInt64(s)) from lc_prewhere prewhere val == 1;
drop table if exists lc_prewhere; drop table if exists lc_prewhere;


@ -8,8 +8,8 @@ select 'MergeTree';
drop table if exists lc_small_dict; drop table if exists lc_small_dict;
drop table if exists lc_big_dict; drop table if exists lc_big_dict;
create table lc_small_dict (str StringWithDictionary) engine = MergeTree order by str SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi'; create table lc_small_dict (str LowCardinality(String)) engine = MergeTree order by str SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi';
create table lc_big_dict (str StringWithDictionary) engine = MergeTree order by str SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi'; create table lc_big_dict (str LowCardinality(String)) engine = MergeTree order by str SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi';
insert into lc_small_dict select toString(number % 1000) from system.numbers limit 1000000; insert into lc_small_dict select toString(number % 1000) from system.numbers limit 1000000;
insert into lc_big_dict select toString(number) from system.numbers limit 1000000; insert into lc_big_dict select toString(number) from system.numbers limit 1000000;


@ -1,13 +1,7 @@
a a
a a
a
a
1 1
1 1
1
1
ab
ab
ab ab
ab ab
- -


@ -13,56 +13,32 @@ drop table if exists lc_null_fix_str_0;
drop table if exists lc_null_fix_str_1; drop table if exists lc_null_fix_str_1;
create table lc_str_0 (str LowCardinality(String)) engine = Memory; create table lc_str_0 (str LowCardinality(String)) engine = Memory;
create table lc_str_1 (str StringWithDictionary) engine = Memory;
create table lc_null_str_0 (str LowCardinality(Nullable(String))) engine = Memory; create table lc_null_str_0 (str LowCardinality(Nullable(String))) engine = Memory;
create table lc_null_str_1 (str NullableWithDictionary(String)) engine = Memory;
create table lc_int8_0 (val LowCardinality(Int8)) engine = Memory; create table lc_int8_0 (val LowCardinality(Int8)) engine = Memory;
create table lc_int8_1 (val Int8WithDictionary) engine = Memory;
create table lc_null_int8_0 (val LowCardinality(Nullable(Int8))) engine = Memory; create table lc_null_int8_0 (val LowCardinality(Nullable(Int8))) engine = Memory;
create table lc_null_int8_1 (val NullableWithDictionary(Int8)) engine = Memory;
create table lc_fix_str_0 (str LowCardinality(FixedString(2))) engine = Memory; create table lc_fix_str_0 (str LowCardinality(FixedString(2))) engine = Memory;
create table lc_fix_str_1 (str FixedStringWithDictionary(2)) engine = Memory;
create table lc_null_fix_str_0 (str LowCardinality(Nullable(FixedString(2)))) engine = Memory; create table lc_null_fix_str_0 (str LowCardinality(Nullable(FixedString(2)))) engine = Memory;
create table lc_null_fix_str_1 (str NullableWithDictionary(FixedString(2))) engine = Memory;
insert into lc_str_0 select 'a'; insert into lc_str_0 select 'a';
insert into lc_str_1 select 'a';
insert into lc_null_str_0 select 'a'; insert into lc_null_str_0 select 'a';
insert into lc_null_str_1 select 'a';
insert into lc_int8_0 select 1; insert into lc_int8_0 select 1;
insert into lc_int8_1 select 1;
insert into lc_null_int8_0 select 1; insert into lc_null_int8_0 select 1;
insert into lc_null_int8_1 select 1;
insert into lc_fix_str_0 select 'ab'; insert into lc_fix_str_0 select 'ab';
insert into lc_fix_str_1 select 'ab';
insert into lc_null_fix_str_0 select 'ab'; insert into lc_null_fix_str_0 select 'ab';
insert into lc_null_fix_str_1 select 'ab';
select str from lc_str_0; select str from lc_str_0;
select str from lc_str_1;
select str from lc_null_str_0; select str from lc_null_str_0;
select str from lc_null_str_1;
select val from lc_int8_0; select val from lc_int8_0;
select val from lc_int8_1;
select val from lc_null_int8_0; select val from lc_null_int8_0;
select val from lc_null_int8_1;
select str from lc_fix_str_0; select str from lc_fix_str_0;
select str from lc_fix_str_1;
select str from lc_null_fix_str_0; select str from lc_null_fix_str_0;
select str from lc_null_fix_str_1;
drop table if exists lc_str_0; drop table if exists lc_str_0;
drop table if exists lc_str_1;
drop table if exists lc_null_str_0; drop table if exists lc_null_str_0;
drop table if exists lc_null_str_1;
drop table if exists lc_int8_0; drop table if exists lc_int8_0;
drop table if exists lc_int8_1;
drop table if exists lc_null_int8_0; drop table if exists lc_null_int8_0;
drop table if exists lc_null_int8_1;
drop table if exists lc_fix_str_0; drop table if exists lc_fix_str_0;
drop table if exists lc_fix_str_1;
drop table if exists lc_null_fix_str_0; drop table if exists lc_null_fix_str_0;
drop table if exists lc_null_fix_str_1;
select '-'; select '-';
SELECT toLowCardinality('a') AS s, toTypeName(s), toTypeName(length(s)) from system.one; SELECT toLowCardinality('a') AS s, toTypeName(s), toTypeName(length(s)) from system.one;
@ -73,7 +49,7 @@ select (toLowCardinality(z) as val) || 'b' from (select arrayJoin(['c', 'd']) a
select '-'; select '-';
drop table if exists lc_str_uuid; drop table if exists lc_str_uuid;
create table lc_str_uuid(str1 String, str2 LowCardinality(String), str3 StringWithDictionary) ENGINE=Memory; create table lc_str_uuid(str1 String, str2 LowCardinality(String), str3 LowCardinality(String)) ENGINE=Memory;
select toUUID(str1), toUUID(str2), toUUID(str3) from lc_str_uuid; select toUUID(str1), toUUID(str2), toUUID(str3) from lc_str_uuid;
select toUUID(str1, '', NULL), toUUID(str2, '', NULL), toUUID(str3, '', NULL) from lc_str_uuid; select toUUID(str1, '', NULL), toUUID(str2, '', NULL), toUUID(str3, '', NULL) from lc_str_uuid;
insert into lc_str_uuid values ('61f0c404-5cb3-11e7-907b-a6006ad3dba0', '61f0c404-5cb3-11e7-907b-a6006ad3dba0', '61f0c404-5cb3-11e7-907b-a6006ad3dba0'); insert into lc_str_uuid values ('61f0c404-5cb3-11e7-907b-a6006ad3dba0', '61f0c404-5cb3-11e7-907b-a6006ad3dba0', '61f0c404-5cb3-11e7-907b-a6006ad3dba0');


@ -1,5 +1,5 @@
drop table if exists tab_00717; drop table if exists tab_00717;
create table tab_00717 (a String, b StringWithDictionary) engine = MergeTree order by a; create table tab_00717 (a String, b LowCardinality(String)) engine = MergeTree order by a;
insert into tab_00717 values ('a_1', 'b_1'), ('a_2', 'b_2'); insert into tab_00717 values ('a_1', 'b_1'), ('a_2', 'b_2');
select count() from tab_00717; select count() from tab_00717;
select a from tab_00717 group by a order by a; select a from tab_00717 group by a order by a;


@ -7,7 +7,7 @@ alter table tab_00718 modify column b UInt32;
select *, toTypeName(b) from tab_00718; select *, toTypeName(b) from tab_00718;
alter table tab_00718 modify column b LowCardinality(UInt32); alter table tab_00718 modify column b LowCardinality(UInt32);
select *, toTypeName(b) from tab_00718; select *, toTypeName(b) from tab_00718;
alter table tab_00718 modify column b StringWithDictionary; alter table tab_00718 modify column b LowCardinality(String);
select *, toTypeName(b) from tab_00718; select *, toTypeName(b) from tab_00718;
alter table tab_00718 modify column b LowCardinality(UInt32); alter table tab_00718 modify column b LowCardinality(UInt32);
select *, toTypeName(b) from tab_00718; select *, toTypeName(b) from tab_00718;


@ -1,7 +1,7 @@
drop table if exists lc_00752; drop table if exists lc_00752;
drop table if exists lc_mv_00752; drop table if exists lc_mv_00752;
create table lc_00752 (str StringWithDictionary) engine = MergeTree order by tuple(); create table lc_00752 (str LowCardinality(String)) engine = MergeTree order by tuple();
insert into lc_00752 values ('a'), ('bbb'), ('ab'), ('accccc'), ('baasddas'), ('bcde'); insert into lc_00752 values ('a'), ('bbb'), ('ab'), ('accccc'), ('baasddas'), ('bcde');
@ -12,4 +12,3 @@ select * from lc_mv_00752 order by letter;
drop table if exists lc_00752; drop table if exists lc_00752;
drop table if exists lc_mv_00752; drop table if exists lc_mv_00752;


@ -11,12 +11,17 @@ $CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS mem"
$CLICKHOUSE_CLIENT -q "CREATE TABLE mem (x UInt64) engine = Memory" $CLICKHOUSE_CLIENT -q "CREATE TABLE mem (x UInt64) engine = Memory"
function f { function f {
local TIMELIMIT=$((SECONDS+$1))
for _ in $(seq 1 300); do for _ in $(seq 1 300); do
$CLICKHOUSE_CLIENT -q "SELECT count() FROM (SELECT * FROM mem SETTINGS max_threads=2) FORMAT Null;" $CLICKHOUSE_CLIENT -q "SELECT count() FROM (SELECT * FROM mem SETTINGS max_threads=2) FORMAT Null;"
if [ $SECONDS -ge "$TIMELIMIT" ]; then
break
fi
done done
} }
function g { function g {
local TIMELIMIT=$((SECONDS+$1))
for _ in $(seq 1 100); do for _ in $(seq 1 100); do
$CLICKHOUSE_CLIENT -n -q " $CLICKHOUSE_CLIENT -n -q "
INSERT INTO mem SELECT number FROM numbers(1000000); INSERT INTO mem SELECT number FROM numbers(1000000);
@ -30,14 +35,18 @@ function g {
INSERT INTO mem VALUES (1); INSERT INTO mem VALUES (1);
TRUNCATE TABLE mem; TRUNCATE TABLE mem;
" "
if [ $SECONDS -ge "$TIMELIMIT" ]; then
break
fi
done done
} }
export -f f; export -f f;
export -f g; export -f g;
timeout 20 bash -c f > /dev/null & TIMEOUT=20
timeout 20 bash -c g > /dev/null & f $TIMEOUT &
g $TIMEOUT &
wait wait
$CLICKHOUSE_CLIENT -q "DROP TABLE mem" $CLICKHOUSE_CLIENT -q "DROP TABLE mem"
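The shell change above drops the external timeout wrapper and instead checks a deadline inside each loop (SECONDS against TIMELIMIT), so the functions stop themselves cleanly. The same deadline-in-loop idea as a minimal Python sketch, for illustration only; run_for and body are hypothetical names:

import time

def run_for(seconds, body, max_iterations=300):
    # Keep the original iteration cap, but also stop once the deadline passes,
    # mirroring what functions f and g now do with SECONDS/TIMELIMIT.
    deadline = time.monotonic() + seconds
    for _ in range(max_iterations):
        body()
        if time.monotonic() >= deadline:
            break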


@ -28,7 +28,7 @@ ORDER BY tuple();
INSERT INTO t_01411_num (num) SELECT number % 1000 FROM numbers(100000); INSERT INTO t_01411_num (num) SELECT number % 1000 FROM numbers(100000);
create table lc_dict_reading (val UInt64, str StringWithDictionary, pat String) engine = MergeTree order by val; create table lc_dict_reading (val UInt64, str LowCardinality(String), pat String) engine = MergeTree order by val;
insert into lc_dict_reading select number, if(number < 8192 * 4, number % 100, number) as s, s from system.numbers limit 100000; insert into lc_dict_reading select number, if(number < 8192 * 4, number % 100, number) as s, s from system.numbers limit 100000;
""" """


@ -0,0 +1,13 @@
-- Tags: no-parallel
-- Tag no-parallel: Messes with internal cache
SYSTEM DROP QUERY CACHE;
-- Create an entry in the query cache
SELECT 1 SETTINGS use_query_cache = true;
-- Asynchronous metrics must know about the entry
SYSTEM RELOAD ASYNCHRONOUS METRICS;
SELECT value FROM system.asynchronous_metrics WHERE metric = 'QueryCacheEntries';
SYSTEM DROP QUERY CACHE;


@ -3,7 +3,11 @@ all_1_1_0
all_2_2_0 all_2_2_0
all_3_3_0 all_3_3_0
all_4_4_0 all_4_4_0
40000
5000 all_1_1_0_9 5000 all_1_1_0_9
5000 all_2_2_0_9 5000 all_2_2_0_9
5000 all_3_3_0_9 5000 all_3_3_0_9
5000 all_4_4_0_9 5000 all_4_4_0_9
mutation_version has_parts_for_which_set_was_built has_parts_that_shared_set
8 1 1
9 1 1


@ -18,12 +18,35 @@ SELECT name FROM system.parts WHERE database=currentDatabase() AND table = '0258
-- Start multiple mutations simultaneously -- Start multiple mutations simultaneously
SYSTEM STOP MERGES 02581_trips; SYSTEM STOP MERGES 02581_trips;
ALTER TABLE 02581_trips UPDATE description='5' WHERE id IN (SELECT (number*10 + 5)::UInt32 FROM numbers(200000000)) SETTINGS mutations_sync=0; ALTER TABLE 02581_trips UPDATE description='5' WHERE id IN (SELECT (number*10 + 5)::UInt32 FROM numbers(10000000)) SETTINGS mutations_sync=0;
ALTER TABLE 02581_trips UPDATE description='6' WHERE id IN (SELECT (number*10 + 6)::UInt32 FROM numbers(200000000)) SETTINGS mutations_sync=0; ALTER TABLE 02581_trips UPDATE description='6' WHERE id IN (SELECT (number*10 + 6)::UInt32 FROM numbers(10000000)) SETTINGS mutations_sync=0;
ALTER TABLE 02581_trips DELETE WHERE id IN (SELECT (number*10 + 7)::UInt32 FROM numbers(200000000)) SETTINGS mutations_sync=0; ALTER TABLE 02581_trips DELETE WHERE id IN (SELECT (number*10 + 7)::UInt32 FROM numbers(10000000)) SETTINGS mutations_sync=0;
ALTER TABLE 02581_trips UPDATE description='8' WHERE id IN (SELECT (number*10 + 8)::UInt32 FROM numbers(200000000)) SETTINGS mutations_sync=0; ALTER TABLE 02581_trips UPDATE description='8' WHERE id IN (SELECT (number*10 + 8)::UInt32 FROM numbers(10000000)) SETTINGS mutations_sync=0;
SYSTEM START MERGES 02581_trips; SYSTEM START MERGES 02581_trips;
DELETE FROM 02581_trips WHERE id IN (SELECT (number*10 + 9)::UInt32 FROM numbers(200000000));
SELECT count(), _part from 02581_trips WHERE description = '' GROUP BY _part ORDER BY _part; -- Wait for mutations to finish
SELECT count() FROM 02581_trips SETTINGS select_sequential_consistency = 1;
DELETE FROM 02581_trips WHERE id IN (SELECT (number*10 + 9)::UInt32 FROM numbers(10000000)) SETTINGS lightweight_deletes_sync = 2;
SELECT count(), _part from 02581_trips WHERE description = '' GROUP BY _part ORDER BY _part SETTINGS select_sequential_consistency=1;
SYSTEM FLUSH LOGS;
-- Check that in every mutation there were parts that built sets (log messages like 'Created Set with 10000000 entries from 10000000 rows in 0.388989187 sec.' )
-- and parts that shared sets (log messages like 'Got set from cache in 0.388930505 sec.' )
WITH (
SELECT uuid
FROM system.tables
WHERE (database = currentDatabase()) AND (name = '02581_trips')
) AS table_uuid
SELECT
CAST(splitByChar('_', query_id)[5], 'UInt64') AS mutation_version, -- '5521485f-8a40-4aba-87a2-00342c369563::all_3_3_0_6'
sum(message LIKE 'Created Set with % entries%') >= 1 AS has_parts_for_which_set_was_built,
sum(message LIKE 'Got set from cache%') >= 1 AS has_parts_that_shared_set
FROM system.text_log
WHERE
query_id LIKE concat(CAST(table_uuid, 'String'), '::all\\_%')
AND (event_date >= yesterday())
AND (message LIKE 'Created Set with % entries%' OR message LIKE 'Got set from cache%')
GROUP BY mutation_version ORDER BY mutation_version FORMAT TSVWithNames;
DROP TABLE 02581_trips; DROP TABLE 02581_trips;
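The new check above recovers the mutation version from query_id values shaped like '5521485f-8a40-4aba-87a2-00342c369563::all_3_3_0_6' by taking the fifth '_'-separated field (ClickHouse arrays are 1-based). The equivalent parsing as a small Python sketch, illustrative only:

def mutation_version(query_id: str) -> int:
    # Splitting on '_' gives ['...::all', '3', '3', '0', '6']; the last field
    # is what splitByChar('_', query_id)[5] selects in the SQL above.
    return int(query_id.split("_")[-1])

assert mutation_version("5521485f-8a40-4aba-87a2-00342c369563::all_3_3_0_6") == 6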


@ -10,3 +10,11 @@ all_4_4_0
20000 20000
16000 16000
12000 12000
mutation_version has_parts_for_which_set_was_built has_parts_that_shared_set
5 1 1
6 1 1
7 1 1
8 1 1
9 1 1
10 1 1
11 1 1


@ -18,42 +18,63 @@ SELECT count() from 02581_trips WHERE description = '';
SELECT name FROM system.parts WHERE database=currentDatabase() AND table = '02581_trips' AND active ORDER BY name; SELECT name FROM system.parts WHERE database=currentDatabase() AND table = '02581_trips' AND active ORDER BY name;
-- Run mutation with `id` a 'IN big subquery' -- Run mutation with `id` a 'IN big subquery'
ALTER TABLE 02581_trips UPDATE description='a' WHERE id IN (SELECT (number*10)::UInt32 FROM numbers(200000000)) SETTINGS mutations_sync=2; ALTER TABLE 02581_trips UPDATE description='a' WHERE id IN (SELECT (number*10)::UInt32 FROM numbers(10000000)) SETTINGS mutations_sync=2;
SELECT count() from 02581_trips WHERE description = ''; SELECT count() from 02581_trips WHERE description = '';
ALTER TABLE 02581_trips UPDATE description='a' WHERE id IN (SELECT (number*10 + 1)::UInt32 FROM numbers(200000000)) SETTINGS mutations_sync=2, max_rows_in_set=1000; ALTER TABLE 02581_trips UPDATE description='a' WHERE id IN (SELECT (number*10 + 1)::UInt32 FROM numbers(10000000)) SETTINGS mutations_sync=2, max_rows_in_set=1000;
SELECT count() from 02581_trips WHERE description = ''; SELECT count() from 02581_trips WHERE description = '';
-- Run mutation with func(`id`) IN big subquery -- Run mutation with func(`id`) IN big subquery
ALTER TABLE 02581_trips UPDATE description='b' WHERE id::UInt64 IN (SELECT (number*10 + 2)::UInt32 FROM numbers(200000000)) SETTINGS mutations_sync=2; ALTER TABLE 02581_trips UPDATE description='b' WHERE id::UInt64 IN (SELECT (number*10 + 2)::UInt32 FROM numbers(10000000)) SETTINGS mutations_sync=2;
SELECT count() from 02581_trips WHERE description = ''; SELECT count() from 02581_trips WHERE description = '';
-- Run mutation with non-PK `id2` IN big subquery -- Run mutation with non-PK `id2` IN big subquery
ALTER TABLE 02581_trips UPDATE description='c' WHERE id2 IN (SELECT (number*10 + 3)::UInt32 FROM numbers(200000000)) SETTINGS mutations_sync=2; --SELECT count(), _part FROM 02581_trips WHERE id2 IN (SELECT (number*10 + 3)::UInt32 FROM numbers(10000000)) GROUP BY _part ORDER BY _part;
--EXPLAIN SELECT (), _part FROM 02581_trips WHERE id2 IN (SELECT (number*10 + 3)::UInt32 FROM numbers(10000000));
ALTER TABLE 02581_trips UPDATE description='c' WHERE id2 IN (SELECT (number*10 + 3)::UInt32 FROM numbers(10000000)) SETTINGS mutations_sync=2;
SELECT count() from 02581_trips WHERE description = ''; SELECT count() from 02581_trips WHERE description = '';
-- Run mutation with PK and non-PK IN big subquery -- Run mutation with PK and non-PK IN big subquery
ALTER TABLE 02581_trips UPDATE description='c' ALTER TABLE 02581_trips UPDATE description='c'
WHERE WHERE
(id IN (SELECT (number*10 + 4)::UInt32 FROM numbers(200000000))) OR (id IN (SELECT (number*10 + 4)::UInt32 FROM numbers(10000000))) OR
(id2 IN (SELECT (number*10 + 4)::UInt32 FROM numbers(200000000))) (id2 IN (SELECT (number*10 + 4)::UInt32 FROM numbers(10000000)))
SETTINGS mutations_sync=2; SETTINGS mutations_sync=2;
SELECT count() from 02581_trips WHERE description = ''; SELECT count() from 02581_trips WHERE description = '';
-- Run mutation with PK and non-PK IN big subquery -- Run mutation with PK and non-PK IN big subquery
ALTER TABLE 02581_trips UPDATE description='c' ALTER TABLE 02581_trips UPDATE description='c'
WHERE WHERE
(id::UInt64 IN (SELECT (number*10 + 5)::UInt32 FROM numbers(200000000))) OR (id::UInt64 IN (SELECT (number*10 + 5)::UInt32 FROM numbers(10000000))) OR
(id2::UInt64 IN (SELECT (number*10 + 5)::UInt32 FROM numbers(200000000))) (id2::UInt64 IN (SELECT (number*10 + 5)::UInt32 FROM numbers(10000000)))
SETTINGS mutations_sync=2; SETTINGS mutations_sync=2;
SELECT count() from 02581_trips WHERE description = ''; SELECT count() from 02581_trips WHERE description = '';
-- Run mutation with PK and non-PK IN big subquery -- Run mutation with PK and non-PK IN big subquery
ALTER TABLE 02581_trips UPDATE description='c' ALTER TABLE 02581_trips UPDATE description='c'
WHERE WHERE
(id::UInt32 IN (SELECT (number*10 + 6)::UInt32 FROM numbers(200000000))) OR (id::UInt32 IN (SELECT (number*10 + 6)::UInt32 FROM numbers(10000000))) OR
((id2+1)::String IN (SELECT (number*10 + 6)::UInt32 FROM numbers(200000000))) ((id2+1)::String IN (SELECT (number*10 + 6)::UInt32 FROM numbers(10000000)))
SETTINGS mutations_sync=2; SETTINGS mutations_sync=2;
SELECT count() from 02581_trips WHERE description = ''; SELECT count() from 02581_trips WHERE description = '';
SYSTEM FLUSH LOGS;
-- Check that in every mutation there were parts that built sets (log messages like 'Created Set with 10000000 entries from 10000000 rows in 0.388989187 sec.' )
-- and parts that shared sets (log messages like 'Got set from cache in 0.388930505 sec.' )
WITH (
SELECT uuid
FROM system.tables
WHERE (database = currentDatabase()) AND (name = '02581_trips')
) AS table_uuid
SELECT
CAST(splitByChar('_', query_id)[5], 'UInt64') AS mutation_version, -- '5521485f-8a40-4aba-87a2-00342c369563::all_3_3_0_6'
sum(message LIKE 'Created Set with % entries%') >= 1 AS has_parts_for_which_set_was_built,
sum(message LIKE 'Got set from cache%') >= 1 AS has_parts_that_shared_set
FROM system.text_log
WHERE
query_id LIKE concat(CAST(table_uuid, 'String'), '::all\\_%')
AND (event_date >= yesterday())
AND (message LIKE 'Created Set with % entries%' OR message LIKE 'Got set from cache%')
GROUP BY mutation_version ORDER BY mutation_version FORMAT TSVWithNames;
DROP TABLE 02581_trips; DROP TABLE 02581_trips;


@ -0,0 +1,14 @@
CREATE TABLE default.tab\n(\n `a` String,\n `b` UInt64,\n `c` Int64,\n `pk` String\n)\nENGINE = MergeTree\nORDER BY pk\nSETTINGS min_bytes_for_wide_part = 0, index_granularity = 8192
Test statistics count_min:
Prewhere info
Prewhere filter
Prewhere filter column: and(equals(a, \'0\'), equals(b, 0), equals(c, 0)) (removed)
Test statistics multi-types:
Prewhere info
Prewhere filter
Prewhere filter column: and(equals(a, \'0\'), less(c, -90), greater(b, 900)) (removed)
Prewhere info
Prewhere filter
Prewhere filter column: and(equals(a, \'10000\'), equals(b, 0), less(c, 0)) (removed)
Test LowCardinality and Nullable data type:
tab2


@ -0,0 +1,70 @@
-- Tags: no-fasttest
DROP TABLE IF EXISTS tab SYNC;
SET allow_experimental_statistics = 1;
SET allow_statistics_optimize = 1;
SET allow_suspicious_low_cardinality_types=1;
SET mutations_sync = 2;
CREATE TABLE tab
(
a String,
b UInt64,
c Int64,
pk String,
) Engine = MergeTree() ORDER BY pk
SETTINGS min_bytes_for_wide_part = 0;
SHOW CREATE TABLE tab;
INSERT INTO tab select toString(number % 10000), number % 1000, -(number % 100), generateUUIDv4() FROM system.numbers LIMIT 10000;
SELECT 'Test statistics count_min:';
ALTER TABLE tab ADD STATISTICS a TYPE count_min;
ALTER TABLE tab ADD STATISTICS b TYPE count_min;
ALTER TABLE tab ADD STATISTICS c TYPE count_min;
ALTER TABLE tab MATERIALIZE STATISTICS a, b, c;
SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8|_UInt16|_String', '')
FROM (EXPLAIN actions=1 SELECT count(*) FROM tab WHERE c = 0/*100*/ and b = 0/*10*/ and a = '0'/*1*/) xx
WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%';
ALTER TABLE tab DROP STATISTICS a, b, c;
SELECT 'Test statistics multi-types:';
ALTER TABLE tab ADD STATISTICS a TYPE count_min;
ALTER TABLE tab ADD STATISTICS b TYPE count_min, uniq, tdigest;
ALTER TABLE tab ADD STATISTICS c TYPE count_min, uniq, tdigest;
ALTER TABLE tab MATERIALIZE STATISTICS a, b, c;
SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8|_UInt16|_String', '')
FROM (EXPLAIN actions=1 SELECT count(*) FROM tab WHERE c < -90/*900*/ and b > 900/*990*/ and a = '0'/*1*/)
WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%';
SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8|_UInt16|_String', '')
FROM (EXPLAIN actions=1 SELECT count(*) FROM tab WHERE c < 0/*9900*/ and b = 0/*10*/ and a = '10000'/*0*/)
WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%';
ALTER TABLE tab DROP STATISTICS a, b, c;
DROP TABLE IF EXISTS tab SYNC;
SELECT 'Test LowCardinality and Nullable data type:';
DROP TABLE IF EXISTS tab2 SYNC;
SET allow_suspicious_low_cardinality_types=1;
CREATE TABLE tab2
(
a LowCardinality(Int64) STATISTICS(count_min),
b Nullable(Int64) STATISTICS(count_min),
c LowCardinality(Nullable(Int64)) STATISTICS(count_min),
pk String,
) Engine = MergeTree() ORDER BY pk;
select name from system.tables where name = 'tab2' and database = currentDatabase();
DROP TABLE IF EXISTS tab2 SYNC;


@ -70,3 +70,4 @@ SETTINGS min_bytes_for_wide_part = 0;
INSERT INTO t3 select number, -number, number/1000, generateUUIDv4() FROM system.numbers LIMIT 10000; INSERT INTO t3 select number, -number, number/1000, generateUUIDv4() FROM system.numbers LIMIT 10000;
DROP TABLE IF EXISTS t3; DROP TABLE IF EXISTS t3;


@ -2,8 +2,6 @@
# Tags: atomic-database # Tags: atomic-database
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# reset --log_comment
CLICKHOUSE_LOG_COMMENT=
# shellcheck source=../shell_config.sh # shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh . "$CUR_DIR"/../shell_config.sh
@ -134,7 +132,7 @@ while [ "`$CLICKHOUSE_CLIENT -nq "select status, next_refresh_time from refreshe
do do
sleep 0.1 sleep 0.1
done done
sleep 1
$CLICKHOUSE_CLIENT -nq " $CLICKHOUSE_CLIENT -nq "
select '<14: waiting for next cycle>', view, status, remaining_dependencies, next_refresh_time from refreshes; select '<14: waiting for next cycle>', view, status, remaining_dependencies, next_refresh_time from refreshes;
truncate src; truncate src;
@ -172,13 +170,13 @@ $CLICKHOUSE_CLIENT -nq "
drop table b; drop table b;
create materialized view c refresh every 1 second (x Int64) engine Memory empty as select * from src; create materialized view c refresh every 1 second (x Int64) engine Memory empty as select * from src;
drop table src;" drop table src;"
while [ "`$CLICKHOUSE_CLIENT -nq "select last_refresh_result from refreshes -- $LINENO" | xargs`" != 'Exception' ] while [ "`$CLICKHOUSE_CLIENT -nq "select last_refresh_result from refreshes where view = 'c' -- $LINENO" | xargs`" != 'Exception' ]
do do
sleep 0.1 sleep 0.1
done done
# Check exception, create src, expect successful refresh. # Check exception, create src, expect successful refresh.
$CLICKHOUSE_CLIENT -nq " $CLICKHOUSE_CLIENT -nq "
select '<19: exception>', exception ilike '%UNKNOWN_TABLE%' from refreshes; select '<19: exception>', exception ilike '%UNKNOWN_TABLE%' ? '1' : exception from refreshes where view = 'c';
create table src (x Int64) engine Memory as select 1; create table src (x Int64) engine Memory as select 1;
system refresh view c;" system refresh view c;"
while [ "`$CLICKHOUSE_CLIENT -nq "select last_refresh_result from refreshes -- $LINENO" | xargs`" != 'Finished' ] while [ "`$CLICKHOUSE_CLIENT -nq "select last_refresh_result from refreshes -- $LINENO" | xargs`" != 'Finished' ]
@ -224,22 +222,27 @@ done
$CLICKHOUSE_CLIENT -nq " $CLICKHOUSE_CLIENT -nq "
rename table e to f; rename table e to f;
select '<24: rename during refresh>', * from f; select '<24: rename during refresh>', * from f;
select '<25: rename during refresh>', view, status from refreshes; select '<25: rename during refresh>', view, status from refreshes where view = 'f';
alter table f modify refresh after 10 year;" alter table f modify refresh after 10 year;"
sleep 2 # make it likely that at least one row was processed
# Cancel. # Cancel.
$CLICKHOUSE_CLIENT -nq " $CLICKHOUSE_CLIENT -nq "
system cancel view f;" system cancel view f;"
while [ "`$CLICKHOUSE_CLIENT -nq "select last_refresh_result from refreshes -- $LINENO" | xargs`" != 'Cancelled' ] while [ "`$CLICKHOUSE_CLIENT -nq "select last_refresh_result from refreshes where view = 'f' -- $LINENO" | xargs`" != 'Cancelled' ]
do do
sleep 0.1 sleep 0.1
done done
while [ "`$CLICKHOUSE_CLIENT -nq "select status from refreshes where view = 'f' -- $LINENO" | xargs`" = 'Running' ]
do
sleep 0.1
done
# Check that another refresh doesn't immediately start after the cancelled one. # Check that another refresh doesn't immediately start after the cancelled one.
sleep 1
$CLICKHOUSE_CLIENT -nq " $CLICKHOUSE_CLIENT -nq "
select '<27: cancelled>', view, status from refreshes; select '<27: cancelled>', view, status from refreshes where view = 'f';
system refresh view f;" system refresh view f;"
while [ "`$CLICKHOUSE_CLIENT -nq "select status from refreshes -- $LINENO" | xargs`" != 'Running' ] while [ "`$CLICKHOUSE_CLIENT -nq "select status from refreshes where view = 'f' -- $LINENO" | xargs`" != 'Running' ]
do do
sleep 0.1 sleep 0.1
done done


@ -0,0 +1,10 @@
#!/usr/bin/env bash
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
touch $CLICKHOUSE_TEST_UNIQUE_NAME.xml
$CLICKHOUSE_LOCAL -q "select * from file('$CLICKHOUSE_TEST_UNIQUE_NAME.*')" 2>&1 | grep -c "CANNOT_DETECT_FORMAT"
rm $CLICKHOUSE_TEST_UNIQUE_NAME.xml


@ -1,14 +1,4 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# Tags: long
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# reset --log_comment
CLICKHOUSE_LOG_COMMENT=
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --use_variant_as_common_type=1 --allow_experimental_dynamic_type=1"
function test() function test()
{ {
@ -43,20 +33,3 @@ function test()
$CH_CLIENT -q "select d.\`Array(Dynamic)\`.\`Variant(String, UInt64)\`.UInt64, d.\`Array(Dynamic)\`.size0, d.\`Array(Variant(String, UInt64))\`.UInt64 from test format Null" $CH_CLIENT -q "select d.\`Array(Dynamic)\`.\`Variant(String, UInt64)\`.UInt64, d.\`Array(Dynamic)\`.size0, d.\`Array(Variant(String, UInt64))\`.UInt64 from test format Null"
$CH_CLIENT -q "select d.\`Array(Array(Dynamic))\`.size1, d.\`Array(Array(Dynamic))\`.UInt64, d.\`Array(Array(Dynamic))\`.\`Map(String, Tuple(a UInt64))\`.values.a from test format Null" $CH_CLIENT -q "select d.\`Array(Array(Dynamic))\`.size1, d.\`Array(Array(Dynamic))\`.UInt64, d.\`Array(Array(Dynamic))\`.\`Map(String, Tuple(a UInt64))\`.values.a from test format Null"
} }
$CH_CLIENT -q "drop table if exists test;"
echo "Memory"
$CH_CLIENT -q "create table test (id UInt64, d Dynamic) engine=Memory"
test
$CH_CLIENT -q "drop table test;"
echo "MergeTree compact"
$CH_CLIENT -q "create table test (id UInt64, d Dynamic) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000;"
test
$CH_CLIENT -q "drop table test;"
echo "MergeTree wide"
$CH_CLIENT -q "create table test (id UInt64, d Dynamic) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;"
test
$CH_CLIENT -q "drop table test;"


@ -1,57 +0,0 @@
Memory
test
Array(Array(Dynamic))
Array(Variant(String, UInt64))
None
String
UInt64
200000
200000
200000
200000
0
0
200000
200000
100000
100000
200000
0
MergeTree compact
test
Array(Array(Dynamic))
Array(Variant(String, UInt64))
None
String
UInt64
200000
200000
200000
200000
0
0
200000
200000
100000
100000
200000
0
MergeTree wide
test
Array(Array(Dynamic))
Array(Variant(String, UInt64))
None
String
UInt64
200000
200000
200000
200000
0
0
200000
200000
100000
100000
200000
0


@ -0,0 +1,19 @@
Memory
test
Array(Array(Dynamic))
Array(Variant(String, UInt64))
None
String
UInt64
200000
200000
200000
200000
0
0
200000
200000
100000
100000
200000
0


@ -0,0 +1,21 @@
#!/usr/bin/env bash
# Tags: long
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# reset --log_comment
CLICKHOUSE_LOG_COMMENT=
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
# shellcheck source=./03036_dynamic_read_subcolumns.lib
. "$CUR_DIR"/03036_dynamic_read_subcolumns.lib
CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --use_variant_as_common_type=1 --allow_experimental_dynamic_type=1"
$CH_CLIENT -q "drop table if exists test;"
echo "Memory"
$CH_CLIENT -q "create table test (id UInt64, d Dynamic) engine=Memory"
test
$CH_CLIENT -q "drop table test;"


@ -0,0 +1,19 @@
MergeTree compact
test
Array(Array(Dynamic))
Array(Variant(String, UInt64))
None
String
UInt64
200000
200000
200000
200000
0
0
200000
200000
100000
100000
200000
0


@ -0,0 +1,21 @@
#!/usr/bin/env bash
# Tags: long
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# reset --log_comment
CLICKHOUSE_LOG_COMMENT=
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
# shellcheck source=./03036_dynamic_read_subcolumns.lib
. "$CUR_DIR"/03036_dynamic_read_subcolumns.lib
CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --use_variant_as_common_type=1 --allow_experimental_dynamic_type=1"
$CH_CLIENT -q "drop table if exists test;"
echo "MergeTree compact"
$CH_CLIENT -q "create table test (id UInt64, d Dynamic) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000;"
test
$CH_CLIENT -q "drop table test;"


@ -0,0 +1,19 @@
MergeTree wide
test
Array(Array(Dynamic))
Array(Variant(String, UInt64))
None
String
UInt64
200000
200000
200000
200000
0
0
200000
200000
100000
100000
200000
0


@ -0,0 +1,21 @@
#!/usr/bin/env bash
# Tags: long
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# reset --log_comment
CLICKHOUSE_LOG_COMMENT=
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
# shellcheck source=./03036_dynamic_read_subcolumns.lib
. "$CUR_DIR"/03036_dynamic_read_subcolumns.lib
CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --use_variant_as_common_type=1 --allow_experimental_dynamic_type=1"
$CH_CLIENT -q "drop table if exists test;"
echo "MergeTree wide"
$CH_CLIENT -q "create table test (id UInt64, d Dynamic) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;"
test
$CH_CLIENT -q "drop table test;"


@ -0,0 +1,17 @@
Array(Array(Dynamic))
Array(Variant(String, UInt64))
None
String
UInt64
200000
200000
200000
200000
0
0
200000
200000
100000
100000
200000
0


@ -0,0 +1,40 @@
-- Tags: long
set allow_experimental_variant_type = 1;
set use_variant_as_common_type = 1;
set allow_experimental_dynamic_type = 1;
drop table if exists test;
create table test (id UInt64, d Dynamic) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000;
insert into test select number, number from numbers(100000) settings min_insert_block_size_rows=50000;
insert into test select number, 'str_' || toString(number) from numbers(100000, 100000) settings min_insert_block_size_rows=50000;
insert into test select number, arrayMap(x -> multiIf(number % 9 == 0, NULL, number % 9 == 3, 'str_' || toString(number), number), range(number % 10 + 1)) from numbers(200000, 100000) settings min_insert_block_size_rows=50000;
insert into test select number, NULL from numbers(300000, 100000) settings min_insert_block_size_rows=50000;
insert into test select number, multiIf(number % 4 == 3, 'str_' || toString(number), number % 4 == 2, NULL, number % 4 == 1, number, arrayMap(x -> multiIf(number % 9 == 0, NULL, number % 9 == 3, 'str_' || toString(number), number), range(number % 10 + 1))) from numbers(400000, 400000) settings min_insert_block_size_rows=50000;
insert into test select number, [range((number % 10 + 1)::UInt64)]::Array(Array(Dynamic)) from numbers(100000, 100000) settings min_insert_block_size_rows=50000;
select distinct dynamicType(d) as type from test order by type;
select count() from test where dynamicType(d) == 'UInt64';
select count() from test where d.UInt64 is not NULL;
select count() from test where dynamicType(d) == 'String';
select count() from test where d.String is not NULL;
select count() from test where dynamicType(d) == 'Date';
select count() from test where d.Date is not NULL;
select count() from test where dynamicType(d) == 'Array(Variant(String, UInt64))';
select count() from test where not empty(d.`Array(Variant(String, UInt64))`);
select count() from test where dynamicType(d) == 'Array(Array(Dynamic))';
select count() from test where not empty(d.`Array(Array(Dynamic))`);
select count() from test where d is NULL;
select count() from test where not empty(d.`Tuple(a Array(Dynamic))`.a.String);
select d, d.UInt64, d.String, d.`Array(Variant(String, UInt64))` from test format Null;
select d.UInt64, d.String, d.`Array(Variant(String, UInt64))` from test format Null;
select d.Int8, d.Date, d.`Array(String)` from test format Null;
select d, d.UInt64, d.Date, d.`Array(Variant(String, UInt64))`, d.`Array(Variant(String, UInt64))`.size0, d.`Array(Variant(String, UInt64))`.UInt64 from test format Null;
select d.UInt64, d.Date, d.`Array(Variant(String, UInt64))`, d.`Array(Variant(String, UInt64))`.size0, d.`Array(Variant(String, UInt64))`.UInt64, d.`Array(Variant(String, UInt64))`.String from test format Null;
select d, d.`Tuple(a UInt64, b String)`.a, d.`Array(Dynamic)`.`Variant(String, UInt64)`.UInt64, d.`Array(Variant(String, UInt64))`.UInt64 from test format Null;
select d.`Array(Dynamic)`.`Variant(String, UInt64)`.UInt64, d.`Array(Dynamic)`.size0, d.`Array(Variant(String, UInt64))`.UInt64 from test format Null;
select d.`Array(Array(Dynamic))`.size1, d.`Array(Array(Dynamic))`.UInt64, d.`Array(Array(Dynamic))`.`Map(String, Tuple(a UInt64))`.values.a from test format Null;
drop table test;


@ -0,0 +1,17 @@
Array(Array(Dynamic))
Array(Variant(String, UInt64))
None
String
UInt64
200000
200000
200000
200000
0
0
200000
200000
100000
100000
200000
0


@ -0,0 +1,40 @@
-- Tags: long
set allow_experimental_variant_type = 1;
set use_variant_as_common_type = 1;
set allow_experimental_dynamic_type = 1;
drop table if exists test;
create table test (id UInt64, d Dynamic) engine=Memory;
insert into test select number, number from numbers(100000) settings min_insert_block_size_rows=50000;
insert into test select number, 'str_' || toString(number) from numbers(100000, 100000) settings min_insert_block_size_rows=50000;
insert into test select number, arrayMap(x -> multiIf(number % 9 == 0, NULL, number % 9 == 3, 'str_' || toString(number), number), range(number % 10 + 1)) from numbers(200000, 100000) settings min_insert_block_size_rows=50000;
insert into test select number, NULL from numbers(300000, 100000) settings min_insert_block_size_rows=50000;
insert into test select number, multiIf(number % 4 == 3, 'str_' || toString(number), number % 4 == 2, NULL, number % 4 == 1, number, arrayMap(x -> multiIf(number % 9 == 0, NULL, number % 9 == 3, 'str_' || toString(number), number), range(number % 10 + 1))) from numbers(400000, 400000) settings min_insert_block_size_rows=50000;
insert into test select number, [range((number % 10 + 1)::UInt64)]::Array(Array(Dynamic)) from numbers(100000, 100000) settings min_insert_block_size_rows=50000;
select distinct dynamicType(d) as type from test order by type;
select count() from test where dynamicType(d) == 'UInt64';
select count() from test where d.UInt64 is not NULL;
select count() from test where dynamicType(d) == 'String';
select count() from test where d.String is not NULL;
select count() from test where dynamicType(d) == 'Date';
select count() from test where d.Date is not NULL;
select count() from test where dynamicType(d) == 'Array(Variant(String, UInt64))';
select count() from test where not empty(d.`Array(Variant(String, UInt64))`);
select count() from test where dynamicType(d) == 'Array(Array(Dynamic))';
select count() from test where not empty(d.`Array(Array(Dynamic))`);
select count() from test where d is NULL;
select count() from test where not empty(d.`Tuple(a Array(Dynamic))`.a.String);
select d, d.UInt64, d.String, d.`Array(Variant(String, UInt64))` from test format Null;
select d.UInt64, d.String, d.`Array(Variant(String, UInt64))` from test format Null;
select d.Int8, d.Date, d.`Array(String)` from test format Null;
select d, d.UInt64, d.Date, d.`Array(Variant(String, UInt64))`, d.`Array(Variant(String, UInt64))`.size0, d.`Array(Variant(String, UInt64))`.UInt64 from test format Null;
select d.UInt64, d.Date, d.`Array(Variant(String, UInt64))`, d.`Array(Variant(String, UInt64))`.size0, d.`Array(Variant(String, UInt64))`.UInt64, d.`Array(Variant(String, UInt64))`.String from test format Null;
select d, d.`Tuple(a UInt64, b String)`.a, d.`Array(Dynamic)`.`Variant(String, UInt64)`.UInt64, d.`Array(Variant(String, UInt64))`.UInt64 from test format Null;
select d.`Array(Dynamic)`.`Variant(String, UInt64)`.UInt64, d.`Array(Dynamic)`.size0, d.`Array(Variant(String, UInt64))`.UInt64 from test format Null;
select d.`Array(Array(Dynamic))`.size1, d.`Array(Array(Dynamic))`.UInt64, d.`Array(Array(Dynamic))`.`Map(String, Tuple(a UInt64))`.values.a from test format Null;
drop table test;

View File

@ -0,0 +1,17 @@
Array(Array(Dynamic))
Array(Variant(String, UInt64))
None
String
UInt64
200000
200000
200000
200000
0
0
200000
200000
100000
100000
200000
0

View File

@ -0,0 +1,40 @@
-- Tags: long
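-- Same subcolumn checks as above, but for a MergeTree table with settings that force wide parts (min_rows_for_wide_part=1, min_bytes_for_wide_part=1).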
set allow_experimental_variant_type = 1;
set use_variant_as_common_type = 1;
set allow_experimental_dynamic_type = 1;
drop table if exists test;
create table test (id UInt64, d Dynamic) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;
insert into test select number, number from numbers(100000) settings min_insert_block_size_rows=50000;
insert into test select number, 'str_' || toString(number) from numbers(100000, 100000) settings min_insert_block_size_rows=50000;
insert into test select number, arrayMap(x -> multiIf(number % 9 == 0, NULL, number % 9 == 3, 'str_' || toString(number), number), range(number % 10 + 1)) from numbers(200000, 100000) settings min_insert_block_size_rows=50000;
insert into test select number, NULL from numbers(300000, 100000) settings min_insert_block_size_rows=50000;
insert into test select number, multiIf(number % 4 == 3, 'str_' || toString(number), number % 4 == 2, NULL, number % 4 == 1, number, arrayMap(x -> multiIf(number % 9 == 0, NULL, number % 9 == 3, 'str_' || toString(number), number), range(number % 10 + 1))) from numbers(400000, 400000) settings min_insert_block_size_rows=50000;
insert into test select number, [range((number % 10 + 1)::UInt64)]::Array(Array(Dynamic)) from numbers(100000, 100000) settings min_insert_block_size_rows=50000;
select distinct dynamicType(d) as type from test order by type;
select count() from test where dynamicType(d) == 'UInt64';
select count() from test where d.UInt64 is not NULL;
select count() from test where dynamicType(d) == 'String';
select count() from test where d.String is not NULL;
select count() from test where dynamicType(d) == 'Date';
select count() from test where d.Date is not NULL;
select count() from test where dynamicType(d) == 'Array(Variant(String, UInt64))';
select count() from test where not empty(d.`Array(Variant(String, UInt64))`);
select count() from test where dynamicType(d) == 'Array(Array(Dynamic))';
select count() from test where not empty(d.`Array(Array(Dynamic))`);
select count() from test where d is NULL;
select count() from test where not empty(d.`Tuple(a Array(Dynamic))`.a.String);
select d, d.UInt64, d.String, d.`Array(Variant(String, UInt64))` from test format Null;
select d.UInt64, d.String, d.`Array(Variant(String, UInt64))` from test format Null;
select d.Int8, d.Date, d.`Array(String)` from test format Null;
select d, d.UInt64, d.Date, d.`Array(Variant(String, UInt64))`, d.`Array(Variant(String, UInt64))`.size0, d.`Array(Variant(String, UInt64))`.UInt64 from test format Null;
select d.UInt64, d.Date, d.`Array(Variant(String, UInt64))`, d.`Array(Variant(String, UInt64))`.size0, d.`Array(Variant(String, UInt64))`.UInt64, d.`Array(Variant(String, UInt64))`.String from test format Null;
select d, d.`Tuple(a UInt64, b String)`.a, d.`Array(Dynamic)`.`Variant(String, UInt64)`.UInt64, d.`Array(Variant(String, UInt64))`.UInt64 from test format Null;
select d.`Array(Dynamic)`.`Variant(String, UInt64)`.UInt64, d.`Array(Dynamic)`.size0, d.`Array(Variant(String, UInt64))`.UInt64 from test format Null;
select d.`Array(Array(Dynamic))`.size1, d.`Array(Array(Dynamic))`.UInt64, d.`Array(Array(Dynamic))`.`Map(String, Tuple(a UInt64))`.values.a from test format Null;
drop table test;

View File

@ -1,60 +0,0 @@
MergeTree compact
test
50000 DateTime
60000 Date
70000 Array(UInt16)
80000 String
100000 None
100000 UInt64
70000 Array(UInt16)
100000 None
100000 UInt64
190000 String
70000 Array(UInt16)
100000 None
100000 UInt64
190000 String
200000 Map(UInt64, UInt64)
100000 None
100000 UInt64
200000 Map(UInt64, UInt64)
260000 String
10000 Tuple(UInt64, UInt64)
100000 None
100000 UInt64
200000 Map(UInt64, UInt64)
260000 String
100000 None
100000 UInt64
200000 Map(UInt64, UInt64)
270000 String
MergeTree wide
test
50000 DateTime
60000 Date
70000 Array(UInt16)
80000 String
100000 None
100000 UInt64
70000 Array(UInt16)
100000 None
100000 UInt64
190000 String
70000 Array(UInt16)
100000 None
100000 UInt64
190000 String
200000 Map(UInt64, UInt64)
100000 None
100000 UInt64
200000 Map(UInt64, UInt64)
260000 String
10000 Tuple(UInt64, UInt64)
100000 None
100000 UInt64
200000 Map(UInt64, UInt64)
260000 String
100000 None
100000 UInt64
200000 Map(UInt64, UInt64)
270000 String

View File

@ -1,52 +0,0 @@
#!/usr/bin/env bash
# Tags: long
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# reset --log_comment
CLICKHOUSE_LOG_COMMENT=
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1"
function test()
{
echo "test"
$CH_CLIENT -q "system stop merges test"
$CH_CLIENT -q "insert into test select number, number from numbers(100000)"
$CH_CLIENT -q "insert into test select number, 'str_' || toString(number) from numbers(80000)"
$CH_CLIENT -q "insert into test select number, range(number % 10 + 1) from numbers(70000)"
$CH_CLIENT -q "insert into test select number, toDate(number) from numbers(60000)"
$CH_CLIENT -q "insert into test select number, toDateTime(number) from numbers(50000)"
$CH_CLIENT -q "insert into test select number, NULL from numbers(100000)"
$CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)"
$CH_CLIENT -nm -q "system start merges test; optimize table test final;"
$CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)"
$CH_CLIENT -q "system stop merges test"
$CH_CLIENT -q "insert into test select number, map(number, number) from numbers(200000)"
$CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)"
$CH_CLIENT -nm -q "system start merges test; optimize table test final;"
$CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)"
$CH_CLIENT -q "system stop merges test"
$CH_CLIENT -q "insert into test select number, tuple(number, number) from numbers(10000)"
$CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)"
$CH_CLIENT -nm -q "system start merges test; optimize table test final;"
$CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)"
}
$CH_CLIENT -q "drop table if exists test;"
echo "MergeTree compact"
$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000, vertical_merge_algorithm_min_columns_to_activate=10, index_granularity_bytes=10485760, index_granularity=8192, merge_max_block_size=8192, merge_max_block_size_bytes=10485760;"
test
$CH_CLIENT -q "drop table test;"
echo "MergeTree wide"
$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_columns_to_activate=10, index_granularity_bytes=10485760, index_granularity=8192, merge_max_block_size=8192, merge_max_block_size_bytes=10485760;"
test
$CH_CLIENT -q "drop table test;"

View File

@ -0,0 +1,28 @@
50000 DateTime
60000 Date
70000 Array(UInt16)
80000 String
100000 None
100000 UInt64
70000 Array(UInt16)
100000 None
100000 UInt64
190000 String
70000 Array(UInt16)
100000 None
100000 UInt64
190000 String
200000 Map(UInt64, UInt64)
100000 None
100000 UInt64
200000 Map(UInt64, UInt64)
260000 String
10000 Tuple(UInt64, UInt64)
100000 None
100000 UInt64
200000 Map(UInt64, UInt64)
260000 String
100000 None
100000 UInt64
200000 Map(UInt64, UInt64)
270000 String

View File

@ -0,0 +1,33 @@
-- Tags: long
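-- Check how a Dynamic(max_types=3) column behaves across merges of compact MergeTree parts: insert parts with different types while merges are stopped, then OPTIMIZE ... FINAL and compare the per-type counts.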
set allow_experimental_dynamic_type=1;
drop table if exists test;
create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000, vertical_merge_algorithm_min_columns_to_activate=10, index_granularity_bytes=10485760, index_granularity=8192, merge_max_block_size=8192, merge_max_block_size_bytes=10485760;
system stop merges test;
insert into test select number, number from numbers(100000);
insert into test select number, 'str_' || toString(number) from numbers(80000);
insert into test select number, range(number % 10 + 1) from numbers(70000);
insert into test select number, toDate(number) from numbers(60000);
insert into test select number, toDateTime(number) from numbers(50000);
insert into test select number, NULL from numbers(100000);
select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d);
system start merges test;
optimize table test final;
select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d);
system stop merges test;
insert into test select number, map(number, number) from numbers(200000);
select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d);
system start merges test;
optimize table test final;
select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d);
system stop merges test;
insert into test select number, tuple(number, number) from numbers(10000);
select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d);
system start merges test;
optimize table test final;
select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d);
drop table test;

View File

@ -0,0 +1,28 @@
50000 DateTime
60000 Date
70000 Array(UInt16)
80000 String
100000 None
100000 UInt64
70000 Array(UInt16)
100000 None
100000 UInt64
190000 String
70000 Array(UInt16)
100000 None
100000 UInt64
190000 String
200000 Map(UInt64, UInt64)
100000 None
100000 UInt64
200000 Map(UInt64, UInt64)
260000 String
10000 Tuple(UInt64, UInt64)
100000 None
100000 UInt64
200000 Map(UInt64, UInt64)
260000 String
100000 None
100000 UInt64
200000 Map(UInt64, UInt64)
270000 String

View File

@ -0,0 +1,33 @@
-- Tags: long
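-- Same merge checks as above, but with settings that force wide MergeTree parts.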
set allow_experimental_dynamic_type=1;
drop table if exists test;
create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_columns_to_activate=10, index_granularity_bytes=10485760, index_granularity=8192, merge_max_block_size=8192, merge_max_block_size_bytes=10485760;
system stop merges test;
insert into test select number, number from numbers(100000);
insert into test select number, 'str_' || toString(number) from numbers(80000);
insert into test select number, range(number % 10 + 1) from numbers(70000);
insert into test select number, toDate(number) from numbers(60000);
insert into test select number, toDateTime(number) from numbers(50000);
insert into test select number, NULL from numbers(100000);
select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d);
system start merges test;
optimize table test final;
select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d);
system stop merges test;
insert into test select number, map(number, number) from numbers(200000);
select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d);
system start merges test;
optimize table test final;
select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d);
system stop merges test;
insert into test select number, tuple(number, number) from numbers(10000);
select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d);
system start merges test;
optimize table test final;
select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d);
drop table test;

Some files were not shown because too many files have changed in this diff.