mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-10-22 08:20:48 +00:00
Merge branch 'master' of github.com:yandex/ClickHouse
This commit is contained in:
commit
8b55348a45
1
.gitignore
vendored
1
.gitignore
vendored
@ -90,7 +90,6 @@ dbms/src/Core/tests/field
|
|||||||
dbms/src/Core/tests/rvo_test
|
dbms/src/Core/tests/rvo_test
|
||||||
dbms/src/Core/tests/string_pool
|
dbms/src/Core/tests/string_pool
|
||||||
dbms/src/DataStreams/tests/aggregating_stream
|
dbms/src/DataStreams/tests/aggregating_stream
|
||||||
dbms/src/DataStreams/tests/block_row_transforms
|
|
||||||
dbms/src/DataStreams/tests/block_tab_separated_streams
|
dbms/src/DataStreams/tests/block_tab_separated_streams
|
||||||
dbms/src/DataStreams/tests/collapsing_sorted_stream
|
dbms/src/DataStreams/tests/collapsing_sorted_stream
|
||||||
dbms/src/DataStreams/tests/expression_stream
|
dbms/src/DataStreams/tests/expression_stream
|
||||||
|
@ -21,6 +21,7 @@
|
|||||||
#include <Common/StringUtils/StringUtils.h>
|
#include <Common/StringUtils/StringUtils.h>
|
||||||
|
|
||||||
#include <common/phdr_cache.h>
|
#include <common/phdr_cache.h>
|
||||||
|
#include <ext/scope_guard.h>
|
||||||
|
|
||||||
|
|
||||||
/// Universal executable for various clickhouse applications
|
/// Universal executable for various clickhouse applications
|
||||||
@ -130,8 +131,19 @@ bool isClickhouseApp(const std::string & app_suffix, std::vector<char *> & argv)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// This allows to implement assert to forbid initialization of a class in static constructors.
|
||||||
|
/// Usage:
|
||||||
|
///
|
||||||
|
/// extern bool inside_main;
|
||||||
|
/// class C { C() { assert(inside_main); } };
|
||||||
|
bool inside_main = false;
|
||||||
|
|
||||||
|
|
||||||
int main(int argc_, char ** argv_)
|
int main(int argc_, char ** argv_)
|
||||||
{
|
{
|
||||||
|
inside_main = true;
|
||||||
|
SCOPE_EXIT({ inside_main = false; });
|
||||||
|
|
||||||
/// Reset new handler to default (that throws std::bad_alloc)
|
/// Reset new handler to default (that throws std::bad_alloc)
|
||||||
/// It is needed because LLVM library clobbers it.
|
/// It is needed because LLVM library clobbers it.
|
||||||
std::set_new_handler(nullptr);
|
std::set_new_handler(nullptr);
|
||||||
|
@ -447,6 +447,8 @@ namespace ErrorCodes
|
|||||||
extern const int QUERY_IS_NOT_SUPPORTED_IN_LIVE_VIEW = 470;
|
extern const int QUERY_IS_NOT_SUPPORTED_IN_LIVE_VIEW = 470;
|
||||||
extern const int SETTINGS_ARE_NOT_SUPPORTED = 471;
|
extern const int SETTINGS_ARE_NOT_SUPPORTED = 471;
|
||||||
extern const int READONLY_SETTING = 472;
|
extern const int READONLY_SETTING = 472;
|
||||||
|
extern const int DEADLOCK_AVOIDED = 473;
|
||||||
|
extern const int INVALID_TEMPLATE_FORMAT = 474;
|
||||||
|
|
||||||
extern const int KEEPER_EXCEPTION = 999;
|
extern const int KEEPER_EXCEPTION = 999;
|
||||||
extern const int POCO_EXCEPTION = 1000;
|
extern const int POCO_EXCEPTION = 1000;
|
||||||
|
@ -4,6 +4,8 @@
|
|||||||
#include <Common/CurrentMetrics.h>
|
#include <Common/CurrentMetrics.h>
|
||||||
#include <Common/ProfileEvents.h>
|
#include <Common/ProfileEvents.h>
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
|
||||||
|
|
||||||
namespace ProfileEvents
|
namespace ProfileEvents
|
||||||
{
|
{
|
||||||
@ -29,6 +31,7 @@ namespace DB
|
|||||||
namespace ErrorCodes
|
namespace ErrorCodes
|
||||||
{
|
{
|
||||||
extern const int LOGICAL_ERROR;
|
extern const int LOGICAL_ERROR;
|
||||||
|
extern const int DEADLOCK_AVOIDED;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -37,7 +40,6 @@ class RWLockImpl::LockHolderImpl
|
|||||||
RWLock parent;
|
RWLock parent;
|
||||||
GroupsContainer::iterator it_group;
|
GroupsContainer::iterator it_group;
|
||||||
ClientsContainer::iterator it_client;
|
ClientsContainer::iterator it_client;
|
||||||
ThreadToHolder::key_type thread_id;
|
|
||||||
QueryIdToHolder::key_type query_id;
|
QueryIdToHolder::key_type query_id;
|
||||||
CurrentMetrics::Increment active_client_increment;
|
CurrentMetrics::Increment active_client_increment;
|
||||||
|
|
||||||
@ -53,6 +55,44 @@ public:
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
namespace
|
||||||
|
{
|
||||||
|
/// Global information about all read locks that query has. It is needed to avoid some type of deadlocks.
|
||||||
|
|
||||||
|
class QueryLockInfo
|
||||||
|
{
|
||||||
|
private:
|
||||||
|
std::mutex mutex;
|
||||||
|
std::map<std::string, size_t> queries;
|
||||||
|
|
||||||
|
public:
|
||||||
|
void add(const String & query_id)
|
||||||
|
{
|
||||||
|
std::lock_guard lock(mutex);
|
||||||
|
++queries[query_id];
|
||||||
|
}
|
||||||
|
|
||||||
|
void remove(const String & query_id)
|
||||||
|
{
|
||||||
|
std::lock_guard lock(mutex);
|
||||||
|
auto it = queries.find(query_id);
|
||||||
|
assert(it != queries.end());
|
||||||
|
if (--it->second == 0)
|
||||||
|
queries.erase(it);
|
||||||
|
}
|
||||||
|
|
||||||
|
void check(const String & query_id)
|
||||||
|
{
|
||||||
|
std::lock_guard lock(mutex);
|
||||||
|
if (queries.count(query_id))
|
||||||
|
throw Exception("Possible deadlock avoided. Client should retry.", ErrorCodes::DEADLOCK_AVOIDED);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
QueryLockInfo all_read_locks;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
RWLockImpl::LockHolder RWLockImpl::getLock(RWLockImpl::Type type, const String & query_id)
|
RWLockImpl::LockHolder RWLockImpl::getLock(RWLockImpl::Type type, const String & query_id)
|
||||||
{
|
{
|
||||||
Stopwatch watch(CLOCK_MONOTONIC_COARSE);
|
Stopwatch watch(CLOCK_MONOTONIC_COARSE);
|
||||||
@ -69,34 +109,48 @@ RWLockImpl::LockHolder RWLockImpl::getLock(RWLockImpl::Type type, const String &
|
|||||||
GroupsContainer::iterator it_group;
|
GroupsContainer::iterator it_group;
|
||||||
ClientsContainer::iterator it_client;
|
ClientsContainer::iterator it_client;
|
||||||
|
|
||||||
|
/// This object is placed above unique_lock, because it may lock in destructor.
|
||||||
|
LockHolder res;
|
||||||
|
|
||||||
std::unique_lock lock(mutex);
|
std::unique_lock lock(mutex);
|
||||||
|
|
||||||
/// Check if the same query is acquiring previously acquired lock
|
/// Check if the same query is acquiring previously acquired lock
|
||||||
LockHolder existing_holder_ptr;
|
|
||||||
|
|
||||||
auto this_thread_id = std::this_thread::get_id();
|
|
||||||
auto it_thread = thread_to_holder.find(this_thread_id);
|
|
||||||
|
|
||||||
auto it_query = query_id_to_holder.end();
|
|
||||||
if (query_id != RWLockImpl::NO_QUERY)
|
if (query_id != RWLockImpl::NO_QUERY)
|
||||||
it_query = query_id_to_holder.find(query_id);
|
{
|
||||||
|
auto it_query = query_id_to_holder.find(query_id);
|
||||||
|
if (it_query != query_id_to_holder.end())
|
||||||
|
res = it_query->second.lock();
|
||||||
|
}
|
||||||
|
|
||||||
if (it_thread != thread_to_holder.end())
|
if (res)
|
||||||
existing_holder_ptr = it_thread->second.lock();
|
|
||||||
else if (it_query != query_id_to_holder.end())
|
|
||||||
existing_holder_ptr = it_query->second.lock();
|
|
||||||
|
|
||||||
if (existing_holder_ptr)
|
|
||||||
{
|
{
|
||||||
/// XXX: it means we can't upgrade lock from read to write - with proper waiting!
|
/// XXX: it means we can't upgrade lock from read to write - with proper waiting!
|
||||||
if (type != Read || existing_holder_ptr->it_group->type != Read)
|
if (type != Read || res->it_group->type != Read)
|
||||||
throw Exception("Attempt to acquire exclusive lock recursively", ErrorCodes::LOGICAL_ERROR);
|
throw Exception("Attempt to acquire exclusive lock recursively", ErrorCodes::LOGICAL_ERROR);
|
||||||
|
else
|
||||||
return existing_holder_ptr;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** If the query already has any active read lock and tries to acquire another read lock
|
||||||
|
* but it is not in front of the queue and has to wait, deadlock is possible:
|
||||||
|
*
|
||||||
|
* Example (four queries, two RWLocks - 'a' and 'b'):
|
||||||
|
*
|
||||||
|
* --> time -->
|
||||||
|
*
|
||||||
|
* q1: ra rb
|
||||||
|
* q2: wa
|
||||||
|
* q3: rb ra
|
||||||
|
* q4: wb
|
||||||
|
*
|
||||||
|
* We will throw an exception instead.
|
||||||
|
*/
|
||||||
|
|
||||||
if (type == Type::Write || queue.empty() || queue.back().type == Type::Write)
|
if (type == Type::Write || queue.empty() || queue.back().type == Type::Write)
|
||||||
{
|
{
|
||||||
|
if (type == Type::Read && !queue.empty() && queue.back().type == Type::Write && query_id != RWLockImpl::NO_QUERY)
|
||||||
|
all_read_locks.check(query_id);
|
||||||
|
|
||||||
/// Create new group of clients
|
/// Create new group of clients
|
||||||
it_group = queue.emplace(queue.end(), type);
|
it_group = queue.emplace(queue.end(), type);
|
||||||
}
|
}
|
||||||
@ -104,6 +158,9 @@ RWLockImpl::LockHolder RWLockImpl::getLock(RWLockImpl::Type type, const String &
|
|||||||
{
|
{
|
||||||
/// Will append myself to last group
|
/// Will append myself to last group
|
||||||
it_group = std::prev(queue.end());
|
it_group = std::prev(queue.end());
|
||||||
|
|
||||||
|
if (it_group != queue.begin() && query_id != RWLockImpl::NO_QUERY)
|
||||||
|
all_read_locks.check(query_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Append myself to the end of chosen group
|
/// Append myself to the end of chosen group
|
||||||
@ -120,17 +177,19 @@ RWLockImpl::LockHolder RWLockImpl::getLock(RWLockImpl::Type type, const String &
|
|||||||
throw;
|
throw;
|
||||||
}
|
}
|
||||||
|
|
||||||
LockHolder res(new LockHolderImpl(shared_from_this(), it_group, it_client));
|
res.reset(new LockHolderImpl(shared_from_this(), it_group, it_client));
|
||||||
|
|
||||||
/// Wait a notification until we will be the only in the group.
|
/// Wait a notification until we will be the only in the group.
|
||||||
it_group->cv.wait(lock, [&] () { return it_group == queue.begin(); });
|
it_group->cv.wait(lock, [&] () { return it_group == queue.begin(); });
|
||||||
|
|
||||||
/// Insert myself (weak_ptr to the holder) to threads set to implement recursive lock
|
/// Insert myself (weak_ptr to the holder) to queries set to implement recursive lock
|
||||||
thread_to_holder.emplace(this_thread_id, res);
|
|
||||||
res->thread_id = this_thread_id;
|
|
||||||
|
|
||||||
if (query_id != RWLockImpl::NO_QUERY)
|
if (query_id != RWLockImpl::NO_QUERY)
|
||||||
|
{
|
||||||
query_id_to_holder.emplace(query_id, res);
|
query_id_to_holder.emplace(query_id, res);
|
||||||
|
|
||||||
|
if (type == Type::Read)
|
||||||
|
all_read_locks.add(query_id);
|
||||||
|
}
|
||||||
res->query_id = query_id;
|
res->query_id = query_id;
|
||||||
|
|
||||||
finalize_metrics();
|
finalize_metrics();
|
||||||
@ -140,12 +199,14 @@ RWLockImpl::LockHolder RWLockImpl::getLock(RWLockImpl::Type type, const String &
|
|||||||
|
|
||||||
RWLockImpl::LockHolderImpl::~LockHolderImpl()
|
RWLockImpl::LockHolderImpl::~LockHolderImpl()
|
||||||
{
|
{
|
||||||
std::unique_lock lock(parent->mutex);
|
std::lock_guard lock(parent->mutex);
|
||||||
|
|
||||||
/// Remove weak_ptrs to the holder, since there are no owners of the current lock
|
/// Remove weak_ptrs to the holder, since there are no owners of the current lock
|
||||||
parent->thread_to_holder.erase(thread_id);
|
|
||||||
parent->query_id_to_holder.erase(query_id);
|
parent->query_id_to_holder.erase(query_id);
|
||||||
|
|
||||||
|
if (*it_client == RWLockImpl::Read && query_id != RWLockImpl::NO_QUERY)
|
||||||
|
all_read_locks.remove(query_id);
|
||||||
|
|
||||||
/// Removes myself from client list of our group
|
/// Removes myself from client list of our group
|
||||||
it_group->clients.erase(it_client);
|
it_group->clients.erase(it_client);
|
||||||
|
|
||||||
@ -166,6 +227,7 @@ RWLockImpl::LockHolderImpl::LockHolderImpl(RWLock && parent_, RWLockImpl::Groups
|
|||||||
: parent{std::move(parent_)}, it_group{it_group_}, it_client{it_client_},
|
: parent{std::move(parent_)}, it_group{it_group_}, it_client{it_client_},
|
||||||
active_client_increment{(*it_client == RWLockImpl::Read) ? CurrentMetrics::RWLockActiveReaders
|
active_client_increment{(*it_client == RWLockImpl::Read) ? CurrentMetrics::RWLockActiveReaders
|
||||||
: CurrentMetrics::RWLockActiveWriters}
|
: CurrentMetrics::RWLockActiveWriters}
|
||||||
{}
|
{
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -6,7 +6,6 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
#include <mutex>
|
#include <mutex>
|
||||||
#include <condition_variable>
|
#include <condition_variable>
|
||||||
#include <thread>
|
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
@ -19,7 +18,7 @@ using RWLock = std::shared_ptr<RWLockImpl>;
|
|||||||
|
|
||||||
|
|
||||||
/// Implements shared lock with FIFO service
|
/// Implements shared lock with FIFO service
|
||||||
/// Can be acquired recursively (several calls for the same query or the same OS thread) in Read mode
|
/// Can be acquired recursively (several calls for the same query) in Read mode
|
||||||
///
|
///
|
||||||
/// NOTE: it is important to allow acquiring the same lock in Read mode without waiting if it is already
|
/// NOTE: it is important to allow acquiring the same lock in Read mode without waiting if it is already
|
||||||
/// acquired by another thread of the same query. Otherwise the following deadlock is possible:
|
/// acquired by another thread of the same query. Otherwise the following deadlock is possible:
|
||||||
@ -55,7 +54,6 @@ private:
|
|||||||
struct Group;
|
struct Group;
|
||||||
using GroupsContainer = std::list<Group>;
|
using GroupsContainer = std::list<Group>;
|
||||||
using ClientsContainer = std::list<Type>;
|
using ClientsContainer = std::list<Type>;
|
||||||
using ThreadToHolder = std::map<std::thread::id, std::weak_ptr<LockHolderImpl>>;
|
|
||||||
using QueryIdToHolder = std::map<String, std::weak_ptr<LockHolderImpl>>;
|
using QueryIdToHolder = std::map<String, std::weak_ptr<LockHolderImpl>>;
|
||||||
|
|
||||||
/// Group of clients that should be executed concurrently
|
/// Group of clients that should be executed concurrently
|
||||||
@ -73,7 +71,6 @@ private:
|
|||||||
|
|
||||||
mutable std::mutex mutex;
|
mutable std::mutex mutex;
|
||||||
GroupsContainer queue;
|
GroupsContainer queue;
|
||||||
ThreadToHolder thread_to_holder;
|
|
||||||
QueryIdToHolder query_id_to_holder;
|
QueryIdToHolder query_id_to_holder;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -13,6 +13,14 @@
|
|||||||
|
|
||||||
using namespace DB;
|
using namespace DB;
|
||||||
|
|
||||||
|
namespace DB
|
||||||
|
{
|
||||||
|
namespace ErrorCodes
|
||||||
|
{
|
||||||
|
extern const int DEADLOCK_AVOIDED;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
TEST(Common, RWLock_1)
|
TEST(Common, RWLock_1)
|
||||||
{
|
{
|
||||||
@ -94,7 +102,7 @@ TEST(Common, RWLock_Recursive)
|
|||||||
{
|
{
|
||||||
for (int i = 0; i < 2 * cycles; ++i)
|
for (int i = 0; i < 2 * cycles; ++i)
|
||||||
{
|
{
|
||||||
auto lock = fifo_lock->getLock(RWLockImpl::Write, RWLockImpl::NO_QUERY);
|
auto lock = fifo_lock->getLock(RWLockImpl::Write, "q1");
|
||||||
|
|
||||||
auto sleep_for = std::chrono::duration<int, std::micro>(std::uniform_int_distribution<>(1, 100)(gen));
|
auto sleep_for = std::chrono::duration<int, std::micro>(std::uniform_int_distribution<>(1, 100)(gen));
|
||||||
std::this_thread::sleep_for(sleep_for);
|
std::this_thread::sleep_for(sleep_for);
|
||||||
@ -105,17 +113,17 @@ TEST(Common, RWLock_Recursive)
|
|||||||
{
|
{
|
||||||
for (int i = 0; i < cycles; ++i)
|
for (int i = 0; i < cycles; ++i)
|
||||||
{
|
{
|
||||||
auto lock1 = fifo_lock->getLock(RWLockImpl::Read, RWLockImpl::NO_QUERY);
|
auto lock1 = fifo_lock->getLock(RWLockImpl::Read, "q2");
|
||||||
|
|
||||||
auto sleep_for = std::chrono::duration<int, std::micro>(std::uniform_int_distribution<>(1, 100)(gen));
|
auto sleep_for = std::chrono::duration<int, std::micro>(std::uniform_int_distribution<>(1, 100)(gen));
|
||||||
std::this_thread::sleep_for(sleep_for);
|
std::this_thread::sleep_for(sleep_for);
|
||||||
|
|
||||||
auto lock2 = fifo_lock->getLock(RWLockImpl::Read, RWLockImpl::NO_QUERY);
|
auto lock2 = fifo_lock->getLock(RWLockImpl::Read, "q2");
|
||||||
|
|
||||||
EXPECT_ANY_THROW({fifo_lock->getLock(RWLockImpl::Write, RWLockImpl::NO_QUERY);});
|
EXPECT_ANY_THROW({fifo_lock->getLock(RWLockImpl::Write, "q2");});
|
||||||
}
|
}
|
||||||
|
|
||||||
fifo_lock->getLock(RWLockImpl::Write, RWLockImpl::NO_QUERY);
|
fifo_lock->getLock(RWLockImpl::Write, "q2");
|
||||||
});
|
});
|
||||||
|
|
||||||
t1.join();
|
t1.join();
|
||||||
@ -123,6 +131,74 @@ TEST(Common, RWLock_Recursive)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
TEST(Common, RWLock_Deadlock)
|
||||||
|
{
|
||||||
|
static auto lock1 = RWLockImpl::create();
|
||||||
|
static auto lock2 = RWLockImpl::create();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* q1: r1 r2
|
||||||
|
* q2: w1
|
||||||
|
* q3: r2 r1
|
||||||
|
* q4: w2
|
||||||
|
*/
|
||||||
|
|
||||||
|
std::thread t1([&] ()
|
||||||
|
{
|
||||||
|
auto holder1 = lock1->getLock(RWLockImpl::Read, "q1");
|
||||||
|
usleep(100000);
|
||||||
|
usleep(100000);
|
||||||
|
usleep(100000);
|
||||||
|
try
|
||||||
|
{
|
||||||
|
auto holder2 = lock2->getLock(RWLockImpl::Read, "q1");
|
||||||
|
}
|
||||||
|
catch (const Exception & e)
|
||||||
|
{
|
||||||
|
if (e.code() != ErrorCodes::DEADLOCK_AVOIDED)
|
||||||
|
throw;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
std::thread t2([&] ()
|
||||||
|
{
|
||||||
|
usleep(100000);
|
||||||
|
auto holder1 = lock1->getLock(RWLockImpl::Write, "q2");
|
||||||
|
});
|
||||||
|
|
||||||
|
std::thread t3([&] ()
|
||||||
|
{
|
||||||
|
usleep(100000);
|
||||||
|
usleep(100000);
|
||||||
|
auto holder2 = lock2->getLock(RWLockImpl::Read, "q3");
|
||||||
|
usleep(100000);
|
||||||
|
usleep(100000);
|
||||||
|
try
|
||||||
|
{
|
||||||
|
auto holder1 = lock1->getLock(RWLockImpl::Read, "q3");
|
||||||
|
}
|
||||||
|
catch (const Exception & e)
|
||||||
|
{
|
||||||
|
if (e.code() != ErrorCodes::DEADLOCK_AVOIDED)
|
||||||
|
throw;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
std::thread t4([&] ()
|
||||||
|
{
|
||||||
|
usleep(100000);
|
||||||
|
usleep(100000);
|
||||||
|
usleep(100000);
|
||||||
|
auto holder2 = lock2->getLock(RWLockImpl::Write, "q4");
|
||||||
|
});
|
||||||
|
|
||||||
|
t1.join();
|
||||||
|
t2.join();
|
||||||
|
t3.join();
|
||||||
|
t4.join();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
TEST(Common, RWLock_PerfTest_Readers)
|
TEST(Common, RWLock_PerfTest_Readers)
|
||||||
{
|
{
|
||||||
constexpr int cycles = 100000; // 100k
|
constexpr int cycles = 100000; // 100k
|
||||||
|
@ -216,6 +216,8 @@ struct Settings : public SettingsCollection<Settings>
|
|||||||
M(SettingMilliseconds, stream_flush_interval_ms, 7500, "Timeout for flushing data from streaming storages.") \
|
M(SettingMilliseconds, stream_flush_interval_ms, 7500, "Timeout for flushing data from streaming storages.") \
|
||||||
M(SettingMilliseconds, stream_poll_timeout_ms, 500, "Timeout for polling data from/to streaming storages.") \
|
M(SettingMilliseconds, stream_poll_timeout_ms, 500, "Timeout for polling data from/to streaming storages.") \
|
||||||
M(SettingString, format_schema, "", "Schema identifier (used by schema-based formats)") \
|
M(SettingString, format_schema, "", "Schema identifier (used by schema-based formats)") \
|
||||||
|
M(SettingString, format_schema_rows, "", "Row format string for Template format") \
|
||||||
|
M(SettingString, format_schema_rows_between_delimiter, "\n", "Delimiter between rows for Template format") \
|
||||||
M(SettingBool, insert_allow_materialized_columns, 0, "If setting is enabled, Allow materialized columns in INSERT.") \
|
M(SettingBool, insert_allow_materialized_columns, 0, "If setting is enabled, Allow materialized columns in INSERT.") \
|
||||||
M(SettingSeconds, http_connection_timeout, DEFAULT_HTTP_READ_BUFFER_CONNECTION_TIMEOUT, "HTTP connection timeout.") \
|
M(SettingSeconds, http_connection_timeout, DEFAULT_HTTP_READ_BUFFER_CONNECTION_TIMEOUT, "HTTP connection timeout.") \
|
||||||
M(SettingSeconds, http_send_timeout, DEFAULT_HTTP_READ_BUFFER_TIMEOUT, "HTTP send timeout") \
|
M(SettingSeconds, http_send_timeout, DEFAULT_HTTP_READ_BUFFER_TIMEOUT, "HTTP send timeout") \
|
||||||
|
@ -1,178 +0,0 @@
|
|||||||
#include <Common/Exception.h>
|
|
||||||
#include <IO/WriteHelpers.h>
|
|
||||||
#include <Formats/BlockInputStreamFromRowInputStream.h>
|
|
||||||
#include <common/logger_useful.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
namespace ErrorCodes
|
|
||||||
{
|
|
||||||
extern const int CANNOT_PARSE_INPUT_ASSERTION_FAILED;
|
|
||||||
extern const int CANNOT_PARSE_QUOTED_STRING;
|
|
||||||
extern const int CANNOT_PARSE_DATE;
|
|
||||||
extern const int CANNOT_PARSE_DATETIME;
|
|
||||||
extern const int CANNOT_READ_ARRAY_FROM_TEXT;
|
|
||||||
extern const int CANNOT_PARSE_NUMBER;
|
|
||||||
extern const int CANNOT_PARSE_UUID;
|
|
||||||
extern const int TOO_LARGE_STRING_SIZE;
|
|
||||||
extern const int CANNOT_READ_ALL_DATA;
|
|
||||||
extern const int INCORRECT_DATA;
|
|
||||||
extern const int INCORRECT_NUMBER_OF_COLUMNS;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
BlockInputStreamFromRowInputStream::BlockInputStreamFromRowInputStream(
|
|
||||||
const RowInputStreamPtr & row_input_,
|
|
||||||
const Block & sample_,
|
|
||||||
UInt64 max_block_size_,
|
|
||||||
UInt64 rows_portion_size_,
|
|
||||||
FormatFactory::ReadCallback callback,
|
|
||||||
const FormatSettings & settings)
|
|
||||||
: row_input(row_input_)
|
|
||||||
, sample(sample_)
|
|
||||||
, max_block_size(max_block_size_)
|
|
||||||
, rows_portion_size(rows_portion_size_)
|
|
||||||
, read_virtual_columns_callback(callback)
|
|
||||||
, allow_errors_num(settings.input_allow_errors_num)
|
|
||||||
, allow_errors_ratio(settings.input_allow_errors_ratio)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static bool isParseError(int code)
|
|
||||||
{
|
|
||||||
return code == ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED
|
|
||||||
|| code == ErrorCodes::CANNOT_PARSE_QUOTED_STRING
|
|
||||||
|| code == ErrorCodes::CANNOT_PARSE_DATE
|
|
||||||
|| code == ErrorCodes::CANNOT_PARSE_DATETIME
|
|
||||||
|| code == ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT
|
|
||||||
|| code == ErrorCodes::CANNOT_PARSE_NUMBER
|
|
||||||
|| code == ErrorCodes::CANNOT_PARSE_UUID
|
|
||||||
|| code == ErrorCodes::TOO_LARGE_STRING_SIZE
|
|
||||||
|| code == ErrorCodes::CANNOT_READ_ALL_DATA
|
|
||||||
|| code == ErrorCodes::INCORRECT_DATA;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
Block BlockInputStreamFromRowInputStream::readImpl()
|
|
||||||
{
|
|
||||||
size_t num_columns = sample.columns();
|
|
||||||
MutableColumns columns = sample.cloneEmptyColumns();
|
|
||||||
block_missing_values.clear();
|
|
||||||
|
|
||||||
try
|
|
||||||
{
|
|
||||||
for (size_t rows = 0, batch = 0; rows < max_block_size; ++rows, ++batch)
|
|
||||||
{
|
|
||||||
if (rows_portion_size && batch == rows_portion_size)
|
|
||||||
{
|
|
||||||
batch = 0;
|
|
||||||
if (!checkTimeLimit() || isCancelled())
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
try
|
|
||||||
{
|
|
||||||
++total_rows;
|
|
||||||
RowReadExtension info_;
|
|
||||||
if (!row_input->read(columns, info_))
|
|
||||||
break;
|
|
||||||
if (read_virtual_columns_callback)
|
|
||||||
read_virtual_columns_callback();
|
|
||||||
|
|
||||||
for (size_t column_idx = 0; column_idx < info_.read_columns.size(); ++column_idx)
|
|
||||||
{
|
|
||||||
if (!info_.read_columns[column_idx])
|
|
||||||
{
|
|
||||||
size_t column_size = columns[column_idx]->size();
|
|
||||||
if (column_size == 0)
|
|
||||||
throw Exception("Unexpected empty column", ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS);
|
|
||||||
block_missing_values.setBit(column_idx, column_size - 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
catch (Exception & e)
|
|
||||||
{
|
|
||||||
/// Logic for possible skipping of errors.
|
|
||||||
|
|
||||||
if (!isParseError(e.code()))
|
|
||||||
throw;
|
|
||||||
|
|
||||||
if (allow_errors_num == 0 && allow_errors_ratio == 0)
|
|
||||||
throw;
|
|
||||||
|
|
||||||
++num_errors;
|
|
||||||
Float32 current_error_ratio = static_cast<Float32>(num_errors) / total_rows;
|
|
||||||
|
|
||||||
if (num_errors > allow_errors_num
|
|
||||||
&& current_error_ratio > allow_errors_ratio)
|
|
||||||
{
|
|
||||||
e.addMessage("(Already have " + toString(num_errors) + " errors"
|
|
||||||
" out of " + toString(total_rows) + " rows"
|
|
||||||
", which is " + toString(current_error_ratio) + " of all rows)");
|
|
||||||
throw;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!row_input->allowSyncAfterError())
|
|
||||||
{
|
|
||||||
e.addMessage("(Input format doesn't allow to skip errors)");
|
|
||||||
throw;
|
|
||||||
}
|
|
||||||
|
|
||||||
row_input->syncAfterError();
|
|
||||||
|
|
||||||
/// Truncate all columns in block to minimal size (remove values, that was appended to only part of columns).
|
|
||||||
|
|
||||||
size_t min_size = std::numeric_limits<size_t>::max();
|
|
||||||
for (size_t column_idx = 0; column_idx < num_columns; ++column_idx)
|
|
||||||
min_size = std::min(min_size, columns[column_idx]->size());
|
|
||||||
|
|
||||||
for (size_t column_idx = 0; column_idx < num_columns; ++column_idx)
|
|
||||||
{
|
|
||||||
auto & column = columns[column_idx];
|
|
||||||
if (column->size() > min_size)
|
|
||||||
column->popBack(column->size() - min_size);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
catch (Exception & e)
|
|
||||||
{
|
|
||||||
if (!isParseError(e.code()))
|
|
||||||
throw;
|
|
||||||
|
|
||||||
String verbose_diagnostic;
|
|
||||||
try
|
|
||||||
{
|
|
||||||
verbose_diagnostic = row_input->getDiagnosticInfo();
|
|
||||||
}
|
|
||||||
catch (...)
|
|
||||||
{
|
|
||||||
/// Error while trying to obtain verbose diagnostic. Ok to ignore.
|
|
||||||
}
|
|
||||||
|
|
||||||
e.addMessage("(at row " + toString(total_rows) + ")\n" + verbose_diagnostic);
|
|
||||||
throw;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (columns.empty() || columns[0]->empty())
|
|
||||||
return {};
|
|
||||||
|
|
||||||
return sample.cloneWithColumns(std::move(columns));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void BlockInputStreamFromRowInputStream::readSuffix()
|
|
||||||
{
|
|
||||||
if (allow_errors_num > 0 || allow_errors_ratio > 0)
|
|
||||||
{
|
|
||||||
Logger * log = &Logger::get("BlockInputStreamFromRowInputStream");
|
|
||||||
LOG_TRACE(log, "Skipped " << num_errors << " rows with errors while reading the input stream");
|
|
||||||
}
|
|
||||||
|
|
||||||
row_input->readSuffix();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,62 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <Core/Defines.h>
|
|
||||||
#include <DataStreams/IBlockInputStream.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
|
||||||
#include <Formats/FormatSettings.h>
|
|
||||||
#include <Formats/IRowInputStream.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
/** Makes block-oriented stream on top of row-oriented stream.
|
|
||||||
* It is used to read data from text formats.
|
|
||||||
*
|
|
||||||
* Also controls over parsing errors and prints diagnostic information about them.
|
|
||||||
*/
|
|
||||||
class BlockInputStreamFromRowInputStream : public IBlockInputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
/// |sample| is a block with zero rows, that structure describes how to interpret values
|
|
||||||
/// |rows_portion_size| is a number of rows to read before break and check limits
|
|
||||||
BlockInputStreamFromRowInputStream(
|
|
||||||
const RowInputStreamPtr & row_input_,
|
|
||||||
const Block & sample_,
|
|
||||||
UInt64 max_block_size_,
|
|
||||||
UInt64 rows_portion_size_,
|
|
||||||
FormatFactory::ReadCallback callback,
|
|
||||||
const FormatSettings & settings);
|
|
||||||
|
|
||||||
void readPrefix() override { row_input->readPrefix(); }
|
|
||||||
void readSuffix() override;
|
|
||||||
|
|
||||||
String getName() const override { return "BlockInputStreamFromRowInputStream"; }
|
|
||||||
|
|
||||||
RowInputStreamPtr & getRowInput() { return row_input; }
|
|
||||||
|
|
||||||
Block getHeader() const override { return sample; }
|
|
||||||
|
|
||||||
const BlockMissingValues & getMissingValues() const override { return block_missing_values; }
|
|
||||||
|
|
||||||
protected:
|
|
||||||
Block readImpl() override;
|
|
||||||
|
|
||||||
private:
|
|
||||||
RowInputStreamPtr row_input;
|
|
||||||
Block sample;
|
|
||||||
UInt64 max_block_size;
|
|
||||||
UInt64 rows_portion_size;
|
|
||||||
|
|
||||||
/// Callback used to setup virtual columns after reading each row.
|
|
||||||
FormatFactory::ReadCallback read_virtual_columns_callback;
|
|
||||||
|
|
||||||
BlockMissingValues block_missing_values;
|
|
||||||
|
|
||||||
UInt64 allow_errors_num;
|
|
||||||
Float32 allow_errors_ratio;
|
|
||||||
|
|
||||||
size_t total_rows = 0;
|
|
||||||
size_t num_errors = 0;
|
|
||||||
};
|
|
||||||
}
|
|
@ -1,573 +0,0 @@
|
|||||||
#include <Core/Defines.h>
|
|
||||||
|
|
||||||
#include <IO/ConcatReadBuffer.h>
|
|
||||||
#include <IO/ReadHelpers.h>
|
|
||||||
#include <IO/Operators.h>
|
|
||||||
|
|
||||||
#include <Formats/verbosePrintString.h>
|
|
||||||
#include <Formats/CSVRowInputStream.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
|
||||||
#include <Formats/BlockInputStreamFromRowInputStream.h>
|
|
||||||
|
|
||||||
#include <DataTypes/DataTypeNullable.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
namespace ErrorCodes
|
|
||||||
{
|
|
||||||
extern const int INCORRECT_DATA;
|
|
||||||
extern const int LOGICAL_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static inline void skipEndOfLine(ReadBuffer & istr)
|
|
||||||
{
|
|
||||||
/// \n (Unix) or \r\n (DOS/Windows) or \n\r (Mac OS Classic)
|
|
||||||
|
|
||||||
if (*istr.position() == '\n')
|
|
||||||
{
|
|
||||||
++istr.position();
|
|
||||||
if (!istr.eof() && *istr.position() == '\r')
|
|
||||||
++istr.position();
|
|
||||||
}
|
|
||||||
else if (*istr.position() == '\r')
|
|
||||||
{
|
|
||||||
++istr.position();
|
|
||||||
if (!istr.eof() && *istr.position() == '\n')
|
|
||||||
++istr.position();
|
|
||||||
else
|
|
||||||
throw Exception("Cannot parse CSV format: found \\r (CR) not followed by \\n (LF)."
|
|
||||||
" Line must end by \\n (LF) or \\r\\n (CR LF) or \\n\\r.", ErrorCodes::INCORRECT_DATA);
|
|
||||||
}
|
|
||||||
else if (!istr.eof())
|
|
||||||
throw Exception("Expected end of line", ErrorCodes::INCORRECT_DATA);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static inline void skipDelimiter(ReadBuffer & istr, const char delimiter, bool is_last_column)
|
|
||||||
{
|
|
||||||
if (is_last_column)
|
|
||||||
{
|
|
||||||
if (istr.eof())
|
|
||||||
return;
|
|
||||||
|
|
||||||
/// we support the extra delimiter at the end of the line
|
|
||||||
if (*istr.position() == delimiter)
|
|
||||||
{
|
|
||||||
++istr.position();
|
|
||||||
if (istr.eof())
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
skipEndOfLine(istr);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
assertChar(delimiter, istr);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/// Skip `whitespace` symbols allowed in CSV.
|
|
||||||
static inline void skipWhitespacesAndTabs(ReadBuffer & buf)
|
|
||||||
{
|
|
||||||
while (!buf.eof()
|
|
||||||
&& (*buf.position() == ' '
|
|
||||||
|| *buf.position() == '\t'))
|
|
||||||
++buf.position();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static void skipRow(ReadBuffer & istr, const FormatSettings::CSV & settings, size_t num_columns)
|
|
||||||
{
|
|
||||||
String tmp;
|
|
||||||
for (size_t i = 0; i < num_columns; ++i)
|
|
||||||
{
|
|
||||||
skipWhitespacesAndTabs(istr);
|
|
||||||
readCSVString(tmp, istr, settings);
|
|
||||||
skipWhitespacesAndTabs(istr);
|
|
||||||
|
|
||||||
skipDelimiter(istr, settings.delimiter, i + 1 == num_columns);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/// Build the name->index mapping for the table columns and, when
/// input_format_null_as_default is enabled, prepare one reusable nullable
/// "twin" column per eligible non-nullable column.
CSVRowInputStream::CSVRowInputStream(ReadBuffer & istr_, const Block & header_, bool with_names_, const FormatSettings & format_settings_)
    : istr(istr_), header(header_), with_names(with_names_), format_settings(format_settings_)
{
    const size_t num_columns = header.columns();

    data_types.resize(num_columns);
    column_indexes_by_names.reserve(num_columns);
    column_idx_to_nullable_column_idx.resize(num_columns);

    for (size_t column_idx = 0; column_idx < num_columns; ++column_idx)
    {
        const auto & column = header.getByPosition(column_idx);

        data_types[column_idx] = column.type;
        column_indexes_by_names.emplace(column.name, column_idx);

        /// If input_format_null_as_default=1 we need a ColumnNullable of type DataTypeNullable(nested_type)
        /// to parse the value as nullable before inserting it into the corresponding column of not-nullable type.
        /// Constructing a temporary column for each row is slow, so we prepare it here, once.
        const bool needs_nullable_twin = format_settings.csv.null_as_default
            && !column.type->isNullable()
            && column.type->canBeInsideNullable();

        if (needs_nullable_twin)
        {
            column_idx_to_nullable_column_idx[column_idx] = nullable_columns.size();
            nullable_types.emplace_back(std::make_shared<DataTypeNullable>(column.type));
            nullable_columns.emplace_back(nullable_types.back()->createColumn());
        }
    }
}
|
|
||||||
|
|
||||||
/// Map an input file column to a table column, based on its name.
|
|
||||||
void CSVRowInputStream::addInputColumn(const String & column_name)
|
|
||||||
{
|
|
||||||
const auto column_it = column_indexes_by_names.find(column_name);
|
|
||||||
if (column_it == column_indexes_by_names.end())
|
|
||||||
{
|
|
||||||
if (format_settings.skip_unknown_fields)
|
|
||||||
{
|
|
||||||
column_indexes_for_input_fields.push_back(std::nullopt);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
throw Exception(
|
|
||||||
"Unknown field found in CSV header: '" + column_name + "' " +
|
|
||||||
"at position " + std::to_string(column_indexes_for_input_fields.size()) +
|
|
||||||
"\nSet the 'input_format_skip_unknown_fields' parameter explicitly to ignore and proceed",
|
|
||||||
ErrorCodes::INCORRECT_DATA
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
const auto column_index = column_it->second;
|
|
||||||
|
|
||||||
if (read_columns[column_index])
|
|
||||||
throw Exception("Duplicate field found while parsing CSV header: " + column_name, ErrorCodes::INCORRECT_DATA);
|
|
||||||
|
|
||||||
read_columns[column_index] = true;
|
|
||||||
column_indexes_for_input_fields.emplace_back(column_index);
|
|
||||||
}
|
|
||||||
|
|
||||||
void CSVRowInputStream::readPrefix()
|
|
||||||
{
|
|
||||||
/// In this format, we assume, that if first string field contain BOM as value, it will be written in quotes,
|
|
||||||
/// so BOM at beginning of stream cannot be confused with BOM in first string value, and it is safe to skip it.
|
|
||||||
skipBOMIfExists(istr);
|
|
||||||
|
|
||||||
if (with_names)
|
|
||||||
{
|
|
||||||
/// This CSV file has a header row with column names. Depending on the
|
|
||||||
/// settings, use it or skip it.
|
|
||||||
if (format_settings.with_names_use_header)
|
|
||||||
{
|
|
||||||
/// Look at the file header to see which columns we have there.
|
|
||||||
/// The missing columns are filled with defaults.
|
|
||||||
read_columns.assign(header.columns(), false);
|
|
||||||
do
|
|
||||||
{
|
|
||||||
String column_name;
|
|
||||||
skipWhitespacesAndTabs(istr);
|
|
||||||
readCSVString(column_name, istr, format_settings.csv);
|
|
||||||
skipWhitespacesAndTabs(istr);
|
|
||||||
|
|
||||||
addInputColumn(column_name);
|
|
||||||
}
|
|
||||||
while (checkChar(format_settings.csv.delimiter, istr));
|
|
||||||
|
|
||||||
skipDelimiter(istr, format_settings.csv.delimiter, true);
|
|
||||||
|
|
||||||
for (size_t column = 0; column < read_columns.size(); column++)
|
|
||||||
{
|
|
||||||
if (!read_columns[column])
|
|
||||||
{
|
|
||||||
have_always_default_columns = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
skipRow(istr, format_settings.csv, header.columns());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The default: map each column of the file to the column of the table with
|
|
||||||
/// the same index.
|
|
||||||
read_columns.assign(header.columns(), true);
|
|
||||||
column_indexes_for_input_fields.resize(header.columns());
|
|
||||||
|
|
||||||
for (size_t i = 0; i < column_indexes_for_input_fields.size(); ++i)
|
|
||||||
{
|
|
||||||
column_indexes_for_input_fields[i] = i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/** If you change this function, don't forget to change its counterpart
|
|
||||||
* with extended error reporting: parseRowAndPrintDiagnosticInfo().
|
|
||||||
*/
|
|
||||||
bool CSVRowInputStream::read(MutableColumns & columns, RowReadExtension & ext)
|
|
||||||
{
|
|
||||||
if (istr.eof())
|
|
||||||
return false;
|
|
||||||
|
|
||||||
updateDiagnosticInfo();
|
|
||||||
|
|
||||||
/// Track whether we have to fill any columns in this row with default
|
|
||||||
/// values. If not, we return an empty column mask to the caller, so that
|
|
||||||
/// it doesn't have to check it.
|
|
||||||
bool have_default_columns = have_always_default_columns;
|
|
||||||
|
|
||||||
const auto delimiter = format_settings.csv.delimiter;
|
|
||||||
for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column)
|
|
||||||
{
|
|
||||||
const auto & table_column = column_indexes_for_input_fields[file_column];
|
|
||||||
const bool is_last_file_column = file_column + 1 == column_indexes_for_input_fields.size();
|
|
||||||
|
|
||||||
if (table_column)
|
|
||||||
{
|
|
||||||
skipWhitespacesAndTabs(istr);
|
|
||||||
read_columns[*table_column] = readField(*columns[*table_column], data_types[*table_column],
|
|
||||||
is_last_file_column, *table_column);
|
|
||||||
if (!read_columns[*table_column])
|
|
||||||
have_default_columns = true;
|
|
||||||
skipWhitespacesAndTabs(istr);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/// We never read this column from the file, just skip it.
|
|
||||||
String tmp;
|
|
||||||
readCSVString(tmp, istr, format_settings.csv);
|
|
||||||
}
|
|
||||||
|
|
||||||
skipDelimiter(istr, delimiter, is_last_file_column);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (have_default_columns)
|
|
||||||
{
|
|
||||||
for (size_t i = 0; i < read_columns.size(); i++)
|
|
||||||
{
|
|
||||||
if (!read_columns[i])
|
|
||||||
{
|
|
||||||
/// The column value for this row is going to be overwritten
|
|
||||||
/// with default by the caller, but the general assumption is
|
|
||||||
/// that the column size increases for each row, so we have
|
|
||||||
/// to insert something. Since we do not care about the exact
|
|
||||||
/// value, we do not have to use the default value specified by
|
|
||||||
/// the data type, and can just use IColumn::insertDefault().
|
|
||||||
columns[i]->insertDefault();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ext.read_columns = read_columns;
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/// Produce a human-readable report about the last parsing failure by rewinding
/// the read buffer to the previous (or current) row and re-parsing it with
/// verbose, per-column diagnostics. Returns an empty string when no useful
/// information can be extracted.
String CSVRowInputStream::getDiagnosticInfo()
{
    if (istr.eof())        /// Buffer has gone, cannot extract information about what has been parsed.
        return {};

    WriteBufferFromOwnString out;

    /// Scratch columns: re-parsing must not touch the caller's real columns.
    MutableColumns columns = header.cloneEmptyColumns();

    /// It is possible to display detailed diagnostics only if the last and next to last rows are still in the read buffer.
    size_t bytes_read_at_start_of_buffer = istr.count() - istr.offset();
    if (bytes_read_at_start_of_buffer != bytes_read_at_start_of_buffer_on_prev_row)
    {
        out << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n";
        return out.str();
    }

    /// Column widths used to align the per-column diagnostic output.
    size_t max_length_of_column_name = 0;
    for (size_t i = 0; i < header.columns(); ++i)
        if (header.safeGetByPosition(i).name.size() > max_length_of_column_name)
            max_length_of_column_name = header.safeGetByPosition(i).name.size();

    size_t max_length_of_data_type_name = 0;
    for (size_t i = 0; i < header.columns(); ++i)
        if (header.safeGetByPosition(i).type->getName().size() > max_length_of_data_type_name)
            max_length_of_data_type_name = header.safeGetByPosition(i).type->getName().size();

    /// Roll back the cursor to the beginning of the previous or current row and parse all over again. But now we derive detailed information.

    if (pos_of_prev_row)
    {
        istr.position() = pos_of_prev_row;

        /// Re-parse the previous row first; if it already fails, its report is enough.
        out << "\nRow " << (row_num - 1) << ":\n";
        if (!parseRowAndPrintDiagnosticInfo(columns, out, max_length_of_column_name, max_length_of_data_type_name))
            return out.str();
    }
    else
    {
        if (!pos_of_current_row)
        {
            out << "Could not print diagnostic info because parsing of data hasn't started.\n";
            return out.str();
        }

        istr.position() = pos_of_current_row;
    }

    /// Re-parse (what is presumed to be) the failing row with verbose output.
    out << "\nRow " << row_num << ":\n";
    parseRowAndPrintDiagnosticInfo(columns, out, max_length_of_column_name, max_length_of_data_type_name);
    out << "\n";

    return out.str();
}
|
|
||||||
|
|
||||||
|
|
||||||
/** gcc-7 generates wrong code with optimization level greater than 1.
  * See tests: dbms/src/IO/tests/write_int.cpp
  * and dbms/tests/queries/0_stateless/00898_parsing_bad_diagnostic_message.sh
  * This is compiler bug. The bug does not present in gcc-8 and clang-8.
  * Nevertheless, we don't need high optimization of this function.
  */
/// Verbose twin of read(): parses one row, writing a per-column report to
/// `out` (column name, type, parsed text, and a description of whatever went
/// wrong). Returns false as soon as an error is found, true if the whole row
/// parsed cleanly. Must be kept in sync with read().
bool OPTIMIZE(1) CSVRowInputStream::parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
    WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
{
    const char delimiter = format_settings.csv.delimiter;

    for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column)
    {
        if (file_column == 0 && istr.eof())
        {
            out << "<End of stream>\n";
            return false;
        }

        if (column_indexes_for_input_fields[file_column].has_value())
        {
            const auto & table_column = *column_indexes_for_input_fields[file_column];
            const auto & current_column_type = data_types[table_column];
            const bool is_last_file_column =
                    file_column + 1 == column_indexes_for_input_fields.size();
            /// Same empty-field detection as in readField(): an empty unquoted
            /// value right before a delimiter or line end.
            const bool at_delimiter = !istr.eof() && *istr.position() == delimiter;
            const bool at_last_column_line_end = is_last_file_column
                    && (istr.eof() || *istr.position() == '\n' || *istr.position() == '\r');

            /// Aligned header line for this column: index, name, type.
            out << "Column " << file_column << ", " << std::string((file_column < 10 ? 2 : file_column < 100 ? 1 : 0), ' ')
                << "name: " << header.safeGetByPosition(table_column).name << ", " << std::string(max_length_of_column_name - header.safeGetByPosition(table_column).name.size(), ' ')
                << "type: " << current_column_type->getName() << ", " << std::string(max_length_of_data_type_name - current_column_type->getName().size(), ' ');

            if (format_settings.csv.empty_as_default
                    && (at_delimiter || at_last_column_line_end))
            {
                columns[table_column]->insertDefault();
            }
            else
            {
                /// Remember where the field started and ended so the exact
                /// parsed text can be echoed back to the user.
                BufferBase::Position prev_position = istr.position();
                BufferBase::Position curr_position = istr.position();
                std::exception_ptr exception;

                try
                {
                    skipWhitespacesAndTabs(istr);
                    prev_position = istr.position();
                    readField(*columns[table_column], current_column_type, is_last_file_column, table_column);
                    curr_position = istr.position();
                    skipWhitespacesAndTabs(istr);
                }
                catch (...)
                {
                    /// Defer the error: we still want to print the parsed text first.
                    exception = std::current_exception();
                }

                if (curr_position < prev_position)
                    throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);

                if (isNativeNumber(current_column_type) || isDateOrDateTime(current_column_type))
                {
                    /// An empty string instead of a value.
                    if (curr_position == prev_position)
                    {
                        out << "ERROR: text ";
                        verbosePrintString(prev_position, std::min(prev_position + 10, istr.buffer().end()), out);
                        out << " is not like " << current_column_type->getName() << "\n";
                        return false;
                    }
                }

                out << "parsed text: ";
                verbosePrintString(prev_position, curr_position, out);

                if (exception)
                {
                    /// Add a format hint for the most commonly mis-formatted types.
                    if (current_column_type->getName() == "DateTime")
                        out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
                    else if (current_column_type->getName() == "Date")
                        out << "ERROR: Date must be in YYYY-MM-DD format.\n";
                    else
                        out << "ERROR\n";
                    return false;
                }

                out << "\n";

                /// A fixed-size value must be followed directly by a delimiter or
                /// line break; anything else is trailing garbage.
                if (current_column_type->haveMaximumSizeOfValue()
                        && *curr_position != '\n' && *curr_position != '\r'
                        && *curr_position != delimiter)
                {
                    out << "ERROR: garbage after " << current_column_type->getName() << ": ";
                    verbosePrintString(curr_position, std::min(curr_position + 10, istr.buffer().end()), out);
                    out << "\n";

                    if (current_column_type->getName() == "DateTime")
                        out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
                    else if (current_column_type->getName() == "Date")
                        out << "ERROR: Date must be in YYYY-MM-DD format.\n";

                    return false;
                }
            }
        }
        else
        {
            /// This file column has no table counterpart: consume and discard it.
            static const String skipped_column_str = "<SKIPPED COLUMN>";
            out << "Column " << file_column << ", " << std::string((file_column < 10 ? 2 : file_column < 100 ? 1 : 0), ' ')
                << "name: " << skipped_column_str << ", " << std::string(max_length_of_column_name - skipped_column_str.length(), ' ')
                << "type: " << skipped_column_str << ", " << std::string(max_length_of_data_type_name - skipped_column_str.length(), ' ');

            String tmp;
            readCSVString(tmp, istr, format_settings.csv);
        }

        /// Delimiters
        if (file_column + 1 == column_indexes_for_input_fields.size())
        {
            if (istr.eof())
                return false;

            /// we support the extra delimiter at the end of the line
            if (*istr.position() == delimiter)
            {
                ++istr.position();
                if (istr.eof())
                    break;
            }

            if (!istr.eof() && *istr.position() != '\n' && *istr.position() != '\r')
            {
                out << "ERROR: There is no line feed. ";
                verbosePrintString(istr.position(), istr.position() + 1, out);
                out << " found instead.\n"
                    " It's like your file has more columns than expected.\n"
                    "And if your file have right number of columns, maybe it have unquoted string value with comma.\n";

                return false;
            }

            skipEndOfLine(istr);
        }
        else
        {
            try
            {
                assertChar(delimiter, istr);
            }
            catch (const DB::Exception &)
            {
                /// Distinguish "row too short" (line break instead of delimiter)
                /// from "unexpected character" to give a better hint.
                if (*istr.position() == '\n' || *istr.position() == '\r')
                {
                    out << "ERROR: Line feed found where delimiter (" << delimiter << ") is expected."
                        " It's like your file has less columns than expected.\n"
                        "And if your file have right number of columns, maybe it have unescaped quotes in values.\n";
                }
                else
                {
                    out << "ERROR: There is no delimiter (" << delimiter << "). ";
                    verbosePrintString(istr.position(), istr.position() + 1, out);
                    out << " found instead.\n";
                }
                return false;
            }
        }
    }

    return true;
}
|
|
||||||
|
|
||||||
|
|
||||||
/// Resynchronize after a parsing error: skip the rest of the broken line so
/// the next read() starts at a fresh row (used with input_format_allow_errors_*).
void CSVRowInputStream::syncAfterError()
{
    skipToNextLineOrEOF(istr);
}
|
|
||||||
|
|
||||||
/// Snapshot the stream position at the start of each row — both as an absolute
/// byte count and as a raw buffer pointer — so getDiagnosticInfo() can later
/// rewind to the previous/current row and re-parse it verbosely.
void CSVRowInputStream::updateDiagnosticInfo()
{
    ++row_num;

    /// Shift current-row bookkeeping into prev-row slots before recording anew.
    bytes_read_at_start_of_buffer_on_prev_row = bytes_read_at_start_of_buffer_on_current_row;
    bytes_read_at_start_of_buffer_on_current_row = istr.count() - istr.offset();

    pos_of_prev_row = pos_of_current_row;
    pos_of_current_row = istr.position();
}
|
|
||||||
|
|
||||||
/// Read one field into `column`. Returns true if a value was inserted, false
/// if the caller should fill this column with a default (empty field with
/// empty_as_default, or NULL with null_as_default).
bool CSVRowInputStream::readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column, size_t column_idx)
{
    /// FIX: this condition used `!istr.eof() || *istr.position() == ...`. With `||`,
    /// (a) every non-EOF position counted as "at delimiter", so with empty_as_default
    /// enabled all fields were treated as empty and defaulted; and (b) at EOF the
    /// right operand dereferenced the buffer past its end. The correct `&&` form
    /// matches the identical check in parseRowAndPrintDiagnosticInfo().
    const bool at_delimiter = !istr.eof() && *istr.position() == format_settings.csv.delimiter;
    const bool at_last_column_line_end = is_last_file_column
        && (istr.eof() || *istr.position() == '\n' || *istr.position() == '\r');

    if (format_settings.csv.empty_as_default
        && (at_delimiter || at_last_column_line_end))
    {
        /// Treat empty unquoted column value as default value, if
        /// specified in the settings. Tuple columns might seem
        /// problematic, because they are never quoted but still contain
        /// commas, which might be also used as delimiters. However,
        /// they do not contain empty unquoted fields, so this check
        /// works for tuples as well.
        return false;
    }
    else if (column_idx_to_nullable_column_idx[column_idx])
    {
        /// If value is null but type is not nullable then use default value instead.
        const size_t nullable_idx = *column_idx_to_nullable_column_idx[column_idx];
        auto & tmp_col = *nullable_columns[nullable_idx];
        nullable_types[nullable_idx]->deserializeAsTextCSV(tmp_col, istr, format_settings);
        Field value = tmp_col[0];
        tmp_col.popBack(1); /// do not store copy of values in memory
        if (value.isNull())
            return false;
        column.insert(value);
        return true;
    }
    else
    {
        /// Read the column normally.
        type->deserializeAsTextCSV(column, istr, format_settings);
        return true;
    }
}
|
|
||||||
|
|
||||||
|
|
||||||
/// Register the "CSV" and "CSVWithNames" input formats; the only difference
/// between them is whether the first line is treated as a header.
void registerInputFormatCSV(FormatFactory & factory)
{
    for (bool with_names : {false, true})
    {
        const char * format_name = with_names ? "CSVWithNames" : "CSV";

        factory.registerInputFormat(format_name, [with_names](
            ReadBuffer & buf,
            const Block & sample,
            const Context &,
            UInt64 max_block_size,
            UInt64 rows_portion_size,
            FormatFactory::ReadCallback callback,
            const FormatSettings & settings)
        {
            auto row_stream = std::make_shared<CSVRowInputStream>(buf, sample, with_names, settings);
            return std::make_shared<BlockInputStreamFromRowInputStream>(
                std::move(row_stream), sample, max_block_size, rows_portion_size, callback, settings);
        });
    }
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,83 +0,0 @@
|
|||||||
#pragma once

#include <optional>
#include <unordered_map>

#include <Core/Block.h>
#include <Formats/IRowInputStream.h>
#include <Formats/FormatSettings.h>


namespace DB
{

class ReadBuffer;

/** A stream for inputting data in csv format.
  * Does not conform with https://tools.ietf.org/html/rfc4180 because it skips spaces and tabs between values.
  */
class CSVRowInputStream : public IRowInputStream
{
public:
    /** with_names - in the first line the header with column names
      */
    CSVRowInputStream(ReadBuffer & istr_, const Block & header_, bool with_names_, const FormatSettings & format_settings_);

    bool read(MutableColumns & columns, RowReadExtension & ext) override;
    void readPrefix() override;
    bool allowSyncAfterError() const override { return true; }
    void syncAfterError() override;

    std::string getDiagnosticInfo() override;

private:
    ReadBuffer & istr;
    Block header;
    bool with_names;
    /// Type of each table column, indexed by table column position.
    DataTypes data_types;

    const FormatSettings format_settings;

    /// Table column name -> table column index.
    using IndexesMap = std::unordered_map<String, size_t>;
    IndexesMap column_indexes_by_names;

    /// Maps indexes of columns in the input file to indexes of table columns.
    /// std::nullopt means the file column has no table counterpart and is skipped.
    using OptionalIndexes = std::vector<std::optional<size_t>>;
    OptionalIndexes column_indexes_for_input_fields;

    /// Tracks which columns we have read in a single read() call.
    /// For columns that are never read, it is initialized to false when we
    /// read the file header, and never changed afterwards.
    /// For other columns, it is updated on each read() call.
    std::vector<UInt8> read_columns;

    /// Whether we have any columns that are not read from file at all,
    /// and must be always initialized with defaults.
    bool have_always_default_columns = false;

    void addInputColumn(const String & column_name);

    /// For convenient diagnostics in case of an error.
    size_t row_num = 0;

    /// How many bytes were read, not counting those that are still in the buffer.
    size_t bytes_read_at_start_of_buffer_on_current_row = 0;
    size_t bytes_read_at_start_of_buffer_on_prev_row = 0;

    /// Raw buffer positions of the current/previous row start, used to rewind
    /// and re-parse when building a diagnostic message.
    char * pos_of_current_row = nullptr;
    char * pos_of_prev_row = nullptr;

    /// For setting input_format_null_as_default: pre-built nullable "twin"
    /// types/columns for non-nullable table columns, so NULL values can be
    /// parsed and then replaced with defaults without per-row allocation.
    DataTypes nullable_types;
    MutableColumns nullable_columns;
    OptionalIndexes column_idx_to_nullable_column_idx;

    void updateDiagnosticInfo();

    bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
        WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name);

    bool readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column, size_t column_idx);
};

}
|
|
@ -47,6 +47,9 @@ static FormatSettings getInputFormatSetting(const Settings & settings)
|
|||||||
format_settings.date_time_input_format = settings.date_time_input_format;
|
format_settings.date_time_input_format = settings.date_time_input_format;
|
||||||
format_settings.input_allow_errors_num = settings.input_format_allow_errors_num;
|
format_settings.input_allow_errors_num = settings.input_format_allow_errors_num;
|
||||||
format_settings.input_allow_errors_ratio = settings.input_format_allow_errors_ratio;
|
format_settings.input_allow_errors_ratio = settings.input_format_allow_errors_ratio;
|
||||||
|
format_settings.template_settings.format = settings.format_schema;
|
||||||
|
format_settings.template_settings.row_format = settings.format_schema_rows;
|
||||||
|
format_settings.template_settings.row_between_delimiter = settings.format_schema_rows_between_delimiter;
|
||||||
|
|
||||||
return format_settings;
|
return format_settings;
|
||||||
}
|
}
|
||||||
@ -63,6 +66,9 @@ static FormatSettings getOutputFormatSetting(const Settings & settings)
|
|||||||
format_settings.pretty.max_rows = settings.output_format_pretty_max_rows;
|
format_settings.pretty.max_rows = settings.output_format_pretty_max_rows;
|
||||||
format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width;
|
format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width;
|
||||||
format_settings.pretty.color = settings.output_format_pretty_color;
|
format_settings.pretty.color = settings.output_format_pretty_color;
|
||||||
|
format_settings.template_settings.format = settings.format_schema;
|
||||||
|
format_settings.template_settings.row_format = settings.format_schema_rows;
|
||||||
|
format_settings.template_settings.row_between_delimiter = settings.format_schema_rows_between_delimiter;
|
||||||
format_settings.write_statistics = settings.output_format_write_statistics;
|
format_settings.write_statistics = settings.output_format_write_statistics;
|
||||||
format_settings.parquet.row_group_size = settings.output_format_parquet_row_group_size;
|
format_settings.parquet.row_group_size = settings.output_format_parquet_row_group_size;
|
||||||
|
|
||||||
@ -220,8 +226,6 @@ void FormatFactory::registerOutputFormatProcessor(const String & name, OutputPro
|
|||||||
|
|
||||||
void registerInputFormatNative(FormatFactory & factory);
|
void registerInputFormatNative(FormatFactory & factory);
|
||||||
void registerOutputFormatNative(FormatFactory & factory);
|
void registerOutputFormatNative(FormatFactory & factory);
|
||||||
void registerInputFormatTabSeparated(FormatFactory & factory);
|
|
||||||
void registerInputFormatCSV(FormatFactory & factory);
|
|
||||||
|
|
||||||
void registerInputFormatProcessorNative(FormatFactory & factory);
|
void registerInputFormatProcessorNative(FormatFactory & factory);
|
||||||
void registerOutputFormatProcessorNative(FormatFactory & factory);
|
void registerOutputFormatProcessorNative(FormatFactory & factory);
|
||||||
@ -242,6 +246,8 @@ void registerInputFormatProcessorORC(FormatFactory & factory);
|
|||||||
void registerOutputFormatProcessorParquet(FormatFactory & factory);
|
void registerOutputFormatProcessorParquet(FormatFactory & factory);
|
||||||
void registerInputFormatProcessorProtobuf(FormatFactory & factory);
|
void registerInputFormatProcessorProtobuf(FormatFactory & factory);
|
||||||
void registerOutputFormatProcessorProtobuf(FormatFactory & factory);
|
void registerOutputFormatProcessorProtobuf(FormatFactory & factory);
|
||||||
|
void registerInputFormatProcessorTemplate(FormatFactory & factory);
|
||||||
|
void registerOutputFormatProcessorTemplate(FormatFactory &factory);
|
||||||
|
|
||||||
/// Output only (presentational) formats.
|
/// Output only (presentational) formats.
|
||||||
|
|
||||||
@ -267,8 +273,6 @@ FormatFactory::FormatFactory()
|
|||||||
{
|
{
|
||||||
registerInputFormatNative(*this);
|
registerInputFormatNative(*this);
|
||||||
registerOutputFormatNative(*this);
|
registerOutputFormatNative(*this);
|
||||||
registerInputFormatTabSeparated(*this);
|
|
||||||
registerInputFormatCSV(*this);
|
|
||||||
|
|
||||||
registerOutputFormatProcessorJSONEachRowWithProgress(*this);
|
registerOutputFormatProcessorJSONEachRowWithProgress(*this);
|
||||||
|
|
||||||
@ -292,6 +296,8 @@ FormatFactory::FormatFactory()
|
|||||||
registerInputFormatProcessorORC(*this);
|
registerInputFormatProcessorORC(*this);
|
||||||
registerInputFormatProcessorParquet(*this);
|
registerInputFormatProcessorParquet(*this);
|
||||||
registerOutputFormatProcessorParquet(*this);
|
registerOutputFormatProcessorParquet(*this);
|
||||||
|
registerInputFormatProcessorTemplate(*this);
|
||||||
|
registerOutputFormatProcessorTemplate(*this);
|
||||||
|
|
||||||
|
|
||||||
registerOutputFormatNull(*this);
|
registerOutputFormatNull(*this);
|
||||||
|
@ -50,6 +50,15 @@ struct FormatSettings
|
|||||||
|
|
||||||
Values values;
|
Values values;
|
||||||
|
|
||||||
|
struct Template
|
||||||
|
{
|
||||||
|
String format;
|
||||||
|
String row_format;
|
||||||
|
String row_between_delimiter;
|
||||||
|
};
|
||||||
|
|
||||||
|
Template template_settings;
|
||||||
|
|
||||||
bool skip_unknown_fields = false;
|
bool skip_unknown_fields = false;
|
||||||
bool with_names_use_header = false;
|
bool with_names_use_header = false;
|
||||||
bool write_statistics = true;
|
bool write_statistics = true;
|
||||||
|
217
dbms/src/Formats/ParsedTemplateFormatString.cpp
Normal file
217
dbms/src/Formats/ParsedTemplateFormatString.cpp
Normal file
@ -0,0 +1,217 @@
|
|||||||
|
#include <Formats/ParsedTemplateFormatString.h>
|
||||||
|
#include <Formats/verbosePrintString.h>
|
||||||
|
#include <IO/ReadBufferFromMemory.h>
|
||||||
|
#include <IO/Operators.h>
|
||||||
|
|
||||||
|
namespace DB
|
||||||
|
{
|
||||||
|
|
||||||
|
namespace ErrorCodes
|
||||||
|
{
|
||||||
|
extern const int INVALID_TEMPLATE_FORMAT;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse a template format string, rewrapping any unexpected exception from
/// parse() as INVALID_TEMPLATE_FORMAT so callers get a uniform error; errors
/// that are already INVALID_TEMPLATE_FORMAT are rethrown unchanged.
ParsedTemplateFormatString::ParsedTemplateFormatString(const String & format_string, const ColumnIdxGetter & idx_by_name)
{
    try
    {
        parse(format_string, idx_by_name);
    }
    catch (DB::Exception & e)
    {
        if (e.code() != ErrorCodes::INVALID_TEMPLATE_FORMAT)
            throwInvalidFormat(e.message(), columnsCount());
        else
            throw;
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Parse a template of the form "delim${name:format}delim...", filling
/// `delimiters`, `column_names`, `formats` and `format_idx_to_column_idx`.
/// "$$" inside a delimiter is an escape for a literal '$'. Implemented as a
/// small three-state machine over the raw characters.
void ParsedTemplateFormatString::parse(const String & format_string, const ColumnIdxGetter & idx_by_name)
{
    enum ParserState
    {
        Delimiter,  /// Between substitutions, accumulating literal text.
        Column,     /// Just after "${", reading the (maybe quoted) column name.
        Format      /// Just after ':', reading the per-column format name.
    };

    const char * pos = format_string.c_str();
    const char * end = format_string.c_str() + format_string.size();
    const char * token_begin = pos;
    ParserState state = Delimiter;
    /// There is always one more delimiter than there are columns.
    delimiters.emplace_back();
    for (; *pos; ++pos)
    {
        switch (state)
        {
            case Delimiter:
                if (*pos == '$')
                {
                    /// Flush the literal text accumulated so far.
                    delimiters.back().append(token_begin, pos - token_begin);
                    ++pos;
                    if (*pos == '{')
                    {
                        token_begin = pos + 1;
                        state = Column;
                    }
                    else if (*pos == '$')
                    {
                        /// "$$" escapes a literal '$': keep the second one.
                        token_begin = pos;
                    }
                    else
                        throwInvalidFormat("at pos " + std::to_string(pos - format_string.c_str()) +
                                ": expected '{' or '$' after '$'", columnsCount());
                }
                break;

            case Column:
                column_names.emplace_back();
                pos = readMayBeQuotedColumnNameInto(pos, end - pos, column_names.back());

                if (*pos == ':')
                    state = Format;
                else if (*pos == '}')
                {
                    /// "${name}" without an explicit format.
                    formats.push_back(ColumnFormat::None);
                    delimiters.emplace_back();
                    state = Delimiter;
                }
                else
                    throwInvalidFormat("Expected ':' or '}' after column name: \"" + column_names.back() + "\"", columnsCount());

                token_begin = pos + 1;
                /// Resolve the column name to a table column index (or nullopt).
                format_idx_to_column_idx.emplace_back(idx_by_name(column_names.back()));
                break;

            case Format:
                if (*pos == '}')
                {
                    formats.push_back(stringToFormat(String(token_begin, pos - token_begin)));
                    token_begin = pos + 1;
                    delimiters.emplace_back();
                    state = Delimiter;
                }
        }
    }
    /// Reaching the end while still inside "${...}" means an unclosed substitution.
    if (state != Delimiter)
        throwInvalidFormat("Unbalanced parentheses", columnsCount());
    /// Flush the trailing literal text after the last substitution.
    delimiters.back().append(token_begin, pos - token_begin);
}
|
||||||
|
|
||||||
|
|
||||||
|
ParsedTemplateFormatString::ColumnFormat ParsedTemplateFormatString::stringToFormat(const String & col_format) const
|
||||||
|
{
|
||||||
|
if (col_format.empty())
|
||||||
|
return ColumnFormat::None;
|
||||||
|
else if (col_format == "None")
|
||||||
|
return ColumnFormat::None;
|
||||||
|
else if (col_format == "Escaped")
|
||||||
|
return ColumnFormat::Escaped;
|
||||||
|
else if (col_format == "Quoted")
|
||||||
|
return ColumnFormat::Quoted;
|
||||||
|
else if (col_format == "CSV")
|
||||||
|
return ColumnFormat::Csv;
|
||||||
|
else if (col_format == "JSON")
|
||||||
|
return ColumnFormat::Json;
|
||||||
|
else if (col_format == "XML")
|
||||||
|
return ColumnFormat::Xml;
|
||||||
|
else if (col_format == "Raw")
|
||||||
|
return ColumnFormat::Raw;
|
||||||
|
else
|
||||||
|
throwInvalidFormat("Unknown field format " + col_format, columnsCount());
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t ParsedTemplateFormatString::columnsCount() const
|
||||||
|
{
|
||||||
|
return format_idx_to_column_idx.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
String ParsedTemplateFormatString::formatToString(ParsedTemplateFormatString::ColumnFormat format)
|
||||||
|
{
|
||||||
|
switch (format)
|
||||||
|
{
|
||||||
|
case ColumnFormat::None:
|
||||||
|
return "None";
|
||||||
|
case ColumnFormat::Escaped:
|
||||||
|
return "Escaped";
|
||||||
|
case ColumnFormat::Quoted:
|
||||||
|
return "Quoted";
|
||||||
|
case ColumnFormat::Csv:
|
||||||
|
return "CSV";
|
||||||
|
case ColumnFormat::Json:
|
||||||
|
return "Json";
|
||||||
|
case ColumnFormat::Xml:
|
||||||
|
return "Xml";
|
||||||
|
case ColumnFormat::Raw:
|
||||||
|
return "Raw";
|
||||||
|
}
|
||||||
|
__builtin_unreachable();
|
||||||
|
}
|
||||||
|
|
||||||
|
const char * ParsedTemplateFormatString::readMayBeQuotedColumnNameInto(const char * pos, size_t size, String & s)
|
||||||
|
{
|
||||||
|
s.clear();
|
||||||
|
if (!size)
|
||||||
|
return pos;
|
||||||
|
ReadBufferFromMemory buf{pos, size};
|
||||||
|
if (*pos == '"')
|
||||||
|
readDoubleQuotedStringWithSQLStyle(s, buf);
|
||||||
|
else if (*pos == '`')
|
||||||
|
readBackQuotedStringWithSQLStyle(s, buf);
|
||||||
|
else if (isWordCharASCII(*pos))
|
||||||
|
{
|
||||||
|
size_t name_size = 1;
|
||||||
|
while (name_size < size && isWordCharASCII(*(pos + name_size)))
|
||||||
|
++name_size;
|
||||||
|
s = String{pos, name_size};
|
||||||
|
return pos + name_size;
|
||||||
|
}
|
||||||
|
return pos + buf.count();
|
||||||
|
}
|
||||||
|
|
||||||
|
String ParsedTemplateFormatString::dump() const
|
||||||
|
{
|
||||||
|
WriteBufferFromOwnString res;
|
||||||
|
res << "Delimiter " << 0 << ": ";
|
||||||
|
verbosePrintString(delimiters.front().c_str(), delimiters.front().c_str() + delimiters.front().size(), res);
|
||||||
|
|
||||||
|
size_t num_columns = std::max(formats.size(), format_idx_to_column_idx.size());
|
||||||
|
for (size_t i = 0; i < num_columns; ++i)
|
||||||
|
{
|
||||||
|
res << "\nColumn " << i << ": \"";
|
||||||
|
if (column_names.size() <= i)
|
||||||
|
res << "<ERROR>";
|
||||||
|
else if (column_names[i].empty())
|
||||||
|
res << "<SKIPPED>";
|
||||||
|
else
|
||||||
|
res << column_names[i];
|
||||||
|
|
||||||
|
res << "\" (mapped to table column ";
|
||||||
|
if (format_idx_to_column_idx.size() <= i)
|
||||||
|
res << "<ERROR>";
|
||||||
|
else if (!format_idx_to_column_idx[i])
|
||||||
|
res << "<SKIPPED>";
|
||||||
|
else
|
||||||
|
res << *format_idx_to_column_idx[i];
|
||||||
|
|
||||||
|
res << "), Format " << (i < formats.size() ? formatToString(formats[i]) : "<ERROR>");
|
||||||
|
|
||||||
|
res << "\nDelimiter " << i + 1 << ": ";
|
||||||
|
if (delimiters.size() <= i + 1)
|
||||||
|
res << "<ERROR>";
|
||||||
|
else
|
||||||
|
verbosePrintString(delimiters[i + 1].c_str(), delimiters[i + 1].c_str() + delimiters[i + 1].size(), res);
|
||||||
|
}
|
||||||
|
|
||||||
|
return res.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
void ParsedTemplateFormatString::throwInvalidFormat(const String & message, size_t column) const
|
||||||
|
{
|
||||||
|
throw Exception("Invalid format string for Template: " + message + " (near column " + std::to_string(column) +
|
||||||
|
")" + ". Parsed format string:\n" + dump() + "\n",
|
||||||
|
ErrorCodes::INVALID_TEMPLATE_FORMAT);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
51
dbms/src/Formats/ParsedTemplateFormatString.h
Normal file
51
dbms/src/Formats/ParsedTemplateFormatString.h
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <Core/Types.h>
|
||||||
|
#include <functional>
|
||||||
|
#include <optional>
|
||||||
|
|
||||||
|
namespace DB
|
||||||
|
{
|
||||||
|
|
||||||
|
struct ParsedTemplateFormatString
|
||||||
|
{
|
||||||
|
enum class ColumnFormat
|
||||||
|
{
|
||||||
|
None,
|
||||||
|
Escaped,
|
||||||
|
Quoted,
|
||||||
|
Csv,
|
||||||
|
Json,
|
||||||
|
Xml,
|
||||||
|
Raw
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Format string has syntax: "Delimiter0 ${ColumnName0:Format0} Delimiter1 ${ColumnName1:Format1} Delimiter2"
|
||||||
|
/// The following vectors is filled with corresponding values, delimiters.size() - 1 = formats.size() = format_idx_to_column_idx.size()
|
||||||
|
/// If format_idx_to_column_idx[i] has no value, then TemplateRowInputFormat will skip i-th column.
|
||||||
|
|
||||||
|
std::vector<String> delimiters;
|
||||||
|
std::vector<ColumnFormat> formats;
|
||||||
|
std::vector<std::optional<size_t>> format_idx_to_column_idx;
|
||||||
|
|
||||||
|
/// For diagnostic info
|
||||||
|
Strings column_names;
|
||||||
|
|
||||||
|
typedef std::function<std::optional<size_t>(const String &)> ColumnIdxGetter;
|
||||||
|
|
||||||
|
ParsedTemplateFormatString() = default;
|
||||||
|
ParsedTemplateFormatString(const String & format_string, const ColumnIdxGetter & idx_by_name);
|
||||||
|
|
||||||
|
void parse(const String & format_string, const ColumnIdxGetter & idx_by_name);
|
||||||
|
|
||||||
|
ColumnFormat stringToFormat(const String & format) const;
|
||||||
|
static String formatToString(ColumnFormat format);
|
||||||
|
static const char * readMayBeQuotedColumnNameInto(const char * pos, size_t size, String & s);
|
||||||
|
size_t columnsCount() const;
|
||||||
|
|
||||||
|
String dump() const;
|
||||||
|
[[noreturn]] void throwInvalidFormat(const String & message, size_t column) const;
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -1,504 +0,0 @@
|
|||||||
#include <string>
|
|
||||||
|
|
||||||
#include <Core/Defines.h>
|
|
||||||
|
|
||||||
#include <IO/ReadHelpers.h>
|
|
||||||
#include <IO/WriteBufferFromString.h>
|
|
||||||
#include <IO/Operators.h>
|
|
||||||
|
|
||||||
#include <Formats/TabSeparatedRowInputStream.h>
|
|
||||||
#include <Formats/verbosePrintString.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
|
||||||
#include <Formats/BlockInputStreamFromRowInputStream.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
namespace ErrorCodes
|
|
||||||
{
|
|
||||||
extern const int INCORRECT_DATA;
|
|
||||||
extern const int LOGICAL_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static void skipTSVRow(ReadBuffer & istr, const size_t num_columns)
|
|
||||||
{
|
|
||||||
NullSink null_sink;
|
|
||||||
|
|
||||||
for (size_t i = 0; i < num_columns; ++i)
|
|
||||||
{
|
|
||||||
readEscapedStringInto(null_sink, istr);
|
|
||||||
assertChar(i == num_columns - 1 ? '\n' : '\t', istr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/** Check for a common error case - usage of Windows line feed.
|
|
||||||
*/
|
|
||||||
static void checkForCarriageReturn(ReadBuffer & istr)
|
|
||||||
{
|
|
||||||
if (istr.position()[0] == '\r' || (istr.position() != istr.buffer().begin() && istr.position()[-1] == '\r'))
|
|
||||||
throw Exception("\nYou have carriage return (\\r, 0x0D, ASCII 13) at end of first row."
|
|
||||||
"\nIt's like your input data has DOS/Windows style line separators, that are illegal in TabSeparated format."
|
|
||||||
" You must transform your file to Unix format."
|
|
||||||
"\nBut if you really need carriage return at end of string value of last column, you need to escape it as \\r.",
|
|
||||||
ErrorCodes::INCORRECT_DATA);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
TabSeparatedRowInputStream::TabSeparatedRowInputStream(
|
|
||||||
ReadBuffer & istr_, const Block & header_, bool with_names_, bool with_types_, const FormatSettings & format_settings_)
|
|
||||||
: istr(istr_), header(header_), with_names(with_names_), with_types(with_types_), format_settings(format_settings_)
|
|
||||||
{
|
|
||||||
const auto num_columns = header.columns();
|
|
||||||
|
|
||||||
data_types.resize(num_columns);
|
|
||||||
column_indexes_by_names.reserve(num_columns);
|
|
||||||
|
|
||||||
for (size_t i = 0; i < num_columns; ++i)
|
|
||||||
{
|
|
||||||
const auto & column_info = header.getByPosition(i);
|
|
||||||
|
|
||||||
data_types[i] = column_info.type;
|
|
||||||
column_indexes_by_names.emplace(column_info.name, i);
|
|
||||||
}
|
|
||||||
|
|
||||||
column_indexes_for_input_fields.reserve(num_columns);
|
|
||||||
read_columns.assign(num_columns, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void TabSeparatedRowInputStream::setupAllColumnsByTableSchema()
|
|
||||||
{
|
|
||||||
read_columns.assign(header.columns(), true);
|
|
||||||
column_indexes_for_input_fields.resize(header.columns());
|
|
||||||
|
|
||||||
for (size_t i = 0; i < column_indexes_for_input_fields.size(); ++i)
|
|
||||||
column_indexes_for_input_fields[i] = i;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void TabSeparatedRowInputStream::addInputColumn(const String & column_name)
|
|
||||||
{
|
|
||||||
const auto column_it = column_indexes_by_names.find(column_name);
|
|
||||||
if (column_it == column_indexes_by_names.end())
|
|
||||||
{
|
|
||||||
if (format_settings.skip_unknown_fields)
|
|
||||||
{
|
|
||||||
column_indexes_for_input_fields.push_back(std::nullopt);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
throw Exception(
|
|
||||||
"Unknown field found in TSV header: '" + column_name + "' " +
|
|
||||||
"at position " + std::to_string(column_indexes_for_input_fields.size()) +
|
|
||||||
"\nSet the 'input_format_skip_unknown_fields' parameter explicitly to ignore and proceed",
|
|
||||||
ErrorCodes::INCORRECT_DATA
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
const auto column_index = column_it->second;
|
|
||||||
|
|
||||||
if (read_columns[column_index])
|
|
||||||
throw Exception("Duplicate field found while parsing TSV header: " + column_name, ErrorCodes::INCORRECT_DATA);
|
|
||||||
|
|
||||||
read_columns[column_index] = true;
|
|
||||||
column_indexes_for_input_fields.emplace_back(column_index);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void TabSeparatedRowInputStream::fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension & row_read_extension)
|
|
||||||
{
|
|
||||||
/// It is safe to memorize this on the first run - the format guarantees this does not change
|
|
||||||
if (unlikely(row_num == 1))
|
|
||||||
{
|
|
||||||
columns_to_fill_with_default_values.clear();
|
|
||||||
for (size_t index = 0; index < read_columns.size(); ++index)
|
|
||||||
if (read_columns[index] == 0)
|
|
||||||
columns_to_fill_with_default_values.push_back(index);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (const auto column_index : columns_to_fill_with_default_values)
|
|
||||||
data_types[column_index]->insertDefaultInto(*columns[column_index]);
|
|
||||||
|
|
||||||
row_read_extension.read_columns = read_columns;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void TabSeparatedRowInputStream::readPrefix()
|
|
||||||
{
|
|
||||||
if (with_names || with_types)
|
|
||||||
{
|
|
||||||
/// In this format, we assume that column name or type cannot contain BOM,
|
|
||||||
/// so, if format has header,
|
|
||||||
/// then BOM at beginning of stream cannot be confused with name or type of field, and it is safe to skip it.
|
|
||||||
skipBOMIfExists(istr);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (with_names)
|
|
||||||
{
|
|
||||||
if (format_settings.with_names_use_header)
|
|
||||||
{
|
|
||||||
String column_name;
|
|
||||||
do
|
|
||||||
{
|
|
||||||
readEscapedString(column_name, istr);
|
|
||||||
addInputColumn(column_name);
|
|
||||||
}
|
|
||||||
while (checkChar('\t', istr));
|
|
||||||
|
|
||||||
if (!istr.eof())
|
|
||||||
{
|
|
||||||
checkForCarriageReturn(istr);
|
|
||||||
assertChar('\n', istr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
setupAllColumnsByTableSchema();
|
|
||||||
skipTSVRow(istr, column_indexes_for_input_fields.size());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
setupAllColumnsByTableSchema();
|
|
||||||
|
|
||||||
if (with_types)
|
|
||||||
{
|
|
||||||
skipTSVRow(istr, column_indexes_for_input_fields.size());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
bool TabSeparatedRowInputStream::read(MutableColumns & columns, RowReadExtension & ext)
|
|
||||||
{
|
|
||||||
if (istr.eof())
|
|
||||||
return false;
|
|
||||||
|
|
||||||
updateDiagnosticInfo();
|
|
||||||
|
|
||||||
for (size_t input_position = 0; input_position < column_indexes_for_input_fields.size(); ++input_position)
|
|
||||||
{
|
|
||||||
const auto & column_index = column_indexes_for_input_fields[input_position];
|
|
||||||
if (column_index)
|
|
||||||
{
|
|
||||||
data_types[*column_index]->deserializeAsTextEscaped(*columns[*column_index], istr, format_settings);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
NullSink null_sink;
|
|
||||||
readEscapedStringInto(null_sink, istr);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// skip separators
|
|
||||||
if (input_position + 1 < column_indexes_for_input_fields.size())
|
|
||||||
{
|
|
||||||
assertChar('\t', istr);
|
|
||||||
}
|
|
||||||
else if (!istr.eof())
|
|
||||||
{
|
|
||||||
if (unlikely(row_num == 1))
|
|
||||||
checkForCarriageReturn(istr);
|
|
||||||
|
|
||||||
assertChar('\n', istr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fillUnreadColumnsWithDefaults(columns, ext);
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
String TabSeparatedRowInputStream::getDiagnosticInfo()
|
|
||||||
{
|
|
||||||
if (istr.eof()) /// Buffer has gone, cannot extract information about what has been parsed.
|
|
||||||
return {};
|
|
||||||
|
|
||||||
WriteBufferFromOwnString out;
|
|
||||||
MutableColumns columns = header.cloneEmptyColumns();
|
|
||||||
|
|
||||||
/// It is possible to display detailed diagnostics only if the last and next to last lines are still in the read buffer.
|
|
||||||
size_t bytes_read_at_start_of_buffer = istr.count() - istr.offset();
|
|
||||||
if (bytes_read_at_start_of_buffer != bytes_read_at_start_of_buffer_on_prev_row)
|
|
||||||
{
|
|
||||||
out << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n";
|
|
||||||
return out.str();
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t max_length_of_column_name = 0;
|
|
||||||
for (size_t i = 0; i < header.columns(); ++i)
|
|
||||||
if (header.safeGetByPosition(i).name.size() > max_length_of_column_name)
|
|
||||||
max_length_of_column_name = header.safeGetByPosition(i).name.size();
|
|
||||||
|
|
||||||
size_t max_length_of_data_type_name = 0;
|
|
||||||
for (size_t i = 0; i < header.columns(); ++i)
|
|
||||||
if (header.safeGetByPosition(i).type->getName().size() > max_length_of_data_type_name)
|
|
||||||
max_length_of_data_type_name = header.safeGetByPosition(i).type->getName().size();
|
|
||||||
|
|
||||||
/// Roll back the cursor to the beginning of the previous or current line and parse all over again. But now we derive detailed information.
|
|
||||||
|
|
||||||
if (pos_of_prev_row)
|
|
||||||
{
|
|
||||||
istr.position() = pos_of_prev_row;
|
|
||||||
|
|
||||||
out << "\nRow " << (row_num - 1) << ":\n";
|
|
||||||
if (!parseRowAndPrintDiagnosticInfo(columns, out, max_length_of_column_name, max_length_of_data_type_name))
|
|
||||||
return out.str();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (!pos_of_current_row)
|
|
||||||
{
|
|
||||||
out << "Could not print diagnostic info because parsing of data hasn't started.\n";
|
|
||||||
return out.str();
|
|
||||||
}
|
|
||||||
|
|
||||||
istr.position() = pos_of_current_row;
|
|
||||||
}
|
|
||||||
|
|
||||||
out << "\nRow " << row_num << ":\n";
|
|
||||||
parseRowAndPrintDiagnosticInfo(columns, out, max_length_of_column_name, max_length_of_data_type_name);
|
|
||||||
out << "\n";
|
|
||||||
|
|
||||||
return out.str();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/** gcc-7 generates wrong code with optimization level greater than 1.
|
|
||||||
* See tests: dbms/src/IO/tests/write_int.cpp
|
|
||||||
* and dbms/tests/queries/0_stateless/00898_parsing_bad_diagnostic_message.sh
|
|
||||||
* This is compiler bug. The bug does not present in gcc-8 and clang-8.
|
|
||||||
* Nevertheless, we don't need high optimization of this function.
|
|
||||||
*/
|
|
||||||
bool OPTIMIZE(1) TabSeparatedRowInputStream::parseRowAndPrintDiagnosticInfo(
|
|
||||||
MutableColumns & columns, WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
|
|
||||||
{
|
|
||||||
for (size_t input_position = 0; input_position < column_indexes_for_input_fields.size(); ++input_position)
|
|
||||||
{
|
|
||||||
if (input_position == 0 && istr.eof())
|
|
||||||
{
|
|
||||||
out << "<End of stream>\n";
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (column_indexes_for_input_fields[input_position].has_value())
|
|
||||||
{
|
|
||||||
const auto & column_index = *column_indexes_for_input_fields[input_position];
|
|
||||||
const auto & current_column_type = data_types[column_index];
|
|
||||||
|
|
||||||
out << "Column " << input_position << ", " << std::string((input_position < 10 ? 2 : input_position < 100 ? 1 : 0), ' ')
|
|
||||||
<< "name: " << header.safeGetByPosition(column_index).name << ", " << std::string(max_length_of_column_name - header.safeGetByPosition(column_index).name.size(), ' ')
|
|
||||||
<< "type: " << current_column_type->getName() << ", " << std::string(max_length_of_data_type_name - current_column_type->getName().size(), ' ');
|
|
||||||
|
|
||||||
auto prev_position = istr.position();
|
|
||||||
std::exception_ptr exception;
|
|
||||||
|
|
||||||
try
|
|
||||||
{
|
|
||||||
current_column_type->deserializeAsTextEscaped(*columns[column_index], istr, format_settings);
|
|
||||||
}
|
|
||||||
catch (...)
|
|
||||||
{
|
|
||||||
exception = std::current_exception();
|
|
||||||
}
|
|
||||||
|
|
||||||
auto curr_position = istr.position();
|
|
||||||
|
|
||||||
if (curr_position < prev_position)
|
|
||||||
throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);
|
|
||||||
|
|
||||||
if (isNativeNumber(current_column_type) || isDateOrDateTime(current_column_type))
|
|
||||||
{
|
|
||||||
/// An empty string instead of a value.
|
|
||||||
if (curr_position == prev_position)
|
|
||||||
{
|
|
||||||
out << "ERROR: text ";
|
|
||||||
verbosePrintString(prev_position, std::min(prev_position + 10, istr.buffer().end()), out);
|
|
||||||
out << " is not like " << current_column_type->getName() << "\n";
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
out << "parsed text: ";
|
|
||||||
verbosePrintString(prev_position, curr_position, out);
|
|
||||||
|
|
||||||
if (exception)
|
|
||||||
{
|
|
||||||
if (current_column_type->getName() == "DateTime")
|
|
||||||
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
|
|
||||||
else if (current_column_type->getName() == "Date")
|
|
||||||
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
|
|
||||||
else
|
|
||||||
out << "ERROR\n";
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
out << "\n";
|
|
||||||
|
|
||||||
if (current_column_type->haveMaximumSizeOfValue())
|
|
||||||
{
|
|
||||||
if (*curr_position != '\n' && *curr_position != '\t')
|
|
||||||
{
|
|
||||||
out << "ERROR: garbage after " << current_column_type->getName() << ": ";
|
|
||||||
verbosePrintString(curr_position, std::min(curr_position + 10, istr.buffer().end()), out);
|
|
||||||
out << "\n";
|
|
||||||
|
|
||||||
if (current_column_type->getName() == "DateTime")
|
|
||||||
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
|
|
||||||
else if (current_column_type->getName() == "Date")
|
|
||||||
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
static const String skipped_column_str = "<SKIPPED COLUMN>";
|
|
||||||
out << "Column " << input_position << ", " << std::string((input_position < 10 ? 2 : input_position < 100 ? 1 : 0), ' ')
|
|
||||||
<< "name: " << skipped_column_str << ", " << std::string(max_length_of_column_name - skipped_column_str.length(), ' ')
|
|
||||||
<< "type: " << skipped_column_str << ", " << std::string(max_length_of_data_type_name - skipped_column_str.length(), ' ');
|
|
||||||
|
|
||||||
NullSink null_sink;
|
|
||||||
readEscapedStringInto(null_sink, istr);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Delimiters
|
|
||||||
if (input_position + 1 == column_indexes_for_input_fields.size())
|
|
||||||
{
|
|
||||||
if (!istr.eof())
|
|
||||||
{
|
|
||||||
try
|
|
||||||
{
|
|
||||||
assertChar('\n', istr);
|
|
||||||
}
|
|
||||||
catch (const DB::Exception &)
|
|
||||||
{
|
|
||||||
if (*istr.position() == '\t')
|
|
||||||
{
|
|
||||||
out << "ERROR: Tab found where line feed is expected."
|
|
||||||
" It's like your file has more columns than expected.\n"
|
|
||||||
"And if your file have right number of columns, maybe it have unescaped tab in value.\n";
|
|
||||||
}
|
|
||||||
else if (*istr.position() == '\r')
|
|
||||||
{
|
|
||||||
out << "ERROR: Carriage return found where line feed is expected."
|
|
||||||
" It's like your file has DOS/Windows style line separators, that is illegal in TabSeparated format.\n";
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
out << "ERROR: There is no line feed. ";
|
|
||||||
verbosePrintString(istr.position(), istr.position() + 1, out);
|
|
||||||
out << " found instead.\n";
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
try
|
|
||||||
{
|
|
||||||
assertChar('\t', istr);
|
|
||||||
}
|
|
||||||
catch (const DB::Exception &)
|
|
||||||
{
|
|
||||||
if (*istr.position() == '\n')
|
|
||||||
{
|
|
||||||
out << "ERROR: Line feed found where tab is expected."
|
|
||||||
" It's like your file has less columns than expected.\n"
|
|
||||||
"And if your file have right number of columns, maybe it have unescaped backslash in value before tab, which cause tab has escaped.\n";
|
|
||||||
}
|
|
||||||
else if (*istr.position() == '\r')
|
|
||||||
{
|
|
||||||
out << "ERROR: Carriage return found where tab is expected.\n";
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
out << "ERROR: There is no tab. ";
|
|
||||||
verbosePrintString(istr.position(), istr.position() + 1, out);
|
|
||||||
out << " found instead.\n";
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void TabSeparatedRowInputStream::syncAfterError()
|
|
||||||
{
|
|
||||||
skipToUnescapedNextLineOrEOF(istr);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void TabSeparatedRowInputStream::updateDiagnosticInfo()
|
|
||||||
{
|
|
||||||
++row_num;
|
|
||||||
|
|
||||||
bytes_read_at_start_of_buffer_on_prev_row = bytes_read_at_start_of_buffer_on_current_row;
|
|
||||||
bytes_read_at_start_of_buffer_on_current_row = istr.count() - istr.offset();
|
|
||||||
|
|
||||||
pos_of_prev_row = pos_of_current_row;
|
|
||||||
pos_of_current_row = istr.position();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void registerInputFormatTabSeparated(FormatFactory & factory)
|
|
||||||
{
|
|
||||||
for (auto name : {"TabSeparated", "TSV"})
|
|
||||||
{
|
|
||||||
factory.registerInputFormat(name, [](
|
|
||||||
ReadBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
UInt64 max_block_size,
|
|
||||||
UInt64 rows_portion_size,
|
|
||||||
FormatFactory::ReadCallback callback,
|
|
||||||
const FormatSettings & settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<BlockInputStreamFromRowInputStream>(
|
|
||||||
std::make_shared<TabSeparatedRowInputStream>(buf, sample, false, false, settings),
|
|
||||||
sample, max_block_size, rows_portion_size, callback, settings);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto name : {"TabSeparatedWithNames", "TSVWithNames"})
|
|
||||||
{
|
|
||||||
factory.registerInputFormat(name, [](
|
|
||||||
ReadBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
UInt64 max_block_size,
|
|
||||||
UInt64 rows_portion_size,
|
|
||||||
FormatFactory::ReadCallback callback,
|
|
||||||
const FormatSettings & settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<BlockInputStreamFromRowInputStream>(
|
|
||||||
std::make_shared<TabSeparatedRowInputStream>(buf, sample, true, false, settings),
|
|
||||||
sample, max_block_size, rows_portion_size, callback, settings);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto name : {"TabSeparatedWithNamesAndTypes", "TSVWithNamesAndTypes"})
|
|
||||||
{
|
|
||||||
factory.registerInputFormat(name, [](
|
|
||||||
ReadBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
UInt64 max_block_size,
|
|
||||||
UInt64 rows_portion_size,
|
|
||||||
FormatFactory::ReadCallback callback,
|
|
||||||
const FormatSettings & settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<BlockInputStreamFromRowInputStream>(
|
|
||||||
std::make_shared<TabSeparatedRowInputStream>(buf, sample, true, true, settings),
|
|
||||||
sample, max_block_size, rows_portion_size, callback, settings);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,73 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <optional>
|
|
||||||
#include <unordered_map>
|
|
||||||
|
|
||||||
#include <Core/Block.h>
|
|
||||||
#include <Formats/FormatSettings.h>
|
|
||||||
#include <Formats/IRowInputStream.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
class ReadBuffer;
|
|
||||||
|
|
||||||
|
|
||||||
/** A stream to input data in tsv format.
|
|
||||||
*/
|
|
||||||
class TabSeparatedRowInputStream : public IRowInputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
/** with_names - the first line is the header with the names of the columns
|
|
||||||
* with_types - on the next line header with type names
|
|
||||||
*/
|
|
||||||
TabSeparatedRowInputStream(
|
|
||||||
ReadBuffer & istr_, const Block & header_, bool with_names_, bool with_types_, const FormatSettings & format_settings_);
|
|
||||||
|
|
||||||
bool read(MutableColumns & columns, RowReadExtension & ext) override;
|
|
||||||
void readPrefix() override;
|
|
||||||
bool allowSyncAfterError() const override { return true; }
|
|
||||||
void syncAfterError() override;
|
|
||||||
|
|
||||||
std::string getDiagnosticInfo() override;
|
|
||||||
|
|
||||||
private:
|
|
||||||
ReadBuffer & istr;
|
|
||||||
Block header;
|
|
||||||
bool with_names;
|
|
||||||
bool with_types;
|
|
||||||
const FormatSettings format_settings;
|
|
||||||
DataTypes data_types;
|
|
||||||
|
|
||||||
using IndexesMap = std::unordered_map<String, size_t>;
|
|
||||||
IndexesMap column_indexes_by_names;
|
|
||||||
|
|
||||||
using OptionalIndexes = std::vector<std::optional<size_t>>;
|
|
||||||
OptionalIndexes column_indexes_for_input_fields;
|
|
||||||
|
|
||||||
std::vector<UInt8> read_columns;
|
|
||||||
std::vector<size_t> columns_to_fill_with_default_values;
|
|
||||||
|
|
||||||
void addInputColumn(const String & column_name);
|
|
||||||
void setupAllColumnsByTableSchema();
|
|
||||||
void fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension& ext);
|
|
||||||
|
|
||||||
/// For convenient diagnostics in case of an error.
|
|
||||||
|
|
||||||
size_t row_num = 0;
|
|
||||||
|
|
||||||
/// How many bytes were read, not counting those still in the buffer.
|
|
||||||
size_t bytes_read_at_start_of_buffer_on_current_row = 0;
|
|
||||||
size_t bytes_read_at_start_of_buffer_on_prev_row = 0;
|
|
||||||
|
|
||||||
char * pos_of_current_row = nullptr;
|
|
||||||
char * pos_of_prev_row = nullptr;
|
|
||||||
|
|
||||||
void updateDiagnosticInfo();
|
|
||||||
|
|
||||||
bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
|
|
||||||
WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name);
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
@ -2,6 +2,3 @@ set(SRCS )
|
|||||||
|
|
||||||
add_executable (tab_separated_streams tab_separated_streams.cpp ${SRCS})
|
add_executable (tab_separated_streams tab_separated_streams.cpp ${SRCS})
|
||||||
target_link_libraries (tab_separated_streams PRIVATE dbms)
|
target_link_libraries (tab_separated_streams PRIVATE dbms)
|
||||||
|
|
||||||
add_executable (block_row_transforms block_row_transforms.cpp ${SRCS})
|
|
||||||
target_link_libraries (block_row_transforms PRIVATE dbms)
|
|
||||||
|
@ -1,57 +0,0 @@
|
|||||||
#include <string>
|
|
||||||
|
|
||||||
#include <iostream>
|
|
||||||
#include <fstream>
|
|
||||||
|
|
||||||
#include <Core/Block.h>
|
|
||||||
#include <Core/ColumnWithTypeAndName.h>
|
|
||||||
|
|
||||||
#include <IO/ReadBufferFromFile.h>
|
|
||||||
#include <IO/WriteBufferFromFile.h>
|
|
||||||
|
|
||||||
#include <DataTypes/DataTypesNumber.h>
|
|
||||||
#include <DataTypes/DataTypeString.h>
|
|
||||||
|
|
||||||
#include <Formats/TabSeparatedRowInputStream.h>
|
|
||||||
#include <Formats/BlockInputStreamFromRowInputStream.h>
|
|
||||||
|
|
||||||
#include <DataStreams/copyData.h>
|
|
||||||
#include <Processors/Formats/Impl/TabSeparatedRowOutputFormat.h>
|
|
||||||
#include <Processors/Formats/OutputStreamToOutputFormat.h>
|
|
||||||
|
|
||||||
|
|
||||||
int main(int, char **)
|
|
||||||
try
|
|
||||||
{
|
|
||||||
using namespace DB;
|
|
||||||
|
|
||||||
Block sample;
|
|
||||||
|
|
||||||
ColumnWithTypeAndName col1;
|
|
||||||
col1.name = "col1";
|
|
||||||
col1.type = std::make_shared<DataTypeUInt64>();
|
|
||||||
col1.column = col1.type->createColumn();
|
|
||||||
sample.insert(col1);
|
|
||||||
|
|
||||||
ColumnWithTypeAndName col2;
|
|
||||||
col2.name = "col2";
|
|
||||||
col2.type = std::make_shared<DataTypeString>();
|
|
||||||
col2.column = col2.type->createColumn();
|
|
||||||
sample.insert(col2);
|
|
||||||
|
|
||||||
ReadBufferFromFile in_buf("test_in");
|
|
||||||
WriteBufferFromFile out_buf("test_out");
|
|
||||||
|
|
||||||
FormatSettings format_settings;
|
|
||||||
|
|
||||||
RowInputStreamPtr row_input = std::make_shared<TabSeparatedRowInputStream>(in_buf, sample, false, false, format_settings);
|
|
||||||
BlockInputStreamFromRowInputStream block_input(row_input, sample, DEFAULT_INSERT_BLOCK_SIZE, 0, []{}, format_settings);
|
|
||||||
BlockOutputStreamPtr block_output = std::make_shared<OutputStreamToOutputFormat>(std::make_shared<TabSeparatedRowOutputFormat>(out_buf, sample, false, false, []{}, format_settings));
|
|
||||||
|
|
||||||
copyData(block_input, *block_output);
|
|
||||||
}
|
|
||||||
catch (const DB::Exception & e)
|
|
||||||
{
|
|
||||||
std::cerr << e.what() << ", " << e.displayText() << std::endl;
|
|
||||||
return 1;
|
|
||||||
}
|
|
@ -9,12 +9,12 @@
|
|||||||
#include <DataTypes/DataTypesNumber.h>
|
#include <DataTypes/DataTypesNumber.h>
|
||||||
#include <DataTypes/DataTypeString.h>
|
#include <DataTypes/DataTypeString.h>
|
||||||
|
|
||||||
#include <Formats/TabSeparatedRowInputStream.h>
|
#include <Processors/Formats/Impl/TabSeparatedRowInputFormat.h>
|
||||||
#include <Formats/BlockInputStreamFromRowInputStream.h>
|
|
||||||
|
|
||||||
#include <DataStreams/copyData.h>
|
#include <DataStreams/copyData.h>
|
||||||
#include <Processors/Formats/OutputStreamToOutputFormat.h>
|
#include <Processors/Formats/OutputStreamToOutputFormat.h>
|
||||||
#include <Processors/Formats/Impl/TabSeparatedRowOutputFormat.h>
|
#include <Processors/Formats/Impl/TabSeparatedRowOutputFormat.h>
|
||||||
|
#include <Processors/Formats/InputStreamFromInputFormat.h>
|
||||||
|
|
||||||
|
|
||||||
using namespace DB;
|
using namespace DB;
|
||||||
@ -39,13 +39,15 @@ try
|
|||||||
|
|
||||||
FormatSettings format_settings;
|
FormatSettings format_settings;
|
||||||
|
|
||||||
RowInputStreamPtr row_input = std::make_shared<TabSeparatedRowInputStream>(in_buf, sample, false, false, format_settings);
|
RowInputFormatParams params{DEFAULT_INSERT_BLOCK_SIZE, 0, 0, 0, []{}};
|
||||||
BlockInputStreamFromRowInputStream block_input(row_input, sample, DEFAULT_INSERT_BLOCK_SIZE, 0, []{}, format_settings);
|
|
||||||
|
InputFormatPtr input_format = std::make_shared<TabSeparatedRowInputFormat>(sample, in_buf, params, false, false, format_settings);
|
||||||
|
BlockInputStreamPtr block_input = std::make_shared<InputStreamFromInputFormat>(std::move(input_format));
|
||||||
|
|
||||||
BlockOutputStreamPtr block_output = std::make_shared<OutputStreamToOutputFormat>(
|
BlockOutputStreamPtr block_output = std::make_shared<OutputStreamToOutputFormat>(
|
||||||
std::make_shared<TabSeparatedRowOutputFormat>(out_buf, sample, false, false, [] {}, format_settings));
|
std::make_shared<TabSeparatedRowOutputFormat>(out_buf, sample, false, false, [] {}, format_settings));
|
||||||
|
|
||||||
copyData(block_input, *block_output);
|
copyData(*block_input, *block_output);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
catch (...)
|
catch (...)
|
||||||
|
@ -332,7 +332,7 @@ UInt64 geohashesInBox(const GeohashesInBoxPreparedArgs & args, char * out)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (items == 0 && args.items_count != 0)
|
if (items == 0)
|
||||||
{
|
{
|
||||||
size_t l = geohashEncodeImpl(args.longitude_min, args.latitude_min, args.precision, out);
|
size_t l = geohashEncodeImpl(args.longitude_min, args.latitude_min, args.precision, out);
|
||||||
out += l;
|
out += l;
|
||||||
|
@ -336,6 +336,8 @@ void FunctionArrayEnumerateRankedExtended<Derived>::executeMethodImpl(
|
|||||||
/// Skipping offsets if no data in this array
|
/// Skipping offsets if no data in this array
|
||||||
if (prev_off == off)
|
if (prev_off == off)
|
||||||
{
|
{
|
||||||
|
|
||||||
|
if (depth_to_look > 2)
|
||||||
want_clear = true;
|
want_clear = true;
|
||||||
|
|
||||||
if (depth_to_look >= 2)
|
if (depth_to_look >= 2)
|
||||||
|
316
dbms/src/IO/PeekableReadBuffer.cpp
Normal file
316
dbms/src/IO/PeekableReadBuffer.cpp
Normal file
@ -0,0 +1,316 @@
|
|||||||
|
#include <IO/PeekableReadBuffer.h>
|
||||||
|
|
||||||
|
namespace DB
|
||||||
|
{
|
||||||
|
|
||||||
|
PeekableReadBuffer::PeekableReadBuffer(ReadBuffer & sub_buf_, size_t start_size_ /*= DBMS_DEFAULT_BUFFER_SIZE*/,
|
||||||
|
size_t unread_limit_ /* = default_limit*/)
|
||||||
|
: BufferWithOwnMemory(start_size_), sub_buf(sub_buf_), unread_limit(unread_limit_)
|
||||||
|
{
|
||||||
|
padded &= sub_buf.isPadded();
|
||||||
|
/// Read from sub-buffer
|
||||||
|
Buffer & sub_working = sub_buf.buffer();
|
||||||
|
BufferBase::set(sub_working.begin(), sub_working.size(), sub_buf.offset());
|
||||||
|
|
||||||
|
checkStateCorrect();
|
||||||
|
}
|
||||||
|
|
||||||
|
bool PeekableReadBuffer::peekNext()
|
||||||
|
{
|
||||||
|
checkStateCorrect();
|
||||||
|
|
||||||
|
size_t bytes_read = 0;
|
||||||
|
Position copy_from = pos;
|
||||||
|
size_t bytes_to_copy = sub_buf.available();
|
||||||
|
if (useSubbufferOnly())
|
||||||
|
{
|
||||||
|
/// Don't have to copy all data from sub-buffer if there is no data in own memory (checkpoint and pos are in sub-buffer)
|
||||||
|
if (checkpoint)
|
||||||
|
copy_from = checkpoint;
|
||||||
|
bytes_read = copy_from - sub_buf.buffer().begin();
|
||||||
|
bytes_to_copy = sub_buf.buffer().end() - copy_from; /// sub_buf.available();
|
||||||
|
if (!bytes_to_copy)
|
||||||
|
{
|
||||||
|
bytes += bytes_read;
|
||||||
|
sub_buf.position() = copy_from;
|
||||||
|
|
||||||
|
/// Both checkpoint and pos are at the end of sub-buffer. Just load next part of data.
|
||||||
|
bool res = sub_buf.next();
|
||||||
|
BufferBase::set(sub_buf.buffer().begin(), sub_buf.buffer().size(), sub_buf.offset());
|
||||||
|
if (checkpoint)
|
||||||
|
checkpoint = pos;
|
||||||
|
|
||||||
|
checkStateCorrect();
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// May throw an exception
|
||||||
|
resizeOwnMemoryIfNecessary(bytes_to_copy);
|
||||||
|
|
||||||
|
if (useSubbufferOnly())
|
||||||
|
{
|
||||||
|
bytes += bytes_read;
|
||||||
|
sub_buf.position() = copy_from;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Save unread data from sub-buffer to own memory
|
||||||
|
memcpy(memory.data() + peeked_size, sub_buf.position(), bytes_to_copy);
|
||||||
|
|
||||||
|
/// If useSubbufferOnly() is false, then checkpoint is in own memory and it was updated in resizeOwnMemoryIfNecessary
|
||||||
|
/// Otherwise, checkpoint now at the beginning of own memory
|
||||||
|
if (checkpoint && useSubbufferOnly())
|
||||||
|
{
|
||||||
|
checkpoint = memory.data();
|
||||||
|
checkpoint_in_own_memory = true;
|
||||||
|
}
|
||||||
|
if (currentlyReadFromOwnMemory())
|
||||||
|
{
|
||||||
|
/// Update buffer size
|
||||||
|
BufferBase::set(memory.data(), peeked_size + bytes_to_copy, offset());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/// Switch to reading from own memory
|
||||||
|
size_t pos_offset = peeked_size + this->offset();
|
||||||
|
if (useSubbufferOnly())
|
||||||
|
{
|
||||||
|
if (checkpoint)
|
||||||
|
pos_offset = bytes_to_copy;
|
||||||
|
else
|
||||||
|
pos_offset = 0;
|
||||||
|
}
|
||||||
|
BufferBase::set(memory.data(), peeked_size + bytes_to_copy, pos_offset);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
peeked_size += bytes_to_copy;
|
||||||
|
sub_buf.position() += bytes_to_copy;
|
||||||
|
|
||||||
|
checkStateCorrect();
|
||||||
|
return sub_buf.next();
|
||||||
|
}
|
||||||
|
|
||||||
|
void PeekableReadBuffer::setCheckpoint()
|
||||||
|
{
|
||||||
|
checkStateCorrect();
|
||||||
|
#ifndef NDEBUG
|
||||||
|
if (checkpoint)
|
||||||
|
throw DB::Exception("Does not support recursive checkpoints.", ErrorCodes::LOGICAL_ERROR);
|
||||||
|
#endif
|
||||||
|
checkpoint_in_own_memory = currentlyReadFromOwnMemory();
|
||||||
|
if (!checkpoint_in_own_memory)
|
||||||
|
{
|
||||||
|
/// Don't need to store unread data anymore
|
||||||
|
peeked_size = 0;
|
||||||
|
}
|
||||||
|
checkpoint = pos;
|
||||||
|
checkStateCorrect();
|
||||||
|
}
|
||||||
|
|
||||||
|
void PeekableReadBuffer::dropCheckpoint()
|
||||||
|
{
|
||||||
|
checkStateCorrect();
|
||||||
|
#ifndef NDEBUG
|
||||||
|
if (!checkpoint)
|
||||||
|
throw DB::Exception("There is no checkpoint", ErrorCodes::LOGICAL_ERROR);
|
||||||
|
#endif
|
||||||
|
if (!currentlyReadFromOwnMemory())
|
||||||
|
{
|
||||||
|
/// Don't need to store unread data anymore
|
||||||
|
peeked_size = 0;
|
||||||
|
}
|
||||||
|
checkpoint = nullptr;
|
||||||
|
checkpoint_in_own_memory = false;
|
||||||
|
checkStateCorrect();
|
||||||
|
}
|
||||||
|
|
||||||
|
void PeekableReadBuffer::rollbackToCheckpoint()
|
||||||
|
{
|
||||||
|
checkStateCorrect();
|
||||||
|
if (!checkpoint)
|
||||||
|
throw DB::Exception("There is no checkpoint", ErrorCodes::LOGICAL_ERROR);
|
||||||
|
else if (checkpointInOwnMemory() == currentlyReadFromOwnMemory())
|
||||||
|
pos = checkpoint;
|
||||||
|
else /// Checkpoint is in own memory and pos is not. Switch to reading from own memory
|
||||||
|
BufferBase::set(memory.data(), peeked_size, checkpoint - memory.data());
|
||||||
|
checkStateCorrect();
|
||||||
|
}
|
||||||
|
|
||||||
|
bool PeekableReadBuffer::nextImpl()
|
||||||
|
{
|
||||||
|
/// FIXME wrong bytes count because it can read the same data again after rollbackToCheckpoint()
|
||||||
|
/// However, changing bytes count on every call of next() (even after rollback) allows to determine if some pointers were invalidated.
|
||||||
|
checkStateCorrect();
|
||||||
|
bool res;
|
||||||
|
|
||||||
|
if (!checkpoint)
|
||||||
|
{
|
||||||
|
if (!useSubbufferOnly())
|
||||||
|
{
|
||||||
|
/// All copied data have been read from own memory, continue reading from sub_buf
|
||||||
|
peeked_size = 0;
|
||||||
|
res = sub_buf.hasPendingData() || sub_buf.next();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/// Load next data to sub_buf
|
||||||
|
sub_buf.position() = pos;
|
||||||
|
res = sub_buf.next();
|
||||||
|
}
|
||||||
|
|
||||||
|
Buffer & sub_working = sub_buf.buffer();
|
||||||
|
/// Switch to reading from sub_buf (or just update it if already switched)
|
||||||
|
BufferBase::set(sub_working.begin(), sub_working.size(), 0);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (currentlyReadFromOwnMemory())
|
||||||
|
res = sub_buf.hasPendingData() || sub_buf.next();
|
||||||
|
else
|
||||||
|
res = peekNext();
|
||||||
|
Buffer & sub_working = sub_buf.buffer();
|
||||||
|
BufferBase::set(sub_working.begin(), sub_working.size(), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
checkStateCorrect();
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool PeekableReadBuffer::useSubbufferOnly() const
|
||||||
|
{
|
||||||
|
return !peeked_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
void PeekableReadBuffer::checkStateCorrect() const
|
||||||
|
{
|
||||||
|
#ifndef NDEBUG
|
||||||
|
if (checkpoint)
|
||||||
|
{
|
||||||
|
if (checkpointInOwnMemory())
|
||||||
|
{
|
||||||
|
if (!peeked_size)
|
||||||
|
throw DB::Exception("Checkpoint in empty own buffer", ErrorCodes::LOGICAL_ERROR);
|
||||||
|
if (currentlyReadFromOwnMemory() && pos < checkpoint)
|
||||||
|
throw DB::Exception("Current position in own buffer before checkpoint in own buffer", ErrorCodes::LOGICAL_ERROR);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (peeked_size)
|
||||||
|
throw DB::Exception("Own buffer is not empty", ErrorCodes::LOGICAL_ERROR);
|
||||||
|
if (currentlyReadFromOwnMemory())
|
||||||
|
throw DB::Exception("Current position in own buffer before checkpoint in subbuffer", ErrorCodes::LOGICAL_ERROR);
|
||||||
|
if (pos < checkpoint)
|
||||||
|
throw DB::Exception("Current position in subbuffer before checkpoint in subbuffer", ErrorCodes::LOGICAL_ERROR);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (!currentlyReadFromOwnMemory() && peeked_size)
|
||||||
|
throw DB::Exception("Own buffer is not empty", ErrorCodes::LOGICAL_ERROR);
|
||||||
|
}
|
||||||
|
if (currentlyReadFromOwnMemory() && !peeked_size)
|
||||||
|
throw DB::Exception("Pos in empty own buffer", ErrorCodes::LOGICAL_ERROR);
|
||||||
|
if (unread_limit < memory.size())
|
||||||
|
throw DB::Exception("Size limit exceed", ErrorCodes::LOGICAL_ERROR);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t PeekableReadBuffer::resizeOwnMemoryIfNecessary(size_t bytes_to_append)
|
||||||
|
{
|
||||||
|
checkStateCorrect();
|
||||||
|
bool needUpdateCheckpoint = checkpointInOwnMemory();
|
||||||
|
bool needUpdatePos = currentlyReadFromOwnMemory();
|
||||||
|
size_t offset = 0;
|
||||||
|
if (needUpdateCheckpoint)
|
||||||
|
offset = checkpoint - memory.data();
|
||||||
|
else if (needUpdatePos)
|
||||||
|
offset = this->offset();
|
||||||
|
|
||||||
|
size_t new_size = peeked_size + bytes_to_append;
|
||||||
|
if (memory.size() < new_size)
|
||||||
|
{
|
||||||
|
if (bytes_to_append < offset && 2 * (peeked_size - offset) <= memory.size())
|
||||||
|
{
|
||||||
|
/// Move unread data to the beginning of own memory instead of resize own memory
|
||||||
|
peeked_size -= offset;
|
||||||
|
memmove(memory.data(), memory.data() + offset, peeked_size);
|
||||||
|
bytes += offset;
|
||||||
|
|
||||||
|
if (needUpdateCheckpoint)
|
||||||
|
checkpoint -= offset;
|
||||||
|
if (needUpdatePos)
|
||||||
|
pos -= offset;
|
||||||
|
|
||||||
|
checkStateCorrect();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (unread_limit < new_size)
|
||||||
|
throw DB::Exception("PeekableReadBuffer: Memory limit exceed", ErrorCodes::MEMORY_LIMIT_EXCEEDED);
|
||||||
|
|
||||||
|
size_t pos_offset = pos - memory.data();
|
||||||
|
|
||||||
|
size_t new_size_amortized = memory.size() * 2;
|
||||||
|
if (new_size_amortized < new_size)
|
||||||
|
new_size_amortized = new_size;
|
||||||
|
else if (unread_limit < new_size_amortized)
|
||||||
|
new_size_amortized = unread_limit;
|
||||||
|
memory.resize(new_size_amortized);
|
||||||
|
|
||||||
|
if (needUpdateCheckpoint)
|
||||||
|
checkpoint = memory.data() + offset;
|
||||||
|
if (needUpdatePos)
|
||||||
|
{
|
||||||
|
BufferBase::set(memory.data(), peeked_size, pos_offset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
checkStateCorrect();
|
||||||
|
return offset;
|
||||||
|
}
|
||||||
|
|
||||||
|
PeekableReadBuffer::~PeekableReadBuffer()
|
||||||
|
{
|
||||||
|
if (!currentlyReadFromOwnMemory())
|
||||||
|
sub_buf.position() = pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::shared_ptr<BufferWithOwnMemory<ReadBuffer>> PeekableReadBuffer::takeUnreadData()
|
||||||
|
{
|
||||||
|
checkStateCorrect();
|
||||||
|
if (!currentlyReadFromOwnMemory())
|
||||||
|
return std::make_shared<BufferWithOwnMemory<ReadBuffer>>(0);
|
||||||
|
size_t unread_size = memory.data() + peeked_size - pos;
|
||||||
|
auto unread = std::make_shared<BufferWithOwnMemory<ReadBuffer>>(unread_size);
|
||||||
|
memcpy(unread->buffer().begin(), pos, unread_size);
|
||||||
|
unread->BufferBase::set(unread->buffer().begin(), unread_size, 0);
|
||||||
|
peeked_size = 0;
|
||||||
|
checkpoint = nullptr;
|
||||||
|
checkpoint_in_own_memory = false;
|
||||||
|
BufferBase::set(sub_buf.buffer().begin(), sub_buf.buffer().size(), sub_buf.offset());
|
||||||
|
checkStateCorrect();
|
||||||
|
return unread;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool PeekableReadBuffer::currentlyReadFromOwnMemory() const
|
||||||
|
{
|
||||||
|
return working_buffer.begin() != sub_buf.buffer().begin();
|
||||||
|
}
|
||||||
|
|
||||||
|
bool PeekableReadBuffer::checkpointInOwnMemory() const
|
||||||
|
{
|
||||||
|
return checkpoint_in_own_memory;
|
||||||
|
}
|
||||||
|
|
||||||
|
void PeekableReadBuffer::assertCanBeDestructed() const
|
||||||
|
{
|
||||||
|
if (peeked_size && pos != memory.data() + peeked_size)
|
||||||
|
throw DB::Exception("There are data, which were extracted from sub-buffer, but not from peekable buffer. "
|
||||||
|
"Cannot destruct peekable buffer correctly because tha data will be lost."
|
||||||
|
"Most likely it's a bug.", ErrorCodes::LOGICAL_ERROR);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
96
dbms/src/IO/PeekableReadBuffer.h
Normal file
96
dbms/src/IO/PeekableReadBuffer.h
Normal file
@ -0,0 +1,96 @@
|
|||||||
|
#pragma once
|
||||||
|
#include <IO/ReadBuffer.h>
|
||||||
|
#include <IO/BufferWithOwnMemory.h>
|
||||||
|
|
||||||
|
namespace DB
|
||||||
|
{
|
||||||
|
|
||||||
|
namespace ErrorCodes
|
||||||
|
{
|
||||||
|
extern const int MEMORY_LIMIT_EXCEEDED;
|
||||||
|
extern const int LOGICAL_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Allows to peek next part of data from sub-buffer without extracting it.
|
||||||
|
/// Also allows to set checkpoint at some position in stream and come back to this position later,
|
||||||
|
/// even if next() was called.
|
||||||
|
/// Sub-buffer should not be accessed directly during the lifelime of peekable buffer.
|
||||||
|
/// If position() of peekable buffer is explicitly set to some position before checkpoint
|
||||||
|
/// (e.g. by istr.position() = prev_pos), behavior is undefined.
|
||||||
|
class PeekableReadBuffer : public BufferWithOwnMemory<ReadBuffer>
|
||||||
|
{
|
||||||
|
friend class PeekableReadBufferCheckpoint;
|
||||||
|
public:
|
||||||
|
explicit PeekableReadBuffer(ReadBuffer & sub_buf_, size_t start_size_ = DBMS_DEFAULT_BUFFER_SIZE,
|
||||||
|
size_t unread_limit_ = 16 * DBMS_DEFAULT_BUFFER_SIZE);
|
||||||
|
|
||||||
|
/// Use takeUnreadData() to extract unread data before destruct object
|
||||||
|
~PeekableReadBuffer() override;
|
||||||
|
|
||||||
|
/// Saves unread data to own memory, so it will be possible to read it later. Loads next data to sub-buffer.
|
||||||
|
/// Doesn't change checkpoint and position in stream,
|
||||||
|
/// but all pointers (such as this->buffer().end() and this->position()) may be invalidated
|
||||||
|
/// @returns false in case of EOF in sub-buffer, otherwise returns true
|
||||||
|
bool peekNext();
|
||||||
|
|
||||||
|
Buffer & lastPeeked() { return sub_buf.buffer(); }
|
||||||
|
|
||||||
|
/// Sets checkpoint at current position
|
||||||
|
void setCheckpoint();
|
||||||
|
|
||||||
|
/// Forget checkpoint and all data between checkpoint and position
|
||||||
|
void dropCheckpoint();
|
||||||
|
|
||||||
|
/// Sets position at checkpoint.
|
||||||
|
/// All pointers (such as this->buffer().end()) may be invalidated
|
||||||
|
void rollbackToCheckpoint();
|
||||||
|
|
||||||
|
/// If position is in own memory, returns buffer with data, which were extracted from sub-buffer,
|
||||||
|
/// but not from this buffer, so the data will not be lost after destruction of this buffer.
|
||||||
|
/// If position is in sub-buffer, returns empty buffer.
|
||||||
|
std::shared_ptr<BufferWithOwnMemory<ReadBuffer>> takeUnreadData();
|
||||||
|
void assertCanBeDestructed() const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
|
||||||
|
bool nextImpl() override;
|
||||||
|
|
||||||
|
inline bool useSubbufferOnly() const;
|
||||||
|
inline bool currentlyReadFromOwnMemory() const;
|
||||||
|
inline bool checkpointInOwnMemory() const;
|
||||||
|
|
||||||
|
void checkStateCorrect() const;
|
||||||
|
|
||||||
|
/// Makes possible to append `bytes_to_append` bytes to data in own memory.
|
||||||
|
/// Updates all invalidated pointers and sizes.
|
||||||
|
/// @returns new offset of unread data in own memory
|
||||||
|
size_t resizeOwnMemoryIfNecessary(size_t bytes_to_append);
|
||||||
|
|
||||||
|
|
||||||
|
ReadBuffer & sub_buf;
|
||||||
|
const size_t unread_limit;
|
||||||
|
size_t peeked_size = 0;
|
||||||
|
Position checkpoint = nullptr;
|
||||||
|
bool checkpoint_in_own_memory = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
class PeekableReadBufferCheckpoint : boost::noncopyable
|
||||||
|
{
|
||||||
|
PeekableReadBuffer & buf;
|
||||||
|
bool auto_rollback;
|
||||||
|
public:
|
||||||
|
explicit PeekableReadBufferCheckpoint(PeekableReadBuffer & buf_, bool auto_rollback_ = false)
|
||||||
|
: buf(buf_), auto_rollback(auto_rollback_) { buf.setCheckpoint(); }
|
||||||
|
~PeekableReadBufferCheckpoint()
|
||||||
|
{
|
||||||
|
if (!buf.checkpoint)
|
||||||
|
return;
|
||||||
|
if (auto_rollback)
|
||||||
|
buf.rollbackToCheckpoint();
|
||||||
|
buf.dropCheckpoint();
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
131
dbms/src/IO/tests/gtest_peekable_read_buffer.cpp
Normal file
131
dbms/src/IO/tests/gtest_peekable_read_buffer.cpp
Normal file
@ -0,0 +1,131 @@
|
|||||||
|
#include <gtest/gtest.h>
|
||||||
|
|
||||||
|
#include <Core/Types.h>
|
||||||
|
#include <IO/ReadHelpers.h>
|
||||||
|
#include <IO/ReadBufferFromString.h>
|
||||||
|
#include <IO/ConcatReadBuffer.h>
|
||||||
|
#include <IO/PeekableReadBuffer.h>
|
||||||
|
|
||||||
|
void readAndAssert(DB::ReadBuffer & buf, const char * str)
|
||||||
|
{
|
||||||
|
size_t n = strlen(str);
|
||||||
|
char tmp[n];
|
||||||
|
buf.readStrict(tmp, n);
|
||||||
|
ASSERT_EQ(strncmp(tmp, str, n), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
void assertAvailable(DB::ReadBuffer & buf, const char * str)
|
||||||
|
{
|
||||||
|
size_t n = strlen(str);
|
||||||
|
ASSERT_EQ(buf.available(), n);
|
||||||
|
ASSERT_EQ(strncmp(buf.position(), str, n), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(PeekableReadBuffer, CheckpointsWorkCorrectly)
|
||||||
|
try
|
||||||
|
{
|
||||||
|
std::string s1 = "0123456789";
|
||||||
|
std::string s2 = "qwertyuiop";
|
||||||
|
std::string s3 = "asdfghjkl;";
|
||||||
|
std::string s4 = "zxcvbnm,./";
|
||||||
|
DB::ReadBufferFromString b1(s1);
|
||||||
|
DB::ReadBufferFromString b2(s2);
|
||||||
|
DB::ReadBufferFromString b3(s3);
|
||||||
|
DB::ReadBufferFromString b4(s4);
|
||||||
|
|
||||||
|
DB::ConcatReadBuffer concat({&b1, &b2, &b3, &b4});
|
||||||
|
DB::PeekableReadBuffer peekable(concat, 0, 16);
|
||||||
|
|
||||||
|
ASSERT_TRUE(!peekable.eof());
|
||||||
|
assertAvailable(peekable, "0123456789");
|
||||||
|
{
|
||||||
|
DB::PeekableReadBufferCheckpoint checkpoint{peekable};
|
||||||
|
readAndAssert(peekable, "01234");
|
||||||
|
}
|
||||||
|
bool exception = false;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
peekable.rollbackToCheckpoint();
|
||||||
|
}
|
||||||
|
catch (DB::Exception & e)
|
||||||
|
{
|
||||||
|
if (e.code() != DB::ErrorCodes::LOGICAL_ERROR)
|
||||||
|
throw;
|
||||||
|
exception = true;
|
||||||
|
}
|
||||||
|
ASSERT_TRUE(exception);
|
||||||
|
assertAvailable(peekable, "56789");
|
||||||
|
|
||||||
|
readAndAssert(peekable, "56");
|
||||||
|
|
||||||
|
peekable.setCheckpoint();
|
||||||
|
readAndAssert(peekable, "789qwertyu");
|
||||||
|
peekable.rollbackToCheckpoint();
|
||||||
|
peekable.dropCheckpoint();
|
||||||
|
assertAvailable(peekable, "789");
|
||||||
|
peekable.peekNext();
|
||||||
|
assertAvailable(peekable, "789qwertyuiop");
|
||||||
|
ASSERT_EQ(peekable.lastPeeked().size(), 10);
|
||||||
|
ASSERT_EQ(strncmp(peekable.lastPeeked().begin(), "asdfghjkl;", 10), 0);
|
||||||
|
|
||||||
|
exception = false;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
DB::PeekableReadBufferCheckpoint checkpoint{peekable, true};
|
||||||
|
peekable.ignore(30);
|
||||||
|
}
|
||||||
|
catch (DB::Exception & e)
|
||||||
|
{
|
||||||
|
if (e.code() != DB::ErrorCodes::MEMORY_LIMIT_EXCEEDED)
|
||||||
|
throw;
|
||||||
|
exception = true;
|
||||||
|
}
|
||||||
|
ASSERT_TRUE(exception);
|
||||||
|
assertAvailable(peekable, "789qwertyuiop");
|
||||||
|
ASSERT_EQ(peekable.lastPeeked().size(), 10);
|
||||||
|
ASSERT_EQ(strncmp(peekable.lastPeeked().begin(), "asdfghjkl;", 10), 0);
|
||||||
|
|
||||||
|
readAndAssert(peekable, "789qwertyu");
|
||||||
|
peekable.setCheckpoint();
|
||||||
|
readAndAssert(peekable, "iopasdfghj");
|
||||||
|
assertAvailable(peekable, "kl;");
|
||||||
|
peekable.dropCheckpoint();
|
||||||
|
|
||||||
|
peekable.setCheckpoint();
|
||||||
|
readAndAssert(peekable, "kl;zxcvbnm,./");
|
||||||
|
ASSERT_TRUE(peekable.eof());
|
||||||
|
ASSERT_TRUE(peekable.eof());
|
||||||
|
ASSERT_TRUE(peekable.eof());
|
||||||
|
peekable.rollbackToCheckpoint();
|
||||||
|
readAndAssert(peekable, "kl;zxcvbnm");
|
||||||
|
peekable.dropCheckpoint();
|
||||||
|
|
||||||
|
exception = false;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
peekable.assertCanBeDestructed();
|
||||||
|
}
|
||||||
|
catch (DB::Exception & e)
|
||||||
|
{
|
||||||
|
if (e.code() != DB::ErrorCodes::LOGICAL_ERROR)
|
||||||
|
throw;
|
||||||
|
exception = true;
|
||||||
|
}
|
||||||
|
ASSERT_TRUE(exception);
|
||||||
|
|
||||||
|
auto buf_ptr = peekable.takeUnreadData();
|
||||||
|
ASSERT_TRUE(peekable.eof());
|
||||||
|
ASSERT_TRUE(peekable.eof());
|
||||||
|
ASSERT_TRUE(peekable.eof());
|
||||||
|
|
||||||
|
readAndAssert(*buf_ptr, ",./");
|
||||||
|
ASSERT_TRUE(buf_ptr->eof());
|
||||||
|
|
||||||
|
peekable.assertCanBeDestructed();
|
||||||
|
}
|
||||||
|
catch (const DB::Exception & e)
|
||||||
|
{
|
||||||
|
std::cerr << e.what() << ", " << e.displayText() << std::endl;
|
||||||
|
throw;
|
||||||
|
}
|
||||||
|
|
@ -1,13 +1,18 @@
|
|||||||
#include <Interpreters/AnalyzedJoin.h>
|
#include <Interpreters/AnalyzedJoin.h>
|
||||||
#include <Interpreters/DatabaseAndTableWithAlias.h>
|
#include <Interpreters/DatabaseAndTableWithAlias.h>
|
||||||
#include <Interpreters/InterpreterSelectWithUnionQuery.h>
|
#include <Interpreters/InterpreterSelectWithUnionQuery.h>
|
||||||
|
#include <Interpreters/Join.h>
|
||||||
|
|
||||||
#include <Parsers/ASTExpressionList.h>
|
#include <Parsers/ASTExpressionList.h>
|
||||||
#include <Parsers/ASTTablesInSelectQuery.h>
|
#include <Parsers/ASTTablesInSelectQuery.h>
|
||||||
#include <Parsers/ASTSelectQuery.h>
|
#include <Parsers/ASTSelectQuery.h>
|
||||||
|
|
||||||
|
#include <Core/Block.h>
|
||||||
#include <Storages/IStorage.h>
|
#include <Storages/IStorage.h>
|
||||||
|
|
||||||
|
#include <DataTypes/DataTypeNullable.h>
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
{
|
{
|
||||||
|
|
||||||
@ -26,7 +31,6 @@ void AnalyzedJoin::addUsingKey(const ASTPtr & ast)
|
|||||||
|
|
||||||
void AnalyzedJoin::addOnKeys(ASTPtr & left_table_ast, ASTPtr & right_table_ast)
|
void AnalyzedJoin::addOnKeys(ASTPtr & left_table_ast, ASTPtr & right_table_ast)
|
||||||
{
|
{
|
||||||
with_using = false;
|
|
||||||
key_names_left.push_back(left_table_ast->getColumnName());
|
key_names_left.push_back(left_table_ast->getColumnName());
|
||||||
key_names_right.push_back(right_table_ast->getAliasOrColumnName());
|
key_names_right.push_back(right_table_ast->getAliasOrColumnName());
|
||||||
|
|
||||||
@ -37,7 +41,7 @@ void AnalyzedJoin::addOnKeys(ASTPtr & left_table_ast, ASTPtr & right_table_ast)
|
|||||||
/// @return how many times right key appears in ON section.
|
/// @return how many times right key appears in ON section.
|
||||||
size_t AnalyzedJoin::rightKeyInclusion(const String & name) const
|
size_t AnalyzedJoin::rightKeyInclusion(const String & name) const
|
||||||
{
|
{
|
||||||
if (with_using)
|
if (hasUsing())
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
size_t count = 0;
|
size_t count = 0;
|
||||||
@ -101,6 +105,120 @@ std::unordered_map<String, String> AnalyzedJoin::getOriginalColumnsMap(const Nam
|
|||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ASTPtr AnalyzedJoin::leftKeysList() const
|
||||||
|
{
|
||||||
|
ASTPtr keys_list = std::make_shared<ASTExpressionList>();
|
||||||
|
keys_list->children = key_asts_left;
|
||||||
|
return keys_list;
|
||||||
|
}
|
||||||
|
|
||||||
|
ASTPtr AnalyzedJoin::rightKeysList() const
|
||||||
|
{
|
||||||
|
ASTPtr keys_list = std::make_shared<ASTExpressionList>();
|
||||||
|
if (hasOn())
|
||||||
|
keys_list->children = key_asts_right;
|
||||||
|
return keys_list;
|
||||||
|
}
|
||||||
|
|
||||||
|
Names AnalyzedJoin::requiredJoinedNames() const
|
||||||
|
{
|
||||||
|
NameSet required_columns_set(key_names_right.begin(), key_names_right.end());
|
||||||
|
for (const auto & joined_column : columns_added_by_join)
|
||||||
|
required_columns_set.insert(joined_column.name);
|
||||||
|
|
||||||
|
return Names(required_columns_set.begin(), required_columns_set.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
void AnalyzedJoin::appendRequiredColumns(const Block & sample, NameSet & required_columns) const
|
||||||
|
{
|
||||||
|
for (auto & column : key_names_right)
|
||||||
|
if (!sample.has(column))
|
||||||
|
required_columns.insert(column);
|
||||||
|
|
||||||
|
for (auto & column : columns_added_by_join)
|
||||||
|
if (!sample.has(column.name))
|
||||||
|
required_columns.insert(column.name);
|
||||||
|
}
|
||||||
|
|
||||||
|
void AnalyzedJoin::addJoinedColumn(const NameAndTypePair & joined_column)
|
||||||
|
{
|
||||||
|
if (join_use_nulls && isLeftOrFull(table_join.kind))
|
||||||
|
{
|
||||||
|
auto type = joined_column.type->canBeInsideNullable() ? makeNullable(joined_column.type) : joined_column.type;
|
||||||
|
columns_added_by_join.emplace_back(NameAndTypePair(joined_column.name, std::move(type)));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
columns_added_by_join.push_back(joined_column);
|
||||||
|
}
|
||||||
|
|
||||||
|
void AnalyzedJoin::addJoinedColumnsAndCorrectNullability(Block & sample_block) const
|
||||||
|
{
|
||||||
|
bool right_or_full_join = isRightOrFull(table_join.kind);
|
||||||
|
bool left_or_full_join = isLeftOrFull(table_join.kind);
|
||||||
|
|
||||||
|
for (auto & col : sample_block)
|
||||||
|
{
|
||||||
|
/// Materialize column.
|
||||||
|
/// Column is not empty if it is constant, but after Join all constants will be materialized.
|
||||||
|
/// So, we need remove constants from header.
|
||||||
|
if (col.column)
|
||||||
|
col.column = nullptr;
|
||||||
|
|
||||||
|
bool make_nullable = join_use_nulls && right_or_full_join;
|
||||||
|
|
||||||
|
if (make_nullable && col.type->canBeInsideNullable())
|
||||||
|
col.type = makeNullable(col.type);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const auto & col : columns_added_by_join)
|
||||||
|
{
|
||||||
|
auto res_type = col.type;
|
||||||
|
|
||||||
|
bool make_nullable = join_use_nulls && left_or_full_join;
|
||||||
|
|
||||||
|
if (!make_nullable)
|
||||||
|
{
|
||||||
|
/// Keys from right table are usually not stored in Join, but copied from the left one.
|
||||||
|
/// So, if left key is nullable, let's make right key nullable too.
|
||||||
|
/// Note: for some join types it's not needed and, probably, may be removed.
|
||||||
|
/// Note: changing this code, take into account the implementation in Join.cpp.
|
||||||
|
auto it = std::find(key_names_right.begin(), key_names_right.end(), col.name);
|
||||||
|
if (it != key_names_right.end())
|
||||||
|
{
|
||||||
|
auto pos = it - key_names_right.begin();
|
||||||
|
const auto & left_key_name = key_names_left[pos];
|
||||||
|
make_nullable = sample_block.getByName(left_key_name).type->isNullable();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (make_nullable && res_type->canBeInsideNullable())
|
||||||
|
res_type = makeNullable(res_type);
|
||||||
|
|
||||||
|
sample_block.insert(ColumnWithTypeAndName(nullptr, res_type, col.name));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool AnalyzedJoin::sameJoin(const AnalyzedJoin * x, const AnalyzedJoin * y)
|
||||||
|
{
|
||||||
|
if (!x && !y)
|
||||||
|
return true;
|
||||||
|
if (!x || !y)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
return x->table_join.kind == y->table_join.kind
|
||||||
|
&& x->table_join.strictness == y->table_join.strictness
|
||||||
|
&& x->key_names_left == y->key_names_left
|
||||||
|
&& x->key_names_right == y->key_names_right
|
||||||
|
&& x->columns_added_by_join == y->columns_added_by_join;
|
||||||
|
}
|
||||||
|
|
||||||
|
JoinPtr AnalyzedJoin::makeHashJoin(const Block & sample_block, const SizeLimits & size_limits_for_join) const
|
||||||
|
{
|
||||||
|
auto join = std::make_shared<Join>(key_names_right, join_use_nulls, size_limits_for_join, table_join.kind, table_join.strictness);
|
||||||
|
join->setSampleBlock(sample_block);
|
||||||
|
return join;
|
||||||
|
}
|
||||||
|
|
||||||
NamesAndTypesList getNamesAndTypeListFromTableExpression(const ASTTableExpression & table_expression, const Context & context)
|
NamesAndTypesList getNamesAndTypeListFromTableExpression(const ASTTableExpression & table_expression, const Context & context)
|
||||||
{
|
{
|
||||||
NamesAndTypesList names_and_type_list;
|
NamesAndTypesList names_and_type_list;
|
||||||
|
@ -2,7 +2,8 @@
|
|||||||
|
|
||||||
#include <Core/Names.h>
|
#include <Core/Names.h>
|
||||||
#include <Core/NamesAndTypes.h>
|
#include <Core/NamesAndTypes.h>
|
||||||
#include <Parsers/IAST.h>
|
#include <Core/SettingsCommon.h>
|
||||||
|
#include <Parsers/ASTTablesInSelectQuery.h>
|
||||||
|
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
@ -13,6 +14,10 @@ namespace DB
|
|||||||
class Context;
|
class Context;
|
||||||
class ASTSelectQuery;
|
class ASTSelectQuery;
|
||||||
struct DatabaseAndTableWithAlias;
|
struct DatabaseAndTableWithAlias;
|
||||||
|
class Block;
|
||||||
|
|
||||||
|
class Join;
|
||||||
|
using JoinPtr = std::shared_ptr<Join>;
|
||||||
|
|
||||||
struct AnalyzedJoin
|
struct AnalyzedJoin
|
||||||
{
|
{
|
||||||
@ -30,18 +35,19 @@ struct AnalyzedJoin
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
friend class SyntaxAnalyzer;
|
friend class SyntaxAnalyzer;
|
||||||
friend struct SyntaxAnalyzerResult;
|
|
||||||
friend class ExpressionAnalyzer;
|
|
||||||
friend class SelectQueryExpressionAnalyzer;
|
|
||||||
|
|
||||||
Names key_names_left;
|
Names key_names_left;
|
||||||
Names key_names_right; /// Duplicating names are qualified.
|
Names key_names_right; /// Duplicating names are qualified.
|
||||||
ASTs key_asts_left;
|
ASTs key_asts_left;
|
||||||
ASTs key_asts_right;
|
ASTs key_asts_right;
|
||||||
bool with_using = true;
|
ASTTableJoin table_join;
|
||||||
|
bool join_use_nulls = false;
|
||||||
|
|
||||||
/// All columns which can be read from joined table. Duplicating names are qualified.
|
/// All columns which can be read from joined table. Duplicating names are qualified.
|
||||||
NamesAndTypesList columns_from_joined_table;
|
NamesAndTypesList columns_from_joined_table;
|
||||||
|
/// Columns will be added to block by JOIN. It's a subset of columns_from_joined_table with corrected Nullability
|
||||||
|
NamesAndTypesList columns_added_by_join;
|
||||||
|
|
||||||
/// Name -> original name. Names are the same as in columns_from_joined_table list.
|
/// Name -> original name. Names are the same as in columns_from_joined_table list.
|
||||||
std::unordered_map<String, String> original_names;
|
std::unordered_map<String, String> original_names;
|
||||||
/// Original name -> name. Only ranamed columns.
|
/// Original name -> name. Only ranamed columns.
|
||||||
@ -51,8 +57,8 @@ public:
|
|||||||
void addUsingKey(const ASTPtr & ast);
|
void addUsingKey(const ASTPtr & ast);
|
||||||
void addOnKeys(ASTPtr & left_table_ast, ASTPtr & right_table_ast);
|
void addOnKeys(ASTPtr & left_table_ast, ASTPtr & right_table_ast);
|
||||||
|
|
||||||
bool hasUsing() const { return with_using; }
|
bool hasUsing() const { return table_join.using_expression_list != nullptr; }
|
||||||
bool hasOn() const { return !with_using; }
|
bool hasOn() const { return !hasUsing(); }
|
||||||
|
|
||||||
NameSet getQualifiedColumnsSet() const;
|
NameSet getQualifiedColumnsSet() const;
|
||||||
NameSet getOriginalColumnsSet() const;
|
NameSet getOriginalColumnsSet() const;
|
||||||
@ -60,6 +66,22 @@ public:
|
|||||||
|
|
||||||
void deduplicateAndQualifyColumnNames(const NameSet & left_table_columns, const String & right_table_prefix);
|
void deduplicateAndQualifyColumnNames(const NameSet & left_table_columns, const String & right_table_prefix);
|
||||||
size_t rightKeyInclusion(const String & name) const;
|
size_t rightKeyInclusion(const String & name) const;
|
||||||
|
|
||||||
|
void appendRequiredColumns(const Block & sample, NameSet & required_columns) const;
|
||||||
|
void addJoinedColumn(const NameAndTypePair & joined_column);
|
||||||
|
void addJoinedColumnsAndCorrectNullability(Block & sample_block) const;
|
||||||
|
|
||||||
|
ASTPtr leftKeysList() const;
|
||||||
|
ASTPtr rightKeysList() const; /// For ON syntax only
|
||||||
|
|
||||||
|
Names requiredJoinedNames() const;
|
||||||
|
const Names & keyNamesLeft() const { return key_names_left; }
|
||||||
|
const NamesAndTypesList & columnsFromJoinedTable() const { return columns_from_joined_table; }
|
||||||
|
const NamesAndTypesList & columnsAddedByJoin() const { return columns_added_by_join; }
|
||||||
|
|
||||||
|
JoinPtr makeHashJoin(const Block & sample_block, const SizeLimits & size_limits_for_join) const;
|
||||||
|
|
||||||
|
static bool sameJoin(const AnalyzedJoin * x, const AnalyzedJoin * y);
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ASTTableExpression;
|
struct ASTTableExpression;
|
||||||
|
@ -12,7 +12,6 @@
|
|||||||
#include <Functions/IFunction.h>
|
#include <Functions/IFunction.h>
|
||||||
#include <set>
|
#include <set>
|
||||||
#include <optional>
|
#include <optional>
|
||||||
#include <DataTypes/DataTypeNullable.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace ProfileEvents
|
namespace ProfileEvents
|
||||||
@ -45,7 +44,8 @@ Names ExpressionAction::getNeededColumns() const
|
|||||||
|
|
||||||
res.insert(res.end(), array_joined_columns.begin(), array_joined_columns.end());
|
res.insert(res.end(), array_joined_columns.begin(), array_joined_columns.end());
|
||||||
|
|
||||||
res.insert(res.end(), join_key_names_left.begin(), join_key_names_left.end());
|
if (join_params)
|
||||||
|
res.insert(res.end(), join_params->keyNamesLeft().begin(), join_params->keyNamesLeft().end());
|
||||||
|
|
||||||
for (const auto & column : projection)
|
for (const auto & column : projection)
|
||||||
res.push_back(column.first);
|
res.push_back(column.first);
|
||||||
@ -159,20 +159,12 @@ ExpressionAction ExpressionAction::arrayJoin(const NameSet & array_joined_column
|
|||||||
return a;
|
return a;
|
||||||
}
|
}
|
||||||
|
|
||||||
ExpressionAction ExpressionAction::ordinaryJoin(
|
ExpressionAction ExpressionAction::ordinaryJoin(std::shared_ptr<AnalyzedJoin> join_params, std::shared_ptr<const Join> hash_join)
|
||||||
const ASTTableJoin & join_params,
|
|
||||||
std::shared_ptr<const Join> join_,
|
|
||||||
const Names & join_key_names_left,
|
|
||||||
const Names & join_key_names_right,
|
|
||||||
const NamesAndTypesList & columns_added_by_join_)
|
|
||||||
{
|
{
|
||||||
ExpressionAction a;
|
ExpressionAction a;
|
||||||
a.type = JOIN;
|
a.type = JOIN;
|
||||||
a.join = std::move(join_);
|
a.join_params = join_params;
|
||||||
a.join_kind = join_params.kind;
|
a.join = hash_join;
|
||||||
a.join_key_names_left = join_key_names_left;
|
|
||||||
a.join_key_names_right = join_key_names_right;
|
|
||||||
a.columns_added_by_join = columns_added_by_join_;
|
|
||||||
return a;
|
return a;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -277,51 +269,7 @@ void ExpressionAction::prepare(Block & sample_block, const Settings & settings,
|
|||||||
|
|
||||||
case JOIN:
|
case JOIN:
|
||||||
{
|
{
|
||||||
bool is_null_used_as_default = settings.join_use_nulls;
|
join_params->addJoinedColumnsAndCorrectNullability(sample_block);
|
||||||
bool right_or_full_join = isRightOrFull(join_kind);
|
|
||||||
bool left_or_full_join = isLeftOrFull(join_kind);
|
|
||||||
|
|
||||||
for (auto & col : sample_block)
|
|
||||||
{
|
|
||||||
/// Materialize column.
|
|
||||||
/// Column is not empty if it is constant, but after Join all constants will be materialized.
|
|
||||||
/// So, we need remove constants from header.
|
|
||||||
if (col.column)
|
|
||||||
col.column = nullptr;
|
|
||||||
|
|
||||||
bool make_nullable = is_null_used_as_default && right_or_full_join;
|
|
||||||
|
|
||||||
if (make_nullable && col.type->canBeInsideNullable())
|
|
||||||
col.type = makeNullable(col.type);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (const auto & col : columns_added_by_join)
|
|
||||||
{
|
|
||||||
auto res_type = col.type;
|
|
||||||
|
|
||||||
bool make_nullable = is_null_used_as_default && left_or_full_join;
|
|
||||||
|
|
||||||
if (!make_nullable)
|
|
||||||
{
|
|
||||||
/// Keys from right table are usually not stored in Join, but copied from the left one.
|
|
||||||
/// So, if left key is nullable, let's make right key nullable too.
|
|
||||||
/// Note: for some join types it's not needed and, probably, may be removed.
|
|
||||||
/// Note: changing this code, take into account the implementation in Join.cpp.
|
|
||||||
auto it = std::find(join_key_names_right.begin(), join_key_names_right.end(), col.name);
|
|
||||||
if (it != join_key_names_right.end())
|
|
||||||
{
|
|
||||||
auto pos = it - join_key_names_right.begin();
|
|
||||||
const auto & left_key_name = join_key_names_left[pos];
|
|
||||||
make_nullable = sample_block.getByName(left_key_name).type->isNullable();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (make_nullable && res_type->canBeInsideNullable())
|
|
||||||
res_type = makeNullable(res_type);
|
|
||||||
|
|
||||||
sample_block.insert(ColumnWithTypeAndName(nullptr, res_type, col.name));
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -527,7 +475,7 @@ void ExpressionAction::execute(Block & block, bool dry_run) const
|
|||||||
|
|
||||||
case JOIN:
|
case JOIN:
|
||||||
{
|
{
|
||||||
join->joinBlock(block, join_key_names_left, columns_added_by_join);
|
join->joinBlock(block, *join_params);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -645,9 +593,10 @@ std::string ExpressionAction::toString() const
|
|||||||
|
|
||||||
case JOIN:
|
case JOIN:
|
||||||
ss << "JOIN ";
|
ss << "JOIN ";
|
||||||
for (NamesAndTypesList::const_iterator it = columns_added_by_join.begin(); it != columns_added_by_join.end(); ++it)
|
for (NamesAndTypesList::const_iterator it = join_params->columnsAddedByJoin().begin();
|
||||||
|
it != join_params->columnsAddedByJoin().end(); ++it)
|
||||||
{
|
{
|
||||||
if (it != columns_added_by_join.begin())
|
if (it != join_params->columnsAddedByJoin().begin())
|
||||||
ss << ", ";
|
ss << ", ";
|
||||||
ss << it->name;
|
ss << it->name;
|
||||||
}
|
}
|
||||||
@ -1220,7 +1169,7 @@ BlockInputStreamPtr ExpressionActions::createStreamWithNonJoinedDataIfFullOrRigh
|
|||||||
for (const auto & action : actions)
|
for (const auto & action : actions)
|
||||||
if (action.join && isRightOrFull(action.join->getKind()))
|
if (action.join && isRightOrFull(action.join->getKind()))
|
||||||
return action.join->createStreamWithNonJoinedRows(
|
return action.join->createStreamWithNonJoinedRows(
|
||||||
source_header, action.join_key_names_left, action.columns_added_by_join, max_block_size);
|
source_header, *action.join_params, max_block_size);
|
||||||
|
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
@ -1267,7 +1216,7 @@ UInt128 ExpressionAction::ActionHash::operator()(const ExpressionAction & action
|
|||||||
hash.update(col);
|
hash.update(col);
|
||||||
break;
|
break;
|
||||||
case JOIN:
|
case JOIN:
|
||||||
for (const auto & col : action.columns_added_by_join)
|
for (const auto & col : action.join_params->columnsAddedByJoin())
|
||||||
hash.update(col.name);
|
hash.update(col.name);
|
||||||
break;
|
break;
|
||||||
case PROJECT:
|
case PROJECT:
|
||||||
@ -1326,9 +1275,7 @@ bool ExpressionAction::operator==(const ExpressionAction & other) const
|
|||||||
&& array_joined_columns == other.array_joined_columns
|
&& array_joined_columns == other.array_joined_columns
|
||||||
&& array_join_is_left == other.array_join_is_left
|
&& array_join_is_left == other.array_join_is_left
|
||||||
&& join == other.join
|
&& join == other.join
|
||||||
&& join_key_names_left == other.join_key_names_left
|
&& AnalyzedJoin::sameJoin(join_params.get(), other.join_params.get())
|
||||||
&& join_key_names_right == other.join_key_names_right
|
|
||||||
&& columns_added_by_join == other.columns_added_by_join
|
|
||||||
&& projection == other.projection
|
&& projection == other.projection
|
||||||
&& is_function_compiled == other.is_function_compiled;
|
&& is_function_compiled == other.is_function_compiled;
|
||||||
}
|
}
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
#include <Core/Settings.h>
|
#include <Core/Settings.h>
|
||||||
#include <DataStreams/IBlockStream_fwd.h>
|
#include <DataStreams/IBlockStream_fwd.h>
|
||||||
#include <Interpreters/Context.h>
|
#include <Interpreters/Context.h>
|
||||||
|
#include <Interpreters/AnalyzedJoin.h>
|
||||||
#include <Common/SipHash.h>
|
#include <Common/SipHash.h>
|
||||||
#include "config_core.h"
|
#include "config_core.h"
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
@ -104,11 +105,8 @@ public:
|
|||||||
bool unaligned_array_join = false;
|
bool unaligned_array_join = false;
|
||||||
|
|
||||||
/// For JOIN
|
/// For JOIN
|
||||||
|
std::shared_ptr<AnalyzedJoin> join_params = nullptr;
|
||||||
std::shared_ptr<const Join> join;
|
std::shared_ptr<const Join> join;
|
||||||
ASTTableJoin::Kind join_kind;
|
|
||||||
Names join_key_names_left;
|
|
||||||
Names join_key_names_right;
|
|
||||||
NamesAndTypesList columns_added_by_join;
|
|
||||||
|
|
||||||
/// For PROJECT.
|
/// For PROJECT.
|
||||||
NamesWithAliases projection;
|
NamesWithAliases projection;
|
||||||
@ -124,9 +122,7 @@ public:
|
|||||||
static ExpressionAction project(const Names & projected_columns_);
|
static ExpressionAction project(const Names & projected_columns_);
|
||||||
static ExpressionAction addAliases(const NamesWithAliases & aliased_columns_);
|
static ExpressionAction addAliases(const NamesWithAliases & aliased_columns_);
|
||||||
static ExpressionAction arrayJoin(const NameSet & array_joined_columns, bool array_join_is_left, const Context & context);
|
static ExpressionAction arrayJoin(const NameSet & array_joined_columns, bool array_join_is_left, const Context & context);
|
||||||
static ExpressionAction ordinaryJoin(const ASTTableJoin & join_params, std::shared_ptr<const Join> join_,
|
static ExpressionAction ordinaryJoin(std::shared_ptr<AnalyzedJoin> join_params, std::shared_ptr<const Join> hash_join);
|
||||||
const Names & join_key_names_left, const Names & join_key_names_right,
|
|
||||||
const NamesAndTypesList & columns_added_by_join_);
|
|
||||||
|
|
||||||
/// Which columns necessary to perform this action.
|
/// Which columns necessary to perform this action.
|
||||||
Names getNeededColumns() const;
|
Names getNeededColumns() const;
|
||||||
|
@ -29,7 +29,6 @@
|
|||||||
#include <Interpreters/PredicateExpressionsOptimizer.h>
|
#include <Interpreters/PredicateExpressionsOptimizer.h>
|
||||||
#include <Interpreters/ExternalDictionaries.h>
|
#include <Interpreters/ExternalDictionaries.h>
|
||||||
#include <Interpreters/Set.h>
|
#include <Interpreters/Set.h>
|
||||||
#include <Interpreters/Join.h>
|
|
||||||
|
|
||||||
#include <AggregateFunctions/AggregateFunctionFactory.h>
|
#include <AggregateFunctions/AggregateFunctionFactory.h>
|
||||||
#include <AggregateFunctions/parseAggregateFunctionParameters.h>
|
#include <AggregateFunctions/parseAggregateFunctionParameters.h>
|
||||||
@ -134,14 +133,8 @@ void ExpressionAnalyzer::analyzeAggregation()
|
|||||||
const ASTTablesInSelectQueryElement * join = select_query->join();
|
const ASTTablesInSelectQueryElement * join = select_query->join();
|
||||||
if (join)
|
if (join)
|
||||||
{
|
{
|
||||||
const auto & table_join = join->table_join->as<ASTTableJoin &>();
|
getRootActions(analyzedJoin().leftKeysList(), true, temp_actions);
|
||||||
if (table_join.using_expression_list)
|
addJoinAction(temp_actions);
|
||||||
getRootActions(table_join.using_expression_list, true, temp_actions);
|
|
||||||
if (table_join.on_expression)
|
|
||||||
for (const auto & key_ast : analyzedJoin().key_asts_left)
|
|
||||||
getRootActions(key_ast, true, temp_actions);
|
|
||||||
|
|
||||||
addJoinAction(table_join, temp_actions);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -298,7 +291,8 @@ void SelectQueryExpressionAnalyzer::makeSetsForIndex(const ASTPtr & node)
|
|||||||
{
|
{
|
||||||
NamesAndTypesList temp_columns = sourceColumns();
|
NamesAndTypesList temp_columns = sourceColumns();
|
||||||
temp_columns.insert(temp_columns.end(), array_join_columns.begin(), array_join_columns.end());
|
temp_columns.insert(temp_columns.end(), array_join_columns.begin(), array_join_columns.end());
|
||||||
temp_columns.insert(temp_columns.end(), columnsAddedByJoin().begin(), columnsAddedByJoin().end());
|
temp_columns.insert(temp_columns.end(),
|
||||||
|
analyzedJoin().columnsAddedByJoin().begin(), analyzedJoin().columnsAddedByJoin().end());
|
||||||
|
|
||||||
ExpressionActionsPtr temp_actions = std::make_shared<ExpressionActions>(temp_columns, context);
|
ExpressionActionsPtr temp_actions = std::make_shared<ExpressionActions>(temp_columns, context);
|
||||||
getRootActions(left_in_operand, true, temp_actions);
|
getRootActions(left_in_operand, true, temp_actions);
|
||||||
@ -412,22 +406,10 @@ bool SelectQueryExpressionAnalyzer::appendArrayJoin(ExpressionActionsChain & cha
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void appendRequiredColumns(
|
|
||||||
NameSet & required_columns, const Block & sample, const Names & key_names_right, const NamesAndTypesList & columns_added_by_join)
|
|
||||||
{
|
|
||||||
for (auto & column : key_names_right)
|
|
||||||
if (!sample.has(column))
|
|
||||||
required_columns.insert(column);
|
|
||||||
|
|
||||||
for (auto & column : columns_added_by_join)
|
|
||||||
if (!sample.has(column.name))
|
|
||||||
required_columns.insert(column.name);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// It's possible to set nullptr as join for only_types mode
|
/// It's possible to set nullptr as join for only_types mode
|
||||||
void ExpressionAnalyzer::addJoinAction(const ASTTableJoin & join_params, ExpressionActionsPtr & actions, JoinPtr join) const
|
void ExpressionAnalyzer::addJoinAction(ExpressionActionsPtr & actions, JoinPtr join) const
|
||||||
{
|
{
|
||||||
actions->add(ExpressionAction::ordinaryJoin(join_params, std::move(join), analyzedJoin().key_names_left, analyzedJoin().key_names_right, columnsAddedByJoin()));
|
actions->add(ExpressionAction::ordinaryJoin(syntax->analyzed_join, join));
|
||||||
}
|
}
|
||||||
|
|
||||||
bool SelectQueryExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain, bool only_types)
|
bool SelectQueryExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain, bool only_types)
|
||||||
@ -438,16 +420,11 @@ bool SelectQueryExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain, b
|
|||||||
|
|
||||||
SubqueryForSet & subquery_for_set = getSubqueryForJoin(*ast_join);
|
SubqueryForSet & subquery_for_set = getSubqueryForJoin(*ast_join);
|
||||||
|
|
||||||
ASTPtr left_keys_list = std::make_shared<ASTExpressionList>();
|
|
||||||
left_keys_list->children = analyzedJoin().key_asts_left;
|
|
||||||
|
|
||||||
initChain(chain, sourceColumns());
|
initChain(chain, sourceColumns());
|
||||||
ExpressionActionsChain::Step & step = chain.steps.back();
|
ExpressionActionsChain::Step & step = chain.steps.back();
|
||||||
|
|
||||||
auto & join_params = ast_join->table_join->as<ASTTableJoin &>();
|
getRootActions(analyzedJoin().leftKeysList(), only_types, step.actions);
|
||||||
|
addJoinAction(step.actions, subquery_for_set.join);
|
||||||
getRootActions(left_keys_list, only_types, step.actions);
|
|
||||||
addJoinAction(join_params, step.actions, subquery_for_set.join);
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -524,11 +501,9 @@ void SelectQueryExpressionAnalyzer::makeHashJoin(const ASTTablesInSelectQueryEle
|
|||||||
Names action_columns = joined_block_actions->getRequiredColumns();
|
Names action_columns = joined_block_actions->getRequiredColumns();
|
||||||
NameSet required_columns(action_columns.begin(), action_columns.end());
|
NameSet required_columns(action_columns.begin(), action_columns.end());
|
||||||
|
|
||||||
auto & analyzed_join = analyzedJoin();
|
analyzedJoin().appendRequiredColumns(joined_block_actions->getSampleBlock(), required_columns);
|
||||||
appendRequiredColumns(
|
|
||||||
required_columns, joined_block_actions->getSampleBlock(), analyzed_join.key_names_right, columnsAddedByJoin());
|
|
||||||
|
|
||||||
auto original_map = analyzed_join.getOriginalColumnsMap(required_columns);
|
auto original_map = analyzedJoin().getOriginalColumnsMap(required_columns);
|
||||||
Names original_columns;
|
Names original_columns;
|
||||||
for (auto & pr : original_map)
|
for (auto & pr : original_map)
|
||||||
original_columns.push_back(pr.second);
|
original_columns.push_back(pr.second);
|
||||||
@ -542,29 +517,16 @@ void SelectQueryExpressionAnalyzer::makeHashJoin(const ASTTablesInSelectQueryEle
|
|||||||
joined_block_actions->execute(sample_block);
|
joined_block_actions->execute(sample_block);
|
||||||
|
|
||||||
/// TODO You do not need to set this up when JOIN is only needed on remote servers.
|
/// TODO You do not need to set this up when JOIN is only needed on remote servers.
|
||||||
auto & join_params = join_element.table_join->as<ASTTableJoin &>();
|
subquery_for_set.join = analyzedJoin().makeHashJoin(sample_block, settings.size_limits_for_join);
|
||||||
subquery_for_set.join = std::make_shared<Join>(analyzedJoin().key_names_right, settings.join_use_nulls,
|
|
||||||
settings.size_limits_for_join, join_params.kind, join_params.strictness);
|
|
||||||
subquery_for_set.join->setSampleBlock(sample_block);
|
|
||||||
subquery_for_set.joined_block_actions = joined_block_actions;
|
subquery_for_set.joined_block_actions = joined_block_actions;
|
||||||
}
|
}
|
||||||
|
|
||||||
ExpressionActionsPtr SelectQueryExpressionAnalyzer::createJoinedBlockActions() const
|
ExpressionActionsPtr SelectQueryExpressionAnalyzer::createJoinedBlockActions() const
|
||||||
{
|
{
|
||||||
/// Create custom expression list with join keys from right table.
|
ASTPtr expression_list = analyzedJoin().rightKeysList();
|
||||||
ASTPtr expression_list = std::make_shared<ASTExpressionList>();
|
Names required_columns = analyzedJoin().requiredJoinedNames();
|
||||||
ASTs & children = expression_list->children;
|
|
||||||
|
|
||||||
if (analyzedJoin().hasOn())
|
auto syntax_result = SyntaxAnalyzer(context).analyze(expression_list, analyzedJoin().columnsFromJoinedTable(), required_columns);
|
||||||
for (const auto & join_right_key : analyzedJoin().key_asts_right)
|
|
||||||
children.emplace_back(join_right_key);
|
|
||||||
|
|
||||||
NameSet required_columns_set(analyzedJoin().key_names_right.begin(), analyzedJoin().key_names_right.end());
|
|
||||||
for (const auto & joined_column : columnsAddedByJoin())
|
|
||||||
required_columns_set.insert(joined_column.name);
|
|
||||||
Names required_columns(required_columns_set.begin(), required_columns_set.end());
|
|
||||||
|
|
||||||
auto syntax_result = SyntaxAnalyzer(context).analyze(expression_list, analyzedJoin().columns_from_joined_table, required_columns);
|
|
||||||
return ExpressionAnalyzer(expression_list, syntax_result, context).getActions(true, false);
|
return ExpressionAnalyzer(expression_list, syntax_result, context).getActions(true, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -121,9 +121,8 @@ protected:
|
|||||||
SyntaxAnalyzerResultPtr syntax;
|
SyntaxAnalyzerResultPtr syntax;
|
||||||
|
|
||||||
const StoragePtr & storage() const { return syntax->storage; } /// The main table in FROM clause, if exists.
|
const StoragePtr & storage() const { return syntax->storage; } /// The main table in FROM clause, if exists.
|
||||||
const AnalyzedJoin & analyzedJoin() const { return syntax->analyzed_join; }
|
const AnalyzedJoin & analyzedJoin() const { return *syntax->analyzed_join; }
|
||||||
const NamesAndTypesList & sourceColumns() const { return syntax->required_source_columns; }
|
const NamesAndTypesList & sourceColumns() const { return syntax->required_source_columns; }
|
||||||
const NamesAndTypesList & columnsAddedByJoin() const { return syntax->columns_added_by_join; }
|
|
||||||
const std::vector<const ASTFunction *> & aggregates() const { return syntax->aggregates; }
|
const std::vector<const ASTFunction *> & aggregates() const { return syntax->aggregates; }
|
||||||
|
|
||||||
/// Find global subqueries in the GLOBAL IN/JOIN sections. Fills in external_tables.
|
/// Find global subqueries in the GLOBAL IN/JOIN sections. Fills in external_tables.
|
||||||
@ -131,7 +130,7 @@ protected:
|
|||||||
|
|
||||||
void addMultipleArrayJoinAction(ExpressionActionsPtr & actions, bool is_left) const;
|
void addMultipleArrayJoinAction(ExpressionActionsPtr & actions, bool is_left) const;
|
||||||
|
|
||||||
void addJoinAction(const ASTTableJoin & join_params, ExpressionActionsPtr & actions, JoinPtr join = {}) const;
|
void addJoinAction(ExpressionActionsPtr & actions, JoinPtr join = {}) const;
|
||||||
|
|
||||||
void getRootActions(const ASTPtr & ast, bool no_subqueries, ExpressionActionsPtr & actions, bool only_consts = false);
|
void getRootActions(const ASTPtr & ast, bool no_subqueries, ExpressionActionsPtr & actions, bool only_consts = false);
|
||||||
|
|
||||||
|
@ -10,6 +10,7 @@
|
|||||||
#include <DataTypes/DataTypeNullable.h>
|
#include <DataTypes/DataTypeNullable.h>
|
||||||
|
|
||||||
#include <Interpreters/Join.h>
|
#include <Interpreters/Join.h>
|
||||||
|
#include <Interpreters/AnalyzedJoin.h>
|
||||||
#include <Interpreters/joinDispatch.h>
|
#include <Interpreters/joinDispatch.h>
|
||||||
#include <Interpreters/NullableUtils.h>
|
#include <Interpreters/NullableUtils.h>
|
||||||
|
|
||||||
@ -1048,8 +1049,11 @@ void Join::joinGet(Block & block, const String & column_name) const
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void Join::joinBlock(Block & block, const Names & key_names_left, const NamesAndTypesList & columns_added_by_join) const
|
void Join::joinBlock(Block & block, const AnalyzedJoin & join_params) const
|
||||||
{
|
{
|
||||||
|
const Names & key_names_left = join_params.keyNamesLeft();
|
||||||
|
const NamesAndTypesList & columns_added_by_join = join_params.columnsAddedByJoin();
|
||||||
|
|
||||||
std::shared_lock lock(rwlock);
|
std::shared_lock lock(rwlock);
|
||||||
|
|
||||||
checkTypesOfKeys(block, key_names_left, sample_block_with_keys);
|
checkTypesOfKeys(block, key_names_left, sample_block_with_keys);
|
||||||
@ -1457,10 +1461,11 @@ private:
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
BlockInputStreamPtr Join::createStreamWithNonJoinedRows(const Block & left_sample_block, const Names & key_names_left,
|
BlockInputStreamPtr Join::createStreamWithNonJoinedRows(const Block & left_sample_block, const AnalyzedJoin & join_params,
|
||||||
const NamesAndTypesList & columns_added_by_join, UInt64 max_block_size) const
|
UInt64 max_block_size) const
|
||||||
{
|
{
|
||||||
return std::make_shared<NonJoinedBlockInputStream>(*this, left_sample_block, key_names_left, columns_added_by_join, max_block_size);
|
return std::make_shared<NonJoinedBlockInputStream>(*this, left_sample_block,
|
||||||
|
join_params.keyNamesLeft(), join_params.columnsAddedByJoin(), max_block_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -26,6 +26,8 @@
|
|||||||
namespace DB
|
namespace DB
|
||||||
{
|
{
|
||||||
|
|
||||||
|
struct AnalyzedJoin;
|
||||||
|
|
||||||
namespace JoinStuff
|
namespace JoinStuff
|
||||||
{
|
{
|
||||||
|
|
||||||
@ -141,7 +143,7 @@ public:
|
|||||||
/** Join data from the map (that was previously built by calls to insertFromBlock) to the block with data from "left" table.
|
/** Join data from the map (that was previously built by calls to insertFromBlock) to the block with data from "left" table.
|
||||||
* Could be called from different threads in parallel.
|
* Could be called from different threads in parallel.
|
||||||
*/
|
*/
|
||||||
void joinBlock(Block & block, const Names & key_names_left, const NamesAndTypesList & columns_added_by_join) const;
|
void joinBlock(Block & block, const AnalyzedJoin & join_params) const;
|
||||||
|
|
||||||
/// Infer the return type for joinGet function
|
/// Infer the return type for joinGet function
|
||||||
DataTypePtr joinGetReturnType(const String & column_name) const;
|
DataTypePtr joinGetReturnType(const String & column_name) const;
|
||||||
@ -161,8 +163,8 @@ public:
|
|||||||
* Use only after all calls to joinBlock was done.
|
* Use only after all calls to joinBlock was done.
|
||||||
* left_sample_block is passed without account of 'use_nulls' setting (columns will be converted to Nullable inside).
|
* left_sample_block is passed without account of 'use_nulls' setting (columns will be converted to Nullable inside).
|
||||||
*/
|
*/
|
||||||
BlockInputStreamPtr createStreamWithNonJoinedRows(const Block & left_sample_block, const Names & key_names_left,
|
BlockInputStreamPtr createStreamWithNonJoinedRows(const Block & left_sample_block, const AnalyzedJoin & join_params,
|
||||||
const NamesAndTypesList & columns_added_by_join, UInt64 max_block_size) const;
|
UInt64 max_block_size) const;
|
||||||
|
|
||||||
/// Number of keys in all built JOIN maps.
|
/// Number of keys in all built JOIN maps.
|
||||||
size_t getTotalRowCount() const;
|
size_t getTotalRowCount() const;
|
||||||
|
@ -489,14 +489,13 @@ void getArrayJoinedColumns(ASTPtr & query, SyntaxAnalyzerResult & result, const
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void setJoinStrictness(ASTSelectQuery & select_query, JoinStrictness join_default_strictness, ASTTableJoin::Kind & join_kind)
|
void setJoinStrictness(ASTSelectQuery & select_query, JoinStrictness join_default_strictness, ASTTableJoin & out_table_join)
|
||||||
{
|
{
|
||||||
const ASTTablesInSelectQueryElement * node = select_query.join();
|
const ASTTablesInSelectQueryElement * node = select_query.join();
|
||||||
if (!node)
|
if (!node)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
auto & table_join = const_cast<ASTTablesInSelectQueryElement *>(node)->table_join->as<ASTTableJoin &>();
|
auto & table_join = const_cast<ASTTablesInSelectQueryElement *>(node)->table_join->as<ASTTableJoin &>();
|
||||||
join_kind = table_join.kind;
|
|
||||||
|
|
||||||
if (table_join.strictness == ASTTableJoin::Strictness::Unspecified &&
|
if (table_join.strictness == ASTTableJoin::Strictness::Unspecified &&
|
||||||
table_join.kind != ASTTableJoin::Kind::Cross)
|
table_join.kind != ASTTableJoin::Kind::Cross)
|
||||||
@ -509,6 +508,8 @@ void setJoinStrictness(ASTSelectQuery & select_query, JoinStrictness join_defaul
|
|||||||
throw Exception("Expected ANY or ALL in JOIN section, because setting (join_default_strictness) is empty",
|
throw Exception("Expected ANY or ALL in JOIN section, because setting (join_default_strictness) is empty",
|
||||||
DB::ErrorCodes::EXPECTED_ALL_OR_ANY);
|
DB::ErrorCodes::EXPECTED_ALL_OR_ANY);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
out_table_join = table_join;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Find the columns that are obtained by JOIN.
|
/// Find the columns that are obtained by JOIN.
|
||||||
@ -609,8 +610,7 @@ std::vector<const ASTFunction *> getAggregates(const ASTPtr & query)
|
|||||||
/// Calculate which columns are required to execute the expression.
|
/// Calculate which columns are required to execute the expression.
|
||||||
/// Then, delete all other columns from the list of available columns.
|
/// Then, delete all other columns from the list of available columns.
|
||||||
/// After execution, columns will only contain the list of columns needed to read from the table.
|
/// After execution, columns will only contain the list of columns needed to read from the table.
|
||||||
void SyntaxAnalyzerResult::collectUsedColumns(const ASTPtr & query, const NamesAndTypesList & additional_source_columns,
|
void SyntaxAnalyzerResult::collectUsedColumns(const ASTPtr & query, const NamesAndTypesList & additional_source_columns)
|
||||||
bool make_joined_columns_nullable)
|
|
||||||
{
|
{
|
||||||
/// We caclulate required_source_columns with source_columns modifications and swap them on exit
|
/// We caclulate required_source_columns with source_columns modifications and swap them on exit
|
||||||
required_source_columns = source_columns;
|
required_source_columns = source_columns;
|
||||||
@ -637,8 +637,7 @@ void SyntaxAnalyzerResult::collectUsedColumns(const ASTPtr & query, const NamesA
|
|||||||
avaliable_columns.insert(name.name);
|
avaliable_columns.insert(name.name);
|
||||||
|
|
||||||
/// Add columns obtained by JOIN (if needed).
|
/// Add columns obtained by JOIN (if needed).
|
||||||
columns_added_by_join.clear();
|
for (const auto & joined_column : analyzed_join->columnsFromJoinedTable())
|
||||||
for (const auto & joined_column : analyzed_join.columns_from_joined_table)
|
|
||||||
{
|
{
|
||||||
auto & name = joined_column.name;
|
auto & name = joined_column.name;
|
||||||
if (avaliable_columns.count(name))
|
if (avaliable_columns.count(name))
|
||||||
@ -647,16 +646,9 @@ void SyntaxAnalyzerResult::collectUsedColumns(const ASTPtr & query, const NamesA
|
|||||||
if (required.count(name))
|
if (required.count(name))
|
||||||
{
|
{
|
||||||
/// Optimisation: do not add columns needed only in JOIN ON section.
|
/// Optimisation: do not add columns needed only in JOIN ON section.
|
||||||
if (columns_context.nameInclusion(name) > analyzed_join.rightKeyInclusion(name))
|
if (columns_context.nameInclusion(name) > analyzed_join->rightKeyInclusion(name))
|
||||||
{
|
analyzed_join->addJoinedColumn(joined_column);
|
||||||
if (make_joined_columns_nullable)
|
|
||||||
{
|
|
||||||
auto type = joined_column.type->canBeInsideNullable() ? makeNullable(joined_column.type) : joined_column.type;
|
|
||||||
columns_added_by_join.emplace_back(NameAndTypePair(joined_column.name, std::move(type)));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
columns_added_by_join.push_back(joined_column);
|
|
||||||
}
|
|
||||||
required.erase(name);
|
required.erase(name);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -766,7 +758,7 @@ void SyntaxAnalyzerResult::collectUsedColumns(const ASTPtr & query, const NamesA
|
|||||||
if (columns_context.has_table_join)
|
if (columns_context.has_table_join)
|
||||||
{
|
{
|
||||||
ss << ", joined columns:";
|
ss << ", joined columns:";
|
||||||
for (const auto & column : analyzed_join.columns_from_joined_table)
|
for (const auto & column : analyzed_join->columnsFromJoinedTable())
|
||||||
ss << " '" << column.name << "'";
|
ss << " '" << column.name << "'";
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -798,15 +790,17 @@ SyntaxAnalyzerResultPtr SyntaxAnalyzer::analyze(
|
|||||||
storage = context.tryGetTable(db_and_table->database, db_and_table->table);
|
storage = context.tryGetTable(db_and_table->database, db_and_table->table);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const auto & settings = context.getSettingsRef();
|
||||||
|
|
||||||
SyntaxAnalyzerResult result;
|
SyntaxAnalyzerResult result;
|
||||||
result.storage = storage;
|
result.storage = storage;
|
||||||
result.source_columns = source_columns_;
|
result.source_columns = source_columns_;
|
||||||
|
result.analyzed_join = std::make_shared<AnalyzedJoin>(); /// TODO: move to select_query logic
|
||||||
|
result.analyzed_join->join_use_nulls = settings.join_use_nulls;
|
||||||
|
|
||||||
collectSourceColumns(select_query, result.storage, result.source_columns);
|
collectSourceColumns(select_query, result.storage, result.source_columns);
|
||||||
NameSet source_columns_set = removeDuplicateColumns(result.source_columns);
|
NameSet source_columns_set = removeDuplicateColumns(result.source_columns);
|
||||||
|
|
||||||
const auto & settings = context.getSettingsRef();
|
|
||||||
|
|
||||||
Names source_columns_list;
|
Names source_columns_list;
|
||||||
source_columns_list.reserve(result.source_columns.size());
|
source_columns_list.reserve(result.source_columns.size());
|
||||||
for (const auto & type_name : result.source_columns)
|
for (const auto & type_name : result.source_columns)
|
||||||
@ -831,13 +825,13 @@ SyntaxAnalyzerResultPtr SyntaxAnalyzer::analyze(
|
|||||||
const auto & joined_expression = node->table_expression->as<ASTTableExpression &>();
|
const auto & joined_expression = node->table_expression->as<ASTTableExpression &>();
|
||||||
DatabaseAndTableWithAlias table(joined_expression, context.getCurrentDatabase());
|
DatabaseAndTableWithAlias table(joined_expression, context.getCurrentDatabase());
|
||||||
|
|
||||||
result.analyzed_join.columns_from_joined_table = getNamesAndTypeListFromTableExpression(joined_expression, context);
|
result.analyzed_join->columns_from_joined_table = getNamesAndTypeListFromTableExpression(joined_expression, context);
|
||||||
result.analyzed_join.deduplicateAndQualifyColumnNames(source_columns_set, table.getQualifiedNamePrefix());
|
result.analyzed_join->deduplicateAndQualifyColumnNames(source_columns_set, table.getQualifiedNamePrefix());
|
||||||
}
|
}
|
||||||
|
|
||||||
translateQualifiedNames(query, *select_query, context,
|
translateQualifiedNames(query, *select_query, context,
|
||||||
(storage ? storage->getColumns().getOrdinary().getNames() : source_columns_list), source_columns_set,
|
(storage ? storage->getColumns().getOrdinary().getNames() : source_columns_list), source_columns_set,
|
||||||
result.analyzed_join.getQualifiedColumnsSet());
|
result.analyzed_join->getQualifiedColumnsSet());
|
||||||
|
|
||||||
/// Rewrite IN and/or JOIN for distributed tables according to distributed_product_mode setting.
|
/// Rewrite IN and/or JOIN for distributed tables according to distributed_product_mode setting.
|
||||||
InJoinSubqueriesPreprocessor(context).visit(query);
|
InJoinSubqueriesPreprocessor(context).visit(query);
|
||||||
@ -872,7 +866,6 @@ SyntaxAnalyzerResultPtr SyntaxAnalyzer::analyze(
|
|||||||
/// Optimize if with constant condition after constants was substituted instead of scalar subqueries.
|
/// Optimize if with constant condition after constants was substituted instead of scalar subqueries.
|
||||||
OptimizeIfWithConstantConditionVisitor(result.aliases).visit(query);
|
OptimizeIfWithConstantConditionVisitor(result.aliases).visit(query);
|
||||||
|
|
||||||
bool make_joined_columns_nullable = false;
|
|
||||||
if (select_query)
|
if (select_query)
|
||||||
{
|
{
|
||||||
/// GROUP BY injective function elimination.
|
/// GROUP BY injective function elimination.
|
||||||
@ -893,15 +886,12 @@ SyntaxAnalyzerResultPtr SyntaxAnalyzer::analyze(
|
|||||||
/// Push the predicate expression down to the subqueries.
|
/// Push the predicate expression down to the subqueries.
|
||||||
result.rewrite_subqueries = PredicateExpressionsOptimizer(select_query, settings, context).optimize();
|
result.rewrite_subqueries = PredicateExpressionsOptimizer(select_query, settings, context).optimize();
|
||||||
|
|
||||||
ASTTableJoin::Kind join_kind = ASTTableJoin::Kind::Comma;
|
setJoinStrictness(*select_query, settings.join_default_strictness, result.analyzed_join->table_join);
|
||||||
setJoinStrictness(*select_query, settings.join_default_strictness, join_kind);
|
collectJoinedColumns(*result.analyzed_join, *select_query, source_columns_set, result.aliases);
|
||||||
make_joined_columns_nullable = settings.join_use_nulls && isLeftOrFull(join_kind);
|
|
||||||
|
|
||||||
collectJoinedColumns(result.analyzed_join, *select_query, source_columns_set, result.aliases);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
result.aggregates = getAggregates(query);
|
result.aggregates = getAggregates(query);
|
||||||
result.collectUsedColumns(query, additional_source_columns, make_joined_columns_nullable);
|
result.collectUsedColumns(query, additional_source_columns);
|
||||||
return std::make_shared<const SyntaxAnalyzerResult>(result);
|
return std::make_shared<const SyntaxAnalyzerResult>(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -15,13 +15,11 @@ class ASTFunction;
|
|||||||
struct SyntaxAnalyzerResult
|
struct SyntaxAnalyzerResult
|
||||||
{
|
{
|
||||||
StoragePtr storage;
|
StoragePtr storage;
|
||||||
AnalyzedJoin analyzed_join;
|
std::shared_ptr<AnalyzedJoin> analyzed_join;
|
||||||
|
|
||||||
NamesAndTypesList source_columns;
|
NamesAndTypesList source_columns;
|
||||||
/// Set of columns that are enough to read from the table to evaluate the expression. It does not include joined columns.
|
/// Set of columns that are enough to read from the table to evaluate the expression. It does not include joined columns.
|
||||||
NamesAndTypesList required_source_columns;
|
NamesAndTypesList required_source_columns;
|
||||||
/// Columns will be added to block by JOIN. It's a subset of analyzed_join.columns_from_joined_table with corrected Nullability
|
|
||||||
NamesAndTypesList columns_added_by_join;
|
|
||||||
|
|
||||||
Aliases aliases;
|
Aliases aliases;
|
||||||
std::vector<const ASTFunction *> aggregates;
|
std::vector<const ASTFunction *> aggregates;
|
||||||
@ -42,7 +40,7 @@ struct SyntaxAnalyzerResult
|
|||||||
/// Predicate optimizer overrides the sub queries
|
/// Predicate optimizer overrides the sub queries
|
||||||
bool rewrite_subqueries = false;
|
bool rewrite_subqueries = false;
|
||||||
|
|
||||||
void collectUsedColumns(const ASTPtr & query, const NamesAndTypesList & additional_source_columns, bool make_joined_columns_nullable);
|
void collectUsedColumns(const ASTPtr & query, const NamesAndTypesList & additional_source_columns);
|
||||||
Names requiredSourceColumns() const { return required_source_columns.getNames(); }
|
Names requiredSourceColumns() const { return required_source_columns.getNames(); }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -287,12 +287,6 @@ bool PipelineExecutor::prepareProcessor(UInt64 pid, Stack & children, Stack & pa
|
|||||||
switch (node.last_processor_status)
|
switch (node.last_processor_status)
|
||||||
{
|
{
|
||||||
case IProcessor::Status::NeedData:
|
case IProcessor::Status::NeedData:
|
||||||
{
|
|
||||||
add_neighbours_to_prepare_queue();
|
|
||||||
try_release_ownership();
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case IProcessor::Status::PortFull:
|
case IProcessor::Status::PortFull:
|
||||||
{
|
{
|
||||||
add_neighbours_to_prepare_queue();
|
add_neighbours_to_prepare_queue();
|
||||||
|
@ -174,7 +174,7 @@ Chunk IRowInputFormat::generate()
|
|||||||
{
|
{
|
||||||
if (params.allow_errors_num > 0 || params.allow_errors_ratio > 0)
|
if (params.allow_errors_num > 0 || params.allow_errors_ratio > 0)
|
||||||
{
|
{
|
||||||
Logger * log = &Logger::get("BlockInputStreamFromRowInputStream");
|
Logger * log = &Logger::get("IRowInputFormat");
|
||||||
LOG_TRACE(log, "Skipped " << num_errors << " rows with errors while reading the input stream");
|
LOG_TRACE(log, "Skipped " << num_errors << " rows with errors while reading the input stream");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -13,6 +13,8 @@
|
|||||||
#include <Columns/ColumnString.h>
|
#include <Columns/ColumnString.h>
|
||||||
#include <Columns/ColumnNullable.h>
|
#include <Columns/ColumnNullable.h>
|
||||||
#include <Interpreters/castColumn.h>
|
#include <Interpreters/castColumn.h>
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
{
|
{
|
||||||
@ -27,34 +29,28 @@ namespace DB
|
|||||||
extern const int CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN;
|
extern const int CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN;
|
||||||
extern const int THERE_IS_NO_COLUMN;
|
extern const int THERE_IS_NO_COLUMN;
|
||||||
}
|
}
|
||||||
const std::unordered_map<arrow::Type::type, std::shared_ptr<IDataType>> arrow_type_to_internal_type = {
|
|
||||||
//{arrow::Type::DECIMAL, std::make_shared<DataTypeDecimal>()},
|
|
||||||
{arrow::Type::UINT8, std::make_shared<DataTypeUInt8>()},
|
|
||||||
{arrow::Type::INT8, std::make_shared<DataTypeInt8>()},
|
|
||||||
{arrow::Type::UINT16, std::make_shared<DataTypeUInt16>()},
|
|
||||||
{arrow::Type::INT16, std::make_shared<DataTypeInt16>()},
|
|
||||||
{arrow::Type::UINT32, std::make_shared<DataTypeUInt32>()},
|
|
||||||
{arrow::Type::INT32, std::make_shared<DataTypeInt32>()},
|
|
||||||
{arrow::Type::UINT64, std::make_shared<DataTypeUInt64>()},
|
|
||||||
{arrow::Type::INT64, std::make_shared<DataTypeInt64>()},
|
|
||||||
{arrow::Type::HALF_FLOAT, std::make_shared<DataTypeFloat32>()},
|
|
||||||
{arrow::Type::FLOAT, std::make_shared<DataTypeFloat32>()},
|
|
||||||
{arrow::Type::DOUBLE, std::make_shared<DataTypeFloat64>()},
|
|
||||||
|
|
||||||
{arrow::Type::BOOL, std::make_shared<DataTypeUInt8>()},
|
static const std::initializer_list<std::pair<arrow::Type::type, const char *>> arrow_type_to_internal_type =
|
||||||
//{arrow::Type::DATE32, std::make_shared<DataTypeDate>()},
|
{
|
||||||
{arrow::Type::DATE32, std::make_shared<DataTypeDate>()},
|
{arrow::Type::UINT8, "UInt8"},
|
||||||
//{arrow::Type::DATE32, std::make_shared<DataTypeDateTime>()},
|
{arrow::Type::INT8, "Int8"},
|
||||||
{arrow::Type::DATE64, std::make_shared<DataTypeDateTime>()},
|
{arrow::Type::UINT16, "UInt16"},
|
||||||
{arrow::Type::TIMESTAMP, std::make_shared<DataTypeDateTime>()},
|
{arrow::Type::INT16, "Int16"},
|
||||||
//{arrow::Type::TIME32, std::make_shared<DataTypeDateTime>()},
|
{arrow::Type::UINT32, "UInt32"},
|
||||||
|
{arrow::Type::INT32, "Int32"},
|
||||||
|
{arrow::Type::UINT64, "UInt64"},
|
||||||
|
{arrow::Type::INT64, "Int64"},
|
||||||
|
{arrow::Type::HALF_FLOAT, "Float32"},
|
||||||
|
{arrow::Type::FLOAT, "Float32"},
|
||||||
|
{arrow::Type::DOUBLE, "Float64"},
|
||||||
|
|
||||||
|
{arrow::Type::BOOL, "UInt8"},
|
||||||
|
{arrow::Type::DATE32, "Date"},
|
||||||
|
{arrow::Type::DATE64, "DateTime"},
|
||||||
|
{arrow::Type::TIMESTAMP, "DateTime"},
|
||||||
|
|
||||||
{arrow::Type::STRING, std::make_shared<DataTypeString>()},
|
{arrow::Type::STRING, "String"},
|
||||||
{arrow::Type::BINARY, std::make_shared<DataTypeString>()},
|
{arrow::Type::BINARY, "String"},
|
||||||
//{arrow::Type::FIXED_SIZE_BINARY, std::make_shared<DataTypeString>()},
|
|
||||||
//{arrow::Type::UUID, std::make_shared<DataTypeString>()},
|
|
||||||
|
|
||||||
|
|
||||||
// TODO: add other types that are convertable to internal ones:
|
// TODO: add other types that are convertable to internal ones:
|
||||||
// 0. ENUM?
|
// 0. ENUM?
|
||||||
@ -308,15 +304,16 @@ namespace DB
|
|||||||
const auto decimal_type = static_cast<arrow::DecimalType *>(arrow_column->type().get());
|
const auto decimal_type = static_cast<arrow::DecimalType *>(arrow_column->type().get());
|
||||||
internal_nested_type = std::make_shared<DataTypeDecimal<Decimal128>>(decimal_type->precision(),
|
internal_nested_type = std::make_shared<DataTypeDecimal<Decimal128>>(decimal_type->precision(),
|
||||||
decimal_type->scale());
|
decimal_type->scale());
|
||||||
} else if (arrow_type_to_internal_type.find(arrow_type) != arrow_type_to_internal_type.end())
|
}
|
||||||
|
else if (auto internal_type_it = std::find_if(arrow_type_to_internal_type.begin(), arrow_type_to_internal_type.end(),
|
||||||
|
[=](auto && elem) { return elem.first == arrow_type; });
|
||||||
|
internal_type_it != arrow_type_to_internal_type.end())
|
||||||
{
|
{
|
||||||
internal_nested_type = arrow_type_to_internal_type.at(arrow_type);
|
internal_nested_type = DataTypeFactory::instance().get(internal_type_it->second);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
throw Exception
|
throw Exception{"The type \"" + arrow_column->type()->name() + "\" of an input column \"" + arrow_column->name()
|
||||||
{
|
|
||||||
"The type \"" + arrow_column->type()->name() + "\" of an input column \"" + arrow_column->name()
|
|
||||||
+ "\" is not supported for conversion from a " + format_name + " data format",
|
+ "\" is not supported for conversion from a " + format_name + " data format",
|
||||||
ErrorCodes::CANNOT_CONVERT_TYPE};
|
ErrorCodes::CANNOT_CONVERT_TYPE};
|
||||||
}
|
}
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
#include <Processors/Formats/Impl/CSVRowInputFormat.h>
|
#include <Processors/Formats/Impl/CSVRowInputFormat.h>
|
||||||
#include <Formats/FormatFactory.h>
|
#include <Formats/FormatFactory.h>
|
||||||
#include <DataTypes/DataTypeNullable.h>
|
#include <DataTypes/DataTypeNullable.h>
|
||||||
|
#include <DataTypes/DataTypeNothing.h>
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
@ -17,9 +18,9 @@ namespace ErrorCodes
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
CSVRowInputFormat::CSVRowInputFormat(
|
CSVRowInputFormat::CSVRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_,
|
||||||
ReadBuffer & in_, Block header_, Params params_, bool with_names_, const FormatSettings & format_settings_)
|
bool with_names_, const FormatSettings & format_settings_)
|
||||||
: IRowInputFormat(std::move(header_), in_, std::move(params_))
|
: RowInputFormatWithDiagnosticInfo(header_, in_, params_)
|
||||||
, with_names(with_names_)
|
, with_names(with_names_)
|
||||||
, format_settings(format_settings_)
|
, format_settings(format_settings_)
|
||||||
{
|
{
|
||||||
@ -79,72 +80,72 @@ void CSVRowInputFormat::addInputColumn(const String & column_name)
|
|||||||
column_indexes_for_input_fields.emplace_back(column_index);
|
column_indexes_for_input_fields.emplace_back(column_index);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void skipEndOfLine(ReadBuffer & istr)
|
static void skipEndOfLine(ReadBuffer & in)
|
||||||
{
|
{
|
||||||
/// \n (Unix) or \r\n (DOS/Windows) or \n\r (Mac OS Classic)
|
/// \n (Unix) or \r\n (DOS/Windows) or \n\r (Mac OS Classic)
|
||||||
|
|
||||||
if (*istr.position() == '\n')
|
if (*in.position() == '\n')
|
||||||
{
|
{
|
||||||
++istr.position();
|
++in.position();
|
||||||
if (!istr.eof() && *istr.position() == '\r')
|
if (!in.eof() && *in.position() == '\r')
|
||||||
++istr.position();
|
++in.position();
|
||||||
}
|
}
|
||||||
else if (*istr.position() == '\r')
|
else if (*in.position() == '\r')
|
||||||
{
|
{
|
||||||
++istr.position();
|
++in.position();
|
||||||
if (!istr.eof() && *istr.position() == '\n')
|
if (!in.eof() && *in.position() == '\n')
|
||||||
++istr.position();
|
++in.position();
|
||||||
else
|
else
|
||||||
throw Exception("Cannot parse CSV format: found \\r (CR) not followed by \\n (LF)."
|
throw Exception("Cannot parse CSV format: found \\r (CR) not followed by \\n (LF)."
|
||||||
" Line must end by \\n (LF) or \\r\\n (CR LF) or \\n\\r.", ErrorCodes::INCORRECT_DATA);
|
" Line must end by \\n (LF) or \\r\\n (CR LF) or \\n\\r.", ErrorCodes::INCORRECT_DATA);
|
||||||
}
|
}
|
||||||
else if (!istr.eof())
|
else if (!in.eof())
|
||||||
throw Exception("Expected end of line", ErrorCodes::INCORRECT_DATA);
|
throw Exception("Expected end of line", ErrorCodes::INCORRECT_DATA);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void skipDelimiter(ReadBuffer & istr, const char delimiter, bool is_last_column)
|
static void skipDelimiter(ReadBuffer & in, const char delimiter, bool is_last_column)
|
||||||
{
|
{
|
||||||
if (is_last_column)
|
if (is_last_column)
|
||||||
{
|
{
|
||||||
if (istr.eof())
|
if (in.eof())
|
||||||
return;
|
return;
|
||||||
|
|
||||||
/// we support the extra delimiter at the end of the line
|
/// we support the extra delimiter at the end of the line
|
||||||
if (*istr.position() == delimiter)
|
if (*in.position() == delimiter)
|
||||||
{
|
{
|
||||||
++istr.position();
|
++in.position();
|
||||||
if (istr.eof())
|
if (in.eof())
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
skipEndOfLine(istr);
|
skipEndOfLine(in);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
assertChar(delimiter, istr);
|
assertChar(delimiter, in);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/// Skip `whitespace` symbols allowed in CSV.
|
/// Skip `whitespace` symbols allowed in CSV.
|
||||||
static inline void skipWhitespacesAndTabs(ReadBuffer & buf)
|
static inline void skipWhitespacesAndTabs(ReadBuffer & in)
|
||||||
{
|
{
|
||||||
while (!buf.eof()
|
while (!in.eof()
|
||||||
&& (*buf.position() == ' '
|
&& (*in.position() == ' '
|
||||||
|| *buf.position() == '\t'))
|
|| *in.position() == '\t'))
|
||||||
++buf.position();
|
++in.position();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void skipRow(ReadBuffer & istr, const FormatSettings::CSV & settings, size_t num_columns)
|
static void skipRow(ReadBuffer & in, const FormatSettings::CSV & settings, size_t num_columns)
|
||||||
{
|
{
|
||||||
String tmp;
|
String tmp;
|
||||||
for (size_t i = 0; i < num_columns; ++i)
|
for (size_t i = 0; i < num_columns; ++i)
|
||||||
{
|
{
|
||||||
skipWhitespacesAndTabs(istr);
|
skipWhitespacesAndTabs(in);
|
||||||
readCSVString(tmp, istr, settings);
|
readCSVString(tmp, in, settings);
|
||||||
skipWhitespacesAndTabs(istr);
|
skipWhitespacesAndTabs(in);
|
||||||
|
|
||||||
skipDelimiter(istr, settings.delimiter, i + 1 == num_columns);
|
skipDelimiter(in, settings.delimiter, i + 1 == num_columns);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -156,7 +157,6 @@ void CSVRowInputFormat::readPrefix()
|
|||||||
skipBOMIfExists(in);
|
skipBOMIfExists(in);
|
||||||
|
|
||||||
size_t num_columns = data_types.size();
|
size_t num_columns = data_types.size();
|
||||||
String tmp;
|
|
||||||
auto & header = getPort().getHeader();
|
auto & header = getPort().getHeader();
|
||||||
|
|
||||||
if (with_names)
|
if (with_names)
|
||||||
@ -224,8 +224,7 @@ bool CSVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext
|
|||||||
for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column)
|
for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column)
|
||||||
{
|
{
|
||||||
const auto & table_column = column_indexes_for_input_fields[file_column];
|
const auto & table_column = column_indexes_for_input_fields[file_column];
|
||||||
const bool is_last_file_column =
|
const bool is_last_file_column = file_column + 1 == column_indexes_for_input_fields.size();
|
||||||
file_column + 1 == column_indexes_for_input_fields.size();
|
|
||||||
|
|
||||||
if (table_column)
|
if (table_column)
|
||||||
{
|
{
|
||||||
@ -267,71 +266,7 @@ bool CSVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool CSVRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out)
|
||||||
String CSVRowInputFormat::getDiagnosticInfo()
|
|
||||||
{
|
|
||||||
if (in.eof()) /// Buffer has gone, cannot extract information about what has been parsed.
|
|
||||||
return {};
|
|
||||||
|
|
||||||
WriteBufferFromOwnString out;
|
|
||||||
|
|
||||||
auto & header = getPort().getHeader();
|
|
||||||
MutableColumns columns = header.cloneEmptyColumns();
|
|
||||||
|
|
||||||
/// It is possible to display detailed diagnostics only if the last and next to last rows are still in the read buffer.
|
|
||||||
size_t bytes_read_at_start_of_buffer = in.count() - in.offset();
|
|
||||||
if (bytes_read_at_start_of_buffer != bytes_read_at_start_of_buffer_on_prev_row)
|
|
||||||
{
|
|
||||||
out << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n";
|
|
||||||
return out.str();
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t max_length_of_column_name = 0;
|
|
||||||
for (size_t i = 0; i < header.columns(); ++i)
|
|
||||||
if (header.safeGetByPosition(i).name.size() > max_length_of_column_name)
|
|
||||||
max_length_of_column_name = header.safeGetByPosition(i).name.size();
|
|
||||||
|
|
||||||
size_t max_length_of_data_type_name = 0;
|
|
||||||
for (size_t i = 0; i < header.columns(); ++i)
|
|
||||||
if (header.safeGetByPosition(i).type->getName().size() > max_length_of_data_type_name)
|
|
||||||
max_length_of_data_type_name = header.safeGetByPosition(i).type->getName().size();
|
|
||||||
|
|
||||||
/// Roll back the cursor to the beginning of the previous or current row and parse all over again. But now we derive detailed information.
|
|
||||||
|
|
||||||
if (pos_of_prev_row)
|
|
||||||
{
|
|
||||||
in.position() = pos_of_prev_row;
|
|
||||||
|
|
||||||
out << "\nRow " << (row_num - 1) << ":\n";
|
|
||||||
if (!parseRowAndPrintDiagnosticInfo(columns, out, max_length_of_column_name, max_length_of_data_type_name))
|
|
||||||
return out.str();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (!pos_of_current_row)
|
|
||||||
{
|
|
||||||
out << "Could not print diagnostic info because parsing of data hasn't started.\n";
|
|
||||||
return out.str();
|
|
||||||
}
|
|
||||||
|
|
||||||
in.position() = pos_of_current_row;
|
|
||||||
}
|
|
||||||
|
|
||||||
out << "\nRow " << row_num << ":\n";
|
|
||||||
parseRowAndPrintDiagnosticInfo(columns, out, max_length_of_column_name, max_length_of_data_type_name);
|
|
||||||
out << "\n";
|
|
||||||
|
|
||||||
return out.str();
|
|
||||||
}
|
|
||||||
|
|
||||||
/** gcc-7 generates wrong code with optimization level greater than 1.
|
|
||||||
* See tests: dbms/src/IO/tests/write_int.cpp
|
|
||||||
* and dbms/tests/queries/0_stateless/00898_parsing_bad_diagnostic_message.sh
|
|
||||||
* This is compiler bug. The bug does not present in gcc-8 and clang-8.
|
|
||||||
* Nevertheless, we don't need high optimization of this function.
|
|
||||||
*/
|
|
||||||
bool OPTIMIZE(1) CSVRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
|
|
||||||
WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
|
|
||||||
{
|
{
|
||||||
const char delimiter = format_settings.csv.delimiter;
|
const char delimiter = format_settings.csv.delimiter;
|
||||||
|
|
||||||
@ -345,100 +280,19 @@ bool OPTIMIZE(1) CSVRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumn
|
|||||||
|
|
||||||
if (column_indexes_for_input_fields[file_column].has_value())
|
if (column_indexes_for_input_fields[file_column].has_value())
|
||||||
{
|
{
|
||||||
const auto & table_column = *column_indexes_for_input_fields[file_column];
|
|
||||||
const auto & current_column_type = data_types[table_column];
|
|
||||||
const bool is_last_file_column =
|
|
||||||
file_column + 1 == column_indexes_for_input_fields.size();
|
|
||||||
const bool at_delimiter = !in.eof() && *in.position() == delimiter;
|
|
||||||
const bool at_last_column_line_end = is_last_file_column
|
|
||||||
&& (in.eof() || *in.position() == '\n' || *in.position() == '\r');
|
|
||||||
|
|
||||||
auto & header = getPort().getHeader();
|
auto & header = getPort().getHeader();
|
||||||
out << "Column " << file_column << ", " << std::string((file_column < 10 ? 2 : file_column < 100 ? 1 : 0), ' ')
|
size_t col_idx = column_indexes_for_input_fields[file_column].value();
|
||||||
<< "name: " << header.safeGetByPosition(table_column).name << ", " << std::string(max_length_of_column_name - header.safeGetByPosition(table_column).name.size(), ' ')
|
if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx], *columns[col_idx],
|
||||||
<< "type: " << current_column_type->getName() << ", " << std::string(max_length_of_data_type_name - current_column_type->getName().size(), ' ');
|
out, file_column))
|
||||||
|
|
||||||
if (format_settings.csv.empty_as_default
|
|
||||||
&& (at_delimiter || at_last_column_line_end))
|
|
||||||
{
|
|
||||||
columns[table_column]->insertDefault();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
BufferBase::Position prev_position = in.position();
|
|
||||||
BufferBase::Position curr_position = in.position();
|
|
||||||
std::exception_ptr exception;
|
|
||||||
|
|
||||||
try
|
|
||||||
{
|
|
||||||
skipWhitespacesAndTabs(in);
|
|
||||||
prev_position = in.position();
|
|
||||||
readField(*columns[table_column], current_column_type, is_last_file_column, table_column);
|
|
||||||
curr_position = in.position();
|
|
||||||
skipWhitespacesAndTabs(in);
|
|
||||||
}
|
|
||||||
catch (...)
|
|
||||||
{
|
|
||||||
exception = std::current_exception();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (curr_position < prev_position)
|
|
||||||
throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);
|
|
||||||
|
|
||||||
if (isNativeNumber(current_column_type) || isDateOrDateTime(current_column_type))
|
|
||||||
{
|
|
||||||
/// An empty string instead of a value.
|
|
||||||
if (curr_position == prev_position)
|
|
||||||
{
|
|
||||||
out << "ERROR: text ";
|
|
||||||
verbosePrintString(prev_position, std::min(prev_position + 10, in.buffer().end()), out);
|
|
||||||
out << " is not like " << current_column_type->getName() << "\n";
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
out << "parsed text: ";
|
|
||||||
verbosePrintString(prev_position, curr_position, out);
|
|
||||||
|
|
||||||
if (exception)
|
|
||||||
{
|
|
||||||
if (current_column_type->getName() == "DateTime")
|
|
||||||
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
|
|
||||||
else if (current_column_type->getName() == "Date")
|
|
||||||
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
|
|
||||||
else
|
|
||||||
out << "ERROR\n";
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
out << "\n";
|
|
||||||
|
|
||||||
if (current_column_type->haveMaximumSizeOfValue()
|
|
||||||
&& *curr_position != '\n' && *curr_position != '\r'
|
|
||||||
&& *curr_position != delimiter)
|
|
||||||
{
|
|
||||||
out << "ERROR: garbage after " << current_column_type->getName() << ": ";
|
|
||||||
verbosePrintString(curr_position, std::min(curr_position + 10, in.buffer().end()), out);
|
|
||||||
out << "\n";
|
|
||||||
|
|
||||||
if (current_column_type->getName() == "DateTime")
|
|
||||||
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
|
|
||||||
else if (current_column_type->getName() == "Date")
|
|
||||||
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
static const String skipped_column_str = "<SKIPPED COLUMN>";
|
static const String skipped_column_str = "<SKIPPED COLUMN>";
|
||||||
out << "Column " << file_column << ", " << std::string((file_column < 10 ? 2 : file_column < 100 ? 1 : 0), ' ')
|
static const DataTypePtr skipped_column_type = std::make_shared<DataTypeNothing>();
|
||||||
<< "name: " << skipped_column_str << ", " << std::string(max_length_of_column_name - skipped_column_str.length(), ' ')
|
static const MutableColumnPtr skipped_column = skipped_column_type->createColumn();
|
||||||
<< "type: " << skipped_column_str << ", " << std::string(max_length_of_data_type_name - skipped_column_str.length(), ' ');
|
if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, file_column))
|
||||||
|
return false;
|
||||||
String tmp;
|
|
||||||
readCSVString(tmp, in, format_settings.csv);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Delimiters
|
/// Delimiters
|
||||||
@ -502,15 +356,26 @@ void CSVRowInputFormat::syncAfterError()
|
|||||||
skipToNextLineOrEOF(in);
|
skipToNextLineOrEOF(in);
|
||||||
}
|
}
|
||||||
|
|
||||||
void CSVRowInputFormat::updateDiagnosticInfo()
|
void CSVRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
|
||||||
|
ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos)
|
||||||
{
|
{
|
||||||
++row_num;
|
skipWhitespacesAndTabs(in);
|
||||||
|
prev_pos = in.position();
|
||||||
|
|
||||||
bytes_read_at_start_of_buffer_on_prev_row = bytes_read_at_start_of_buffer_on_current_row;
|
if (column_indexes_for_input_fields[file_column])
|
||||||
bytes_read_at_start_of_buffer_on_current_row = in.count() - in.offset();
|
{
|
||||||
|
const bool is_last_file_column = file_column + 1 == column_indexes_for_input_fields.size();
|
||||||
|
if (!readField(column, type, is_last_file_column, *column_indexes_for_input_fields[file_column]))
|
||||||
|
column.insertDefault();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
String tmp;
|
||||||
|
readCSVString(tmp, in, format_settings.csv);
|
||||||
|
}
|
||||||
|
|
||||||
pos_of_prev_row = pos_of_current_row;
|
curr_pos = in.position();
|
||||||
pos_of_current_row = in.position();
|
skipWhitespacesAndTabs(in);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column, size_t column_idx)
|
bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column, size_t column_idx)
|
||||||
@ -563,7 +428,7 @@ void registerInputFormatProcessorCSV(FormatFactory & factory)
|
|||||||
IRowInputFormat::Params params,
|
IRowInputFormat::Params params,
|
||||||
const FormatSettings & settings)
|
const FormatSettings & settings)
|
||||||
{
|
{
|
||||||
return std::make_shared<CSVRowInputFormat>(buf, sample, std::move(params), with_names, settings);
|
return std::make_shared<CSVRowInputFormat>(sample, buf, params, with_names, settings);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,40 +1,38 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include <optional>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
#include <Core/Block.h>
|
#include <Core/Block.h>
|
||||||
#include <Processors/Formats/IRowInputFormat.h>
|
#include <Processors/Formats/RowInputFormatWithDiagnosticInfo.h>
|
||||||
#include <Formats/FormatSettings.h>
|
#include <Formats/FormatSettings.h>
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
{
|
{
|
||||||
|
|
||||||
class ReadBuffer;
|
|
||||||
|
|
||||||
/** A stream for inputting data in csv format.
|
/** A stream for inputting data in csv format.
|
||||||
* Does not conform with https://tools.ietf.org/html/rfc4180 because it skips spaces and tabs between values.
|
* Does not conform with https://tools.ietf.org/html/rfc4180 because it skips spaces and tabs between values.
|
||||||
*/
|
*/
|
||||||
class CSVRowInputFormat : public IRowInputFormat
|
class CSVRowInputFormat : public RowInputFormatWithDiagnosticInfo
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
/** with_names - in the first line the header with column names
|
/** with_names - in the first line the header with column names
|
||||||
* with_types - on the next line header with type names
|
|
||||||
*/
|
*/
|
||||||
CSVRowInputFormat(ReadBuffer & in_, Block header_, Params params_, bool with_names_, const FormatSettings & format_settings_);
|
CSVRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_,
|
||||||
|
bool with_names_, const FormatSettings & format_settings_);
|
||||||
|
|
||||||
String getName() const override { return "CSVRowInputFormat"; }
|
String getName() const override { return "CSVRowInputFormat"; }
|
||||||
|
|
||||||
bool readRow(MutableColumns & columns, RowReadExtension &) override;
|
bool readRow(MutableColumns & columns, RowReadExtension & ext) override;
|
||||||
void readPrefix() override;
|
void readPrefix() override;
|
||||||
bool allowSyncAfterError() const override { return true; }
|
bool allowSyncAfterError() const override { return true; }
|
||||||
void syncAfterError() override;
|
void syncAfterError() override;
|
||||||
|
|
||||||
std::string getDiagnosticInfo() override;
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool with_names;
|
bool with_names;
|
||||||
DataTypes data_types;
|
|
||||||
|
|
||||||
const FormatSettings format_settings;
|
const FormatSettings format_settings;
|
||||||
|
DataTypes data_types;
|
||||||
|
|
||||||
using IndexesMap = std::unordered_map<String, size_t>;
|
using IndexesMap = std::unordered_map<String, size_t>;
|
||||||
IndexesMap column_indexes_by_names;
|
IndexesMap column_indexes_by_names;
|
||||||
@ -43,7 +41,7 @@ private:
|
|||||||
using OptionalIndexes = std::vector<std::optional<size_t>>;
|
using OptionalIndexes = std::vector<std::optional<size_t>>;
|
||||||
OptionalIndexes column_indexes_for_input_fields;
|
OptionalIndexes column_indexes_for_input_fields;
|
||||||
|
|
||||||
/// Tracks which colums we have read in a single read() call.
|
/// Tracks which columns we have read in a single read() call.
|
||||||
/// For columns that are never read, it is initialized to false when we
|
/// For columns that are never read, it is initialized to false when we
|
||||||
/// read the file header, and never changed afterwards.
|
/// read the file header, and never changed afterwards.
|
||||||
/// For other columns, it is updated on each read() call.
|
/// For other columns, it is updated on each read() call.
|
||||||
@ -55,26 +53,19 @@ private:
|
|||||||
|
|
||||||
void addInputColumn(const String & column_name);
|
void addInputColumn(const String & column_name);
|
||||||
|
|
||||||
/// For convenient diagnostics in case of an error.
|
bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override;
|
||||||
size_t row_num = 0;
|
void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
|
||||||
|
ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) override;
|
||||||
/// How many bytes were read, not counting those that are still in the buffer.
|
bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override
|
||||||
size_t bytes_read_at_start_of_buffer_on_current_row = 0;
|
{
|
||||||
size_t bytes_read_at_start_of_buffer_on_prev_row = 0;
|
return *pos != '\n' && *pos != '\r' && *pos != format_settings.csv.delimiter;
|
||||||
|
}
|
||||||
char * pos_of_current_row = nullptr;
|
|
||||||
char * pos_of_prev_row = nullptr;
|
|
||||||
|
|
||||||
/// For setting input_format_null_as_default
|
/// For setting input_format_null_as_default
|
||||||
DataTypes nullable_types;
|
DataTypes nullable_types;
|
||||||
MutableColumns nullable_columns;
|
MutableColumns nullable_columns;
|
||||||
OptionalIndexes column_idx_to_nullable_column_idx;
|
OptionalIndexes column_idx_to_nullable_column_idx;
|
||||||
|
|
||||||
void updateDiagnosticInfo();
|
|
||||||
|
|
||||||
bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
|
|
||||||
WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name);
|
|
||||||
|
|
||||||
bool readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column, size_t column_idx);
|
bool readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column, size_t column_idx);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
#include <Processors/Formats/Impl/TabSeparatedRowInputFormat.h>
|
#include <Processors/Formats/Impl/TabSeparatedRowInputFormat.h>
|
||||||
#include <Formats/verbosePrintString.h>
|
#include <Formats/verbosePrintString.h>
|
||||||
#include <Formats/FormatFactory.h>
|
#include <Formats/FormatFactory.h>
|
||||||
|
#include <DataTypes/DataTypeNothing.h>
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
{
|
{
|
||||||
@ -16,23 +17,23 @@ namespace ErrorCodes
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void skipTSVRow(ReadBuffer & istr, const size_t num_columns)
|
static void skipTSVRow(ReadBuffer & in, const size_t num_columns)
|
||||||
{
|
{
|
||||||
NullSink null_sink;
|
NullSink null_sink;
|
||||||
|
|
||||||
for (size_t i = 0; i < num_columns; ++i)
|
for (size_t i = 0; i < num_columns; ++i)
|
||||||
{
|
{
|
||||||
readEscapedStringInto(null_sink, istr);
|
readEscapedStringInto(null_sink, in);
|
||||||
assertChar(i == num_columns - 1 ? '\n' : '\t', istr);
|
assertChar(i == num_columns - 1 ? '\n' : '\t', in);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/** Check for a common error case - usage of Windows line feed.
|
/** Check for a common error case - usage of Windows line feed.
|
||||||
*/
|
*/
|
||||||
static void checkForCarriageReturn(ReadBuffer & istr)
|
static void checkForCarriageReturn(ReadBuffer & in)
|
||||||
{
|
{
|
||||||
if (istr.position()[0] == '\r' || (istr.position() != istr.buffer().begin() && istr.position()[-1] == '\r'))
|
if (in.position()[0] == '\r' || (in.position() != in.buffer().begin() && in.position()[-1] == '\r'))
|
||||||
throw Exception("\nYou have carriage return (\\r, 0x0D, ASCII 13) at end of first row."
|
throw Exception("\nYou have carriage return (\\r, 0x0D, ASCII 13) at end of first row."
|
||||||
"\nIt's like your input data has DOS/Windows style line separators, that are illegal in TabSeparated format."
|
"\nIt's like your input data has DOS/Windows style line separators, that are illegal in TabSeparated format."
|
||||||
" You must transform your file to Unix format."
|
" You must transform your file to Unix format."
|
||||||
@ -41,9 +42,9 @@ static void checkForCarriageReturn(ReadBuffer & istr)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
TabSeparatedRowInputFormat::TabSeparatedRowInputFormat(
|
TabSeparatedRowInputFormat::TabSeparatedRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_,
|
||||||
ReadBuffer & in_, Block header_, bool with_names_, bool with_types_, Params params_, const FormatSettings & format_settings_)
|
bool with_names_, bool with_types_, const FormatSettings & format_settings_)
|
||||||
: IRowInputFormat(std::move(header_), in_, std::move(params_)), with_names(with_names_), with_types(with_types_), format_settings(format_settings_)
|
: RowInputFormatWithDiagnosticInfo(header_, in_, params_), with_names(with_names_), with_types(with_types_), format_settings(format_settings_)
|
||||||
{
|
{
|
||||||
auto & sample = getPort().getHeader();
|
auto & sample = getPort().getHeader();
|
||||||
size_t num_columns = sample.columns();
|
size_t num_columns = sample.columns();
|
||||||
@ -173,9 +174,9 @@ bool TabSeparatedRowInputFormat::readRow(MutableColumns & columns, RowReadExtens
|
|||||||
|
|
||||||
updateDiagnosticInfo();
|
updateDiagnosticInfo();
|
||||||
|
|
||||||
for (size_t input_position = 0; input_position < column_indexes_for_input_fields.size(); ++input_position)
|
for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column)
|
||||||
{
|
{
|
||||||
const auto & column_index = column_indexes_for_input_fields[input_position];
|
const auto & column_index = column_indexes_for_input_fields[file_column];
|
||||||
if (column_index)
|
if (column_index)
|
||||||
{
|
{
|
||||||
data_types[*column_index]->deserializeAsTextEscaped(*columns[*column_index], in, format_settings);
|
data_types[*column_index]->deserializeAsTextEscaped(*columns[*column_index], in, format_settings);
|
||||||
@ -187,7 +188,7 @@ bool TabSeparatedRowInputFormat::readRow(MutableColumns & columns, RowReadExtens
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// skip separators
|
/// skip separators
|
||||||
if (input_position + 1 < column_indexes_for_input_fields.size())
|
if (file_column + 1 < column_indexes_for_input_fields.size())
|
||||||
{
|
{
|
||||||
assertChar('\t', in);
|
assertChar('\t', in);
|
||||||
}
|
}
|
||||||
@ -205,160 +206,35 @@ bool TabSeparatedRowInputFormat::readRow(MutableColumns & columns, RowReadExtens
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out)
|
||||||
String TabSeparatedRowInputFormat::getDiagnosticInfo()
|
|
||||||
{
|
{
|
||||||
if (in.eof()) /// Buffer has gone, cannot extract information about what has been parsed.
|
for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column)
|
||||||
return {};
|
|
||||||
|
|
||||||
auto & header = getPort().getHeader();
|
|
||||||
WriteBufferFromOwnString out;
|
|
||||||
MutableColumns columns = header.cloneEmptyColumns();
|
|
||||||
|
|
||||||
/// It is possible to display detailed diagnostics only if the last and next to last lines are still in the read buffer.
|
|
||||||
size_t bytes_read_at_start_of_buffer = in.count() - in.offset();
|
|
||||||
if (bytes_read_at_start_of_buffer != bytes_read_at_start_of_buffer_on_prev_row)
|
|
||||||
{
|
{
|
||||||
out << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n";
|
if (file_column == 0 && in.eof())
|
||||||
return out.str();
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t max_length_of_column_name = 0;
|
|
||||||
for (size_t i = 0; i < header.columns(); ++i)
|
|
||||||
if (header.safeGetByPosition(i).name.size() > max_length_of_column_name)
|
|
||||||
max_length_of_column_name = header.safeGetByPosition(i).name.size();
|
|
||||||
|
|
||||||
size_t max_length_of_data_type_name = 0;
|
|
||||||
for (size_t i = 0; i < header.columns(); ++i)
|
|
||||||
if (header.safeGetByPosition(i).type->getName().size() > max_length_of_data_type_name)
|
|
||||||
max_length_of_data_type_name = header.safeGetByPosition(i).type->getName().size();
|
|
||||||
|
|
||||||
/// Roll back the cursor to the beginning of the previous or current line and pars all over again. But now we derive detailed information.
|
|
||||||
|
|
||||||
if (pos_of_prev_row)
|
|
||||||
{
|
|
||||||
in.position() = pos_of_prev_row;
|
|
||||||
|
|
||||||
out << "\nRow " << (row_num - 1) << ":\n";
|
|
||||||
if (!parseRowAndPrintDiagnosticInfo(columns, out, max_length_of_column_name, max_length_of_data_type_name))
|
|
||||||
return out.str();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (!pos_of_current_row)
|
|
||||||
{
|
|
||||||
out << "Could not print diagnostic info because parsing of data hasn't started.\n";
|
|
||||||
return out.str();
|
|
||||||
}
|
|
||||||
|
|
||||||
in.position() = pos_of_current_row;
|
|
||||||
}
|
|
||||||
|
|
||||||
out << "\nRow " << row_num << ":\n";
|
|
||||||
parseRowAndPrintDiagnosticInfo(columns, out, max_length_of_column_name, max_length_of_data_type_name);
|
|
||||||
out << "\n";
|
|
||||||
|
|
||||||
return out.str();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
bool TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
|
|
||||||
WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
|
|
||||||
{
|
|
||||||
for (size_t input_position = 0; input_position < column_indexes_for_input_fields.size(); ++input_position)
|
|
||||||
{
|
|
||||||
if (input_position == 0 && in.eof())
|
|
||||||
{
|
{
|
||||||
out << "<End of stream>\n";
|
out << "<End of stream>\n";
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (column_indexes_for_input_fields[input_position].has_value())
|
if (column_indexes_for_input_fields[file_column].has_value())
|
||||||
{
|
{
|
||||||
const auto & column_index = *column_indexes_for_input_fields[input_position];
|
auto & header = getPort().getHeader();
|
||||||
const auto & current_column_type = data_types[column_index];
|
size_t col_idx = column_indexes_for_input_fields[file_column].value();
|
||||||
|
if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx], *columns[col_idx],
|
||||||
const auto & header = getPort().getHeader();
|
out, file_column))
|
||||||
|
|
||||||
out << "Column " << input_position << ", " << std::string((input_position < 10 ? 2 : input_position < 100 ? 1 : 0), ' ')
|
|
||||||
<< "name: " << header.safeGetByPosition(column_index).name << ", " << std::string(max_length_of_column_name - header.safeGetByPosition(column_index).name.size(), ' ')
|
|
||||||
<< "type: " << current_column_type->getName() << ", " << std::string(max_length_of_data_type_name - current_column_type->getName().size(), ' ');
|
|
||||||
|
|
||||||
auto prev_position = in.position();
|
|
||||||
std::exception_ptr exception;
|
|
||||||
|
|
||||||
try
|
|
||||||
{
|
|
||||||
current_column_type->deserializeAsTextEscaped(*columns[column_index], in, format_settings);
|
|
||||||
}
|
|
||||||
catch (...)
|
|
||||||
{
|
|
||||||
exception = std::current_exception();
|
|
||||||
}
|
|
||||||
|
|
||||||
auto curr_position = in.position();
|
|
||||||
|
|
||||||
if (curr_position < prev_position)
|
|
||||||
throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);
|
|
||||||
|
|
||||||
if (isNativeNumber(current_column_type) || isDateOrDateTime(current_column_type))
|
|
||||||
{
|
|
||||||
/// An empty string instead of a value.
|
|
||||||
if (curr_position == prev_position)
|
|
||||||
{
|
|
||||||
out << "ERROR: text ";
|
|
||||||
verbosePrintString(prev_position, std::min(prev_position + 10, in.buffer().end()), out);
|
|
||||||
out << " is not like " << current_column_type->getName() << "\n";
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
out << "parsed text: ";
|
|
||||||
verbosePrintString(prev_position, curr_position, out);
|
|
||||||
|
|
||||||
if (exception)
|
|
||||||
{
|
|
||||||
if (current_column_type->getName() == "DateTime")
|
|
||||||
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
|
|
||||||
else if (current_column_type->getName() == "Date")
|
|
||||||
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
|
|
||||||
else
|
|
||||||
out << "ERROR\n";
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
out << "\n";
|
|
||||||
|
|
||||||
if (current_column_type->haveMaximumSizeOfValue())
|
|
||||||
{
|
|
||||||
if (*curr_position != '\n' && *curr_position != '\t')
|
|
||||||
{
|
|
||||||
out << "ERROR: garbage after " << current_column_type->getName() << ": ";
|
|
||||||
verbosePrintString(curr_position, std::min(curr_position + 10, in.buffer().end()), out);
|
|
||||||
out << "\n";
|
|
||||||
|
|
||||||
if (current_column_type->getName() == "DateTime")
|
|
||||||
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
|
|
||||||
else if (current_column_type->getName() == "Date")
|
|
||||||
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
static const String skipped_column_str = "<SKIPPED COLUMN>";
|
static const String skipped_column_str = "<SKIPPED COLUMN>";
|
||||||
out << "Column " << input_position << ", " << std::string((input_position < 10 ? 2 : input_position < 100 ? 1 : 0), ' ')
|
static const DataTypePtr skipped_column_type = std::make_shared<DataTypeNothing>();
|
||||||
<< "name: " << skipped_column_str << ", " << std::string(max_length_of_column_name - skipped_column_str.length(), ' ')
|
static const MutableColumnPtr skipped_column = skipped_column_type->createColumn();
|
||||||
<< "type: " << skipped_column_str << ", " << std::string(max_length_of_data_type_name - skipped_column_str.length(), ' ');
|
if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, file_column))
|
||||||
|
return false;
|
||||||
NullSink null_sink;
|
|
||||||
readEscapedStringInto(null_sink, in);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Delimiters
|
/// Delimiters
|
||||||
if (input_position + 1 == column_indexes_for_input_fields.size())
|
if (file_column + 1 == column_indexes_for_input_fields.size())
|
||||||
{
|
{
|
||||||
if (!in.eof())
|
if (!in.eof())
|
||||||
{
|
{
|
||||||
@ -401,7 +277,8 @@ bool TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns &
|
|||||||
{
|
{
|
||||||
out << "ERROR: Line feed found where tab is expected."
|
out << "ERROR: Line feed found where tab is expected."
|
||||||
" It's like your file has less columns than expected.\n"
|
" It's like your file has less columns than expected.\n"
|
||||||
"And if your file have right number of columns, maybe it have unescaped backslash in value before tab, which cause tab has escaped.\n";
|
"And if your file have right number of columns, "
|
||||||
|
"maybe it have unescaped backslash in value before tab, which cause tab has escaped.\n";
|
||||||
}
|
}
|
||||||
else if (*in.position() == '\r')
|
else if (*in.position() == '\r')
|
||||||
{
|
{
|
||||||
@ -421,6 +298,19 @@ bool TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns &
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void TabSeparatedRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
|
||||||
|
ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos)
|
||||||
|
{
|
||||||
|
prev_pos = in.position();
|
||||||
|
if (column_indexes_for_input_fields[file_column])
|
||||||
|
type->deserializeAsTextEscaped(column, in, format_settings);
|
||||||
|
else
|
||||||
|
{
|
||||||
|
NullSink null_sink;
|
||||||
|
readEscapedStringInto(null_sink, in);
|
||||||
|
}
|
||||||
|
curr_pos = in.position();
|
||||||
|
}
|
||||||
|
|
||||||
void TabSeparatedRowInputFormat::syncAfterError()
|
void TabSeparatedRowInputFormat::syncAfterError()
|
||||||
{
|
{
|
||||||
@ -428,18 +318,6 @@ void TabSeparatedRowInputFormat::syncAfterError()
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void TabSeparatedRowInputFormat::updateDiagnosticInfo()
|
|
||||||
{
|
|
||||||
++row_num;
|
|
||||||
|
|
||||||
bytes_read_at_start_of_buffer_on_prev_row = bytes_read_at_start_of_buffer_on_current_row;
|
|
||||||
bytes_read_at_start_of_buffer_on_current_row = in.count() - in.offset();
|
|
||||||
|
|
||||||
pos_of_prev_row = pos_of_current_row;
|
|
||||||
pos_of_current_row = in.position();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void registerInputFormatProcessorTabSeparated(FormatFactory & factory)
|
void registerInputFormatProcessorTabSeparated(FormatFactory & factory)
|
||||||
{
|
{
|
||||||
for (auto name : {"TabSeparated", "TSV"})
|
for (auto name : {"TabSeparated", "TSV"})
|
||||||
@ -451,7 +329,7 @@ void registerInputFormatProcessorTabSeparated(FormatFactory & factory)
|
|||||||
IRowInputFormat::Params params,
|
IRowInputFormat::Params params,
|
||||||
const FormatSettings & settings)
|
const FormatSettings & settings)
|
||||||
{
|
{
|
||||||
return std::make_shared<TabSeparatedRowInputFormat>(buf, sample, false, false, std::move(params), settings);
|
return std::make_shared<TabSeparatedRowInputFormat>(sample, buf, params, false, false, settings);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -464,7 +342,7 @@ void registerInputFormatProcessorTabSeparated(FormatFactory & factory)
|
|||||||
IRowInputFormat::Params params,
|
IRowInputFormat::Params params,
|
||||||
const FormatSettings & settings)
|
const FormatSettings & settings)
|
||||||
{
|
{
|
||||||
return std::make_shared<TabSeparatedRowInputFormat>(buf, sample, true, false, std::move(params), settings);
|
return std::make_shared<TabSeparatedRowInputFormat>(sample, buf, params, true, false, settings);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -477,7 +355,7 @@ void registerInputFormatProcessorTabSeparated(FormatFactory & factory)
|
|||||||
IRowInputFormat::Params params,
|
IRowInputFormat::Params params,
|
||||||
const FormatSettings & settings)
|
const FormatSettings & settings)
|
||||||
{
|
{
|
||||||
return std::make_shared<TabSeparatedRowInputFormat>(buf, sample, true, true, std::move(params), settings);
|
return std::make_shared<TabSeparatedRowInputFormat>(sample, buf, params, true, true, settings);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2,25 +2,22 @@
|
|||||||
|
|
||||||
#include <Core/Block.h>
|
#include <Core/Block.h>
|
||||||
#include <Formats/FormatSettings.h>
|
#include <Formats/FormatSettings.h>
|
||||||
#include <Processors/Formats/IRowInputFormat.h>
|
#include <Processors/Formats/RowInputFormatWithDiagnosticInfo.h>
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
{
|
{
|
||||||
|
|
||||||
class ReadBuffer;
|
|
||||||
|
|
||||||
|
|
||||||
/** A stream to input data in tsv format.
|
/** A stream to input data in tsv format.
|
||||||
*/
|
*/
|
||||||
class TabSeparatedRowInputFormat : public IRowInputFormat
|
class TabSeparatedRowInputFormat : public RowInputFormatWithDiagnosticInfo
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
/** with_names - the first line is the header with the names of the columns
|
/** with_names - the first line is the header with the names of the columns
|
||||||
* with_types - on the next line header with type names
|
* with_types - on the next line header with type names
|
||||||
*/
|
*/
|
||||||
TabSeparatedRowInputFormat(
|
TabSeparatedRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_,
|
||||||
ReadBuffer & in_, Block header_, bool with_names_, bool with_types_, Params params_, const FormatSettings & format_settings_);
|
bool with_names_, bool with_types_, const FormatSettings & format_settings_);
|
||||||
|
|
||||||
String getName() const override { return "TabSeparatedRowInputFormat"; }
|
String getName() const override { return "TabSeparatedRowInputFormat"; }
|
||||||
|
|
||||||
@ -29,8 +26,6 @@ public:
|
|||||||
bool allowSyncAfterError() const override { return true; }
|
bool allowSyncAfterError() const override { return true; }
|
||||||
void syncAfterError() override;
|
void syncAfterError() override;
|
||||||
|
|
||||||
std::string getDiagnosticInfo() override;
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool with_names;
|
bool with_names;
|
||||||
bool with_types;
|
bool with_types;
|
||||||
@ -50,21 +45,10 @@ private:
|
|||||||
void setupAllColumnsByTableSchema();
|
void setupAllColumnsByTableSchema();
|
||||||
void fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension& ext);
|
void fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension& ext);
|
||||||
|
|
||||||
/// For convenient diagnostics in case of an error.
|
bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override;
|
||||||
|
void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
|
||||||
size_t row_num = 0;
|
ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) override;
|
||||||
|
bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\t'; }
|
||||||
/// How many bytes were read, not counting those still in the buffer.
|
|
||||||
size_t bytes_read_at_start_of_buffer_on_current_row = 0;
|
|
||||||
size_t bytes_read_at_start_of_buffer_on_prev_row = 0;
|
|
||||||
|
|
||||||
char * pos_of_current_row = nullptr;
|
|
||||||
char * pos_of_prev_row = nullptr;
|
|
||||||
|
|
||||||
void updateDiagnosticInfo();
|
|
||||||
|
|
||||||
bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
|
|
||||||
WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
243
dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp
Normal file
243
dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp
Normal file
@ -0,0 +1,243 @@
|
|||||||
|
#include <Processors/Formats/Impl/TemplateBlockOutputFormat.h>
|
||||||
|
#include <Formats/FormatFactory.h>
|
||||||
|
#include <IO/WriteHelpers.h>
|
||||||
|
#include <DataTypes/DataTypesNumber.h>
|
||||||
|
|
||||||
|
|
||||||
|
namespace DB
|
||||||
|
{
|
||||||
|
|
||||||
|
namespace ErrorCodes
|
||||||
|
{
|
||||||
|
extern const int SYNTAX_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
TemplateBlockOutputFormat::TemplateBlockOutputFormat(const Block & header_, WriteBuffer & out_, const FormatSettings & settings_)
|
||||||
|
: IOutputFormat(header_, out_), settings(settings_)
|
||||||
|
{
|
||||||
|
auto & sample = getPort(PortKind::Main).getHeader();
|
||||||
|
size_t columns = sample.columns();
|
||||||
|
types.resize(columns);
|
||||||
|
for (size_t i = 0; i < columns; ++i)
|
||||||
|
types[i] = sample.safeGetByPosition(i).type;
|
||||||
|
|
||||||
|
/// Parse format string for whole output
|
||||||
|
static const String default_format("${data}");
|
||||||
|
const String & format_str = settings.template_settings.format.empty() ? default_format : settings.template_settings.format;
|
||||||
|
format = ParsedTemplateFormatString(format_str, [&](const String & partName)
|
||||||
|
{
|
||||||
|
return static_cast<size_t>(stringToOutputPart(partName));
|
||||||
|
});
|
||||||
|
|
||||||
|
/// Validate format string for whole output
|
||||||
|
size_t data_idx = format.format_idx_to_column_idx.size() + 1;
|
||||||
|
for (size_t i = 0; i < format.format_idx_to_column_idx.size(); ++i)
|
||||||
|
{
|
||||||
|
if (!format.format_idx_to_column_idx[i])
|
||||||
|
format.throwInvalidFormat("Output part name cannot be empty, it's a bug.", i);
|
||||||
|
switch (static_cast<OutputPart>(*format.format_idx_to_column_idx[i]))
|
||||||
|
{
|
||||||
|
case OutputPart::Data:
|
||||||
|
data_idx = i;
|
||||||
|
[[fallthrough]];
|
||||||
|
case OutputPart::Totals:
|
||||||
|
case OutputPart::ExtremesMin:
|
||||||
|
case OutputPart::ExtremesMax:
|
||||||
|
if (format.formats[i] != ColumnFormat::None)
|
||||||
|
format.throwInvalidFormat("Serialization type for data, totals, min and max must be empty or None", i);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
if (format.formats[i] == ColumnFormat::None)
|
||||||
|
format.throwInvalidFormat("Serialization type for output part rows, rows_before_limit, time, "
|
||||||
|
"rows_read or bytes_read is not specified", i);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (data_idx != 0)
|
||||||
|
format.throwInvalidFormat("${data} must be the first output part", 0);
|
||||||
|
|
||||||
|
/// Parse format string for rows
|
||||||
|
row_format = ParsedTemplateFormatString(settings.template_settings.row_format, [&](const String & colName)
|
||||||
|
{
|
||||||
|
return sample.getPositionByName(colName);
|
||||||
|
});
|
||||||
|
|
||||||
|
/// Validate format string for rows
|
||||||
|
if (row_format.delimiters.size() == 1)
|
||||||
|
row_format.throwInvalidFormat("No columns specified", 0);
|
||||||
|
for (size_t i = 0; i < row_format.columnsCount(); ++i)
|
||||||
|
{
|
||||||
|
if (!row_format.format_idx_to_column_idx[i])
|
||||||
|
row_format.throwInvalidFormat("Cannot skip format field for output, it's a bug.", i);
|
||||||
|
if (row_format.formats[i] == ColumnFormat::None)
|
||||||
|
row_format.throwInvalidFormat("Serialization type for file column is not specified", i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
TemplateBlockOutputFormat::OutputPart TemplateBlockOutputFormat::stringToOutputPart(const String & part)
|
||||||
|
{
|
||||||
|
if (part == "data")
|
||||||
|
return OutputPart::Data;
|
||||||
|
else if (part == "totals")
|
||||||
|
return OutputPart::Totals;
|
||||||
|
else if (part == "min")
|
||||||
|
return OutputPart::ExtremesMin;
|
||||||
|
else if (part == "max")
|
||||||
|
return OutputPart::ExtremesMax;
|
||||||
|
else if (part == "rows")
|
||||||
|
return OutputPart::Rows;
|
||||||
|
else if (part == "rows_before_limit")
|
||||||
|
return OutputPart::RowsBeforeLimit;
|
||||||
|
else if (part == "time")
|
||||||
|
return OutputPart::TimeElapsed;
|
||||||
|
else if (part == "rows_read")
|
||||||
|
return OutputPart::RowsRead;
|
||||||
|
else if (part == "bytes_read")
|
||||||
|
return OutputPart::BytesRead;
|
||||||
|
else
|
||||||
|
throw Exception("Unknown output part " + part, ErrorCodes::SYNTAX_ERROR);
|
||||||
|
}
|
||||||
|
|
||||||
|
void TemplateBlockOutputFormat::writeRow(const Chunk & chunk, size_t row_num)
|
||||||
|
{
|
||||||
|
size_t columns = row_format.format_idx_to_column_idx.size();
|
||||||
|
for (size_t j = 0; j < columns; ++j)
|
||||||
|
{
|
||||||
|
writeString(row_format.delimiters[j], out);
|
||||||
|
|
||||||
|
size_t col_idx = *row_format.format_idx_to_column_idx[j];
|
||||||
|
serializeField(*chunk.getColumns()[col_idx], *types[col_idx], row_num, row_format.formats[j]);
|
||||||
|
}
|
||||||
|
writeString(row_format.delimiters[columns], out);
|
||||||
|
}
|
||||||
|
|
||||||
|
void TemplateBlockOutputFormat::serializeField(const IColumn & column, const IDataType & type, size_t row_num, ColumnFormat col_format)
|
||||||
|
{
|
||||||
|
switch (col_format)
|
||||||
|
{
|
||||||
|
case ColumnFormat::Escaped:
|
||||||
|
type.serializeAsTextEscaped(column, row_num, out, settings);
|
||||||
|
break;
|
||||||
|
case ColumnFormat::Quoted:
|
||||||
|
type.serializeAsTextQuoted(column, row_num, out, settings);
|
||||||
|
break;
|
||||||
|
case ColumnFormat::Csv:
|
||||||
|
type.serializeAsTextCSV(column, row_num, out, settings);
|
||||||
|
break;
|
||||||
|
case ColumnFormat::Json:
|
||||||
|
type.serializeAsTextJSON(column, row_num, out, settings);
|
||||||
|
break;
|
||||||
|
case ColumnFormat::Xml:
|
||||||
|
type.serializeAsTextXML(column, row_num, out, settings);
|
||||||
|
break;
|
||||||
|
case ColumnFormat::Raw:
|
||||||
|
type.serializeAsText(column, row_num, out, settings);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
__builtin_unreachable();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Serializes a single scalar `value` (row count, elapsed time, ...) by placing
/// it into a temporary one-row column of data type V and reusing serializeField.
template <typename U, typename V> void TemplateBlockOutputFormat::writeValue(U value, ColumnFormat col_format)
{
    auto data_type = std::make_unique<V>();
    auto column = data_type->createColumn();
    column->insert(value);
    serializeField(*column, *data_type, 0, col_format);
}
|
||||||
|
|
||||||
|
/// Appends every row of `chunk` to the output. Consecutive rows (across all
/// consumed chunks) are separated by the configured row_between_delimiter.
void TemplateBlockOutputFormat::consume(Chunk chunk)
{
    doWritePrefix();

    size_t num_rows = chunk.getNumRows();
    for (size_t row = 0; row != num_rows; ++row)
    {
        /// No separator before the very first row of the whole resultset.
        if (row_count != 0)
            writeString(settings.template_settings.row_between_delimiter, out);

        writeRow(chunk, row);
        ++row_count;
    }
}
|
||||||
|
|
||||||
|
void TemplateBlockOutputFormat::doWritePrefix()
|
||||||
|
{
|
||||||
|
if (need_write_prefix)
|
||||||
|
{
|
||||||
|
writeString(format.delimiters.front(), out);
|
||||||
|
need_write_prefix = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void TemplateBlockOutputFormat::finalize()
|
||||||
|
{
|
||||||
|
if (finalized)
|
||||||
|
return;
|
||||||
|
|
||||||
|
doWritePrefix();
|
||||||
|
|
||||||
|
size_t parts = format.format_idx_to_column_idx.size();
|
||||||
|
|
||||||
|
for (size_t i = 0; i < parts; ++i)
|
||||||
|
{
|
||||||
|
auto type = std::make_shared<DataTypeUInt64>();
|
||||||
|
ColumnWithTypeAndName col(type->createColumnConst(1, row_count), type, String("tmp"));
|
||||||
|
switch (static_cast<OutputPart>(*format.format_idx_to_column_idx[i]))
|
||||||
|
{
|
||||||
|
case OutputPart::Totals:
|
||||||
|
if (!totals)
|
||||||
|
format.throwInvalidFormat("Cannot print totals for this request", i);
|
||||||
|
writeRow(totals, 0);
|
||||||
|
break;
|
||||||
|
case OutputPart::ExtremesMin:
|
||||||
|
if (!extremes)
|
||||||
|
format.throwInvalidFormat("Cannot print extremes for this request", i);
|
||||||
|
writeRow(extremes, 0);
|
||||||
|
break;
|
||||||
|
case OutputPart::ExtremesMax:
|
||||||
|
if (!extremes)
|
||||||
|
format.throwInvalidFormat("Cannot print extremes for this request", i);
|
||||||
|
writeRow(extremes, 1);
|
||||||
|
break;
|
||||||
|
case OutputPart::Rows:
|
||||||
|
writeValue<size_t, DataTypeUInt64>(row_count, format.formats[i]);
|
||||||
|
break;
|
||||||
|
case OutputPart::RowsBeforeLimit:
|
||||||
|
if (!rows_before_limit_set)
|
||||||
|
format.throwInvalidFormat("Cannot print rows_before_limit for this request", i);
|
||||||
|
writeValue<size_t, DataTypeUInt64>(rows_before_limit, format.formats[i]);
|
||||||
|
break;
|
||||||
|
case OutputPart::TimeElapsed:
|
||||||
|
writeValue<double, DataTypeFloat64>(watch.elapsedSeconds(), format.formats[i]);
|
||||||
|
break;
|
||||||
|
case OutputPart::RowsRead:
|
||||||
|
writeValue<size_t, DataTypeUInt64>(progress.read_rows.load(), format.formats[i]);
|
||||||
|
break;
|
||||||
|
case OutputPart::BytesRead:
|
||||||
|
writeValue<size_t, DataTypeUInt64>(progress.read_bytes.load(), format.formats[i]);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
writeString(format.delimiters[i + 1], out);
|
||||||
|
}
|
||||||
|
|
||||||
|
finalized = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Registers the "Template" output format in the format factory.
void registerOutputFormatProcessorTemplate(FormatFactory & factory)
{
    auto creator = [](
        WriteBuffer & buf,
        const Block & sample,
        const Context &,
        FormatFactory::WriteCallback,
        const FormatSettings & settings)
    {
        return std::make_shared<TemplateBlockOutputFormat>(sample, buf, settings);
    };

    factory.registerOutputFormatProcessor("Template", creator);
}
|
||||||
|
}
|
68
dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h
Normal file
68
dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <Common/Stopwatch.h>
|
||||||
|
#include <Core/Block.h>
|
||||||
|
#include <Formats/FormatSettings.h>
|
||||||
|
#include <Processors/Formats/IOutputFormat.h>
|
||||||
|
#include <Formats/ParsedTemplateFormatString.h>
|
||||||
|
|
||||||
|
|
||||||
|
namespace DB
|
||||||
|
{
|
||||||
|
|
||||||
|
/** Output format "Template": the layout of the whole resultset and of each row
  * is described by user-supplied format strings with ${...} placeholders
  * (parsed into ParsedTemplateFormatString by the constructor, in the .cpp).
  */
class TemplateBlockOutputFormat : public IOutputFormat
{
    using ColumnFormat = ParsedTemplateFormatString::ColumnFormat;
public:
    TemplateBlockOutputFormat(const Block & header_, WriteBuffer & out_, const FormatSettings & settings_);

    String getName() const override { return "TemplateBlockOutputFormat"; }

    /// Writes the delimiter preceding the first placeholder; done at most once.
    void doWritePrefix() override;

    void setRowsBeforeLimit(size_t rows_before_limit_) override { rows_before_limit = rows_before_limit_; rows_before_limit_set = true; }
    void onProgress(const Progress & progress_) override { progress.incrementPiecewiseAtomically(progress_); }

protected:
    void consume(Chunk chunk) override;
    void consumeTotals(Chunk chunk) override { totals = std::move(chunk); }
    void consumeExtremes(Chunk chunk) override { extremes = std::move(chunk); }
    /// Writes the suffix (totals/extremes/counters) once all data was consumed.
    void finalize() override;

    /// Placeholders allowed in the whole-resultset format string.
    enum class OutputPart : size_t
    {
        Data,
        Totals,
        ExtremesMin,
        ExtremesMax,
        Rows,
        RowsBeforeLimit,
        TimeElapsed,
        RowsRead,
        BytesRead
    };

    /// Maps a placeholder name (e.g. "totals") to an OutputPart; throws on unknown names.
    OutputPart stringToOutputPart(const String & part);
    /// Writes one row of `chunk` with per-column delimiters.
    void writeRow(const Chunk & chunk, size_t row_num);
    /// Serializes one value using the requested escaping rule.
    void serializeField(const IColumn & column, const IDataType & type, size_t row_num, ColumnFormat format);
    /// Serializes a scalar by wrapping it in a one-row column of type V.
    template <typename U, typename V> void writeValue(U value, ColumnFormat col_format);

protected:
    const FormatSettings settings;
    /// Presumably the data types of the header columns — set in the constructor (not visible here); TODO confirm.
    DataTypes types;

    ParsedTemplateFormatString format;       /// Format string for the whole resultset.
    ParsedTemplateFormatString row_format;   /// Format string for a single row.

    size_t rows_before_limit = 0;
    bool rows_before_limit_set = false;      /// True once setRowsBeforeLimit() was called.
    Chunk totals;
    Chunk extremes;
    Progress progress;
    Stopwatch watch;

    size_t row_count = 0;                    /// Total rows written so far, across all chunks.
    bool need_write_prefix = true;           /// Cleared once doWritePrefix() emits the prefix.
};
|
||||||
|
|
||||||
|
}
|
520
dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp
Normal file
520
dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp
Normal file
@ -0,0 +1,520 @@
|
|||||||
|
#include <Processors/Formats/Impl/TemplateRowInputFormat.h>
|
||||||
|
#include <Formats/FormatFactory.h>
|
||||||
|
#include <Formats/verbosePrintString.h>
|
||||||
|
#include <IO/Operators.h>
|
||||||
|
#include <DataTypes/DataTypeNothing.h>
|
||||||
|
|
||||||
|
namespace DB
|
||||||
|
{
|
||||||
|
|
||||||
|
namespace ErrorCodes
{
    /// Error code constants are defined centrally elsewhere; only referenced here.
    extern const int ATTEMPT_TO_READ_AFTER_EOF;
    extern const int CANNOT_READ_ALL_DATA;
    extern const int CANNOT_PARSE_ESCAPE_SEQUENCE;
    extern const int CANNOT_PARSE_QUOTED_STRING;
    extern const int SYNTAX_ERROR;
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Parses and validates both format strings (whole-input and per-row) up front,
/// so that later parsing can assume they are well-formed.
///
/// NOTE(review): the base class is given a reference to the member `buf`
/// before `buf` itself is constructed — only the reference is stored at that
/// point, but confirm the base class does not read from it in its constructor.
TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_,
                                               const FormatSettings & settings_, bool ignore_spaces_)
    : RowInputFormatWithDiagnosticInfo(header_, buf, params_), buf(in_), data_types(header_.getDataTypes()),
    settings(settings_), ignore_spaces(ignore_spaces_)
{
    /// Parse format string for whole input.
    /// When no format string is configured, the whole input is just ${data}.
    static const String default_format("${data}");
    const String & format_str = settings.template_settings.format.empty() ? default_format : settings.template_settings.format;
    format = ParsedTemplateFormatString(format_str, [&](const String & partName) -> std::optional<size_t>
    {
        if (partName == "data")
            return 0;
        else if (partName.empty()) /// For skipping some values in prefix and suffix
#if !__clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif
            /// Suppress false-positive warning (bug in GCC 9: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86465)
            return {};
#if !__clang__
#pragma GCC diagnostic pop
#endif
        throw Exception("Unknown input part " + partName, ErrorCodes::SYNTAX_ERROR);
    });

    /// Validate format string for whole input:
    /// exactly one ${data} with no deserialization type, and no XML/Raw skips.
    bool has_data = false;
    for (size_t i = 0; i < format.columnsCount(); ++i)
    {
        if (format.format_idx_to_column_idx[i])
        {
            if (has_data)
                format.throwInvalidFormat("${data} can occur only once", i);
            if (format.formats[i] != ColumnFormat::None)
                format.throwInvalidFormat("${data} must have empty or None deserialization type", i);
            has_data = true;
            /// Remember where ${data} sits: the prefix ends and the suffix begins here.
            format_data_idx = i;
        }
        else
        {
            if (format.formats[i] == ColumnFormat::Xml || format.formats[i] == ColumnFormat::Raw)
                format.throwInvalidFormat("XML and Raw deserialization is not supported", i);
        }
    }

    /// Parse format string for rows: placeholder names are column names of the header.
    row_format = ParsedTemplateFormatString(settings.template_settings.row_format, [&](const String & colName) -> std::optional<size_t>
    {
        if (colName.empty())
#if !__clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif
            /// Empty name means "skip this value" (see the same GCC 9 workaround above).
            return {};
#if !__clang__
#pragma GCC diagnostic pop
#endif
        return header_.getPositionByName(colName);
    });

    /// Validate format string for rows:
    /// no XML/Raw, every referenced column has a type, and no column appears twice.
    std::vector<UInt8> column_in_format(header_.columns(), false);
    for (size_t i = 0; i < row_format.columnsCount(); ++i)
    {
        if (row_format.formats[i] == ColumnFormat::Xml || row_format.formats[i] == ColumnFormat::Raw)
            row_format.throwInvalidFormat("XML and Raw deserialization is not supported", i);

        if (row_format.format_idx_to_column_idx[i])
        {
            if (row_format.formats[i] == ColumnFormat::None)
                row_format.throwInvalidFormat("Column is not skipped, but deserialization type is None", i);

            size_t col_idx = *row_format.format_idx_to_column_idx[i];
            if (column_in_format[col_idx])
                row_format.throwInvalidFormat("Duplicate column", i);
            column_in_format[col_idx] = true;
        }
    }
}
|
||||||
|
|
||||||
|
void TemplateRowInputFormat::readPrefix()
|
||||||
|
{
|
||||||
|
size_t last_successfully_parsed_idx = 0;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
tryReadPrefixOrSuffix<void>(last_successfully_parsed_idx, format_data_idx);
|
||||||
|
}
|
||||||
|
catch (Exception & e)
|
||||||
|
{
|
||||||
|
format.throwInvalidFormat(e.message() + " While parsing prefix", last_successfully_parsed_idx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Asserts delimiters and skips fields in prefix or suffix.
/// tryReadPrefixOrSuffix<bool>(...) is used in checkForSuffix() to avoid throwing an exception after read of each row
/// (most likely false will be returned on first call of checkString(...))
///
/// ReturnType selects the mode:
///   - void: throwing mode — mismatches raise exceptions (used for the prefix);
///   - bool: probing mode — mismatches return false without throwing (used for the suffix).
/// `input_part_beg` is advanced in place past each successfully parsed placeholder,
/// so on exception the caller knows how far parsing got.
template <typename ReturnType>
ReturnType TemplateRowInputFormat::tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end)
{
    static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;

    /// Delimiter before the first placeholder of this part.
    skipSpaces();
    if constexpr (throw_exception)
        assertString(format.delimiters[input_part_beg], buf);
    else
    {
        if (likely(!checkString(format.delimiters[input_part_beg], buf)))
            return ReturnType(false);
    }

    while (input_part_beg < input_part_end)
    {
        /// Skip the placeholder's value...
        skipSpaces();
        if constexpr (throw_exception)
            skipField(format.formats[input_part_beg]);
        else
        {
            try
            {
                skipField(format.formats[input_part_beg]);
            }
            catch (const Exception & e)
            {
                if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF &&
                    e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE &&
                    e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING)
                    throw;
                /// If it's parsing error, then suffix is not found
                return ReturnType(false);
            }
        }
        ++input_part_beg;

        /// ...then the delimiter that follows it.
        skipSpaces();
        if constexpr (throw_exception)
            assertString(format.delimiters[input_part_beg], buf);
        else
        {
            if (likely(!checkString(format.delimiters[input_part_beg], buf)))
                return ReturnType(false);
        }
    }

    /// In throwing mode reaching here without an exception is the success path.
    if constexpr (!throw_exception)
        return ReturnType(true);
}
|
||||||
|
|
||||||
|
/// Reads one row into `columns`. Returns false when the input suffix is reached
/// (no more rows). Columns absent from the row format string get default values.
bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & extra)
{
    /// This function can be called again after it returned false
    if (unlikely(end_of_stream))
        return false;

    skipSpaces();

    /// The suffix must be probed before each row: it is the only end-of-data marker.
    if (unlikely(checkForSuffix()))
    {
        end_of_stream = true;
        return false;
    }

    updateDiagnosticInfo();

    /// All rows but the first are preceded by the inter-row delimiter.
    if (likely(row_num != 1))
        assertString(settings.template_settings.row_between_delimiter, buf);

    extra.read_columns.assign(columns.size(), false);

    for (size_t i = 0; i < row_format.columnsCount(); ++i)
    {
        skipSpaces();
        assertString(row_format.delimiters[i], buf);
        skipSpaces();
        if (row_format.format_idx_to_column_idx[i])
        {
            /// Placeholder is bound to a header column: parse the value into it.
            size_t col_idx = *row_format.format_idx_to_column_idx[i];
            deserializeField(*data_types[col_idx], *columns[col_idx], row_format.formats[i]);
            extra.read_columns[col_idx] = true;
        }
        else
            skipField(row_format.formats[i]);

    }

    /// Delimiter after the last placeholder of the row.
    skipSpaces();
    assertString(row_format.delimiters.back(), buf);

    /// Columns the row format never mentions are filled with defaults.
    for (size_t i = 0; i < columns.size(); ++i)
        if (!extra.read_columns[i])
            data_types[i]->insertDefaultInto(*columns[i]);

    return true;
}
|
||||||
|
|
||||||
|
void TemplateRowInputFormat::deserializeField(const IDataType & type, IColumn & column, ColumnFormat col_format)
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
switch (col_format)
|
||||||
|
{
|
||||||
|
case ColumnFormat::Escaped:
|
||||||
|
type.deserializeAsTextEscaped(column, buf, settings);
|
||||||
|
break;
|
||||||
|
case ColumnFormat::Quoted:
|
||||||
|
type.deserializeAsTextQuoted(column, buf, settings);
|
||||||
|
break;
|
||||||
|
case ColumnFormat::Csv:
|
||||||
|
type.deserializeAsTextCSV(column, buf, settings);
|
||||||
|
break;
|
||||||
|
case ColumnFormat::Json:
|
||||||
|
type.deserializeAsTextJSON(column, buf, settings);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
__builtin_unreachable();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (Exception & e)
|
||||||
|
{
|
||||||
|
if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF)
|
||||||
|
throwUnexpectedEof();
|
||||||
|
throw;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Reads and discards one value with the given escaping rule (used for
/// placeholders that are not bound to any column).
void TemplateRowInputFormat::skipField(TemplateRowInputFormat::ColumnFormat col_format)
{
    String tmp;
    /// Name shown by skipJSONField in error messages for skipped values.
    constexpr const char * field_name = "<SKIPPED COLUMN>";
    constexpr size_t field_name_len = 16;   /// strlen("<SKIPPED COLUMN>")
    try
    {
        switch (col_format)
        {
            case ColumnFormat::None:
                /// Empty field, just skip spaces
                break;
            case ColumnFormat::Escaped:
                readEscapedString(tmp, buf);
                break;
            case ColumnFormat::Quoted:
                readQuotedString(tmp, buf);
                break;
            case ColumnFormat::Csv:
                readCSVString(tmp, buf, settings.csv);
                break;
            case ColumnFormat::Json:
                skipJSONField(buf, StringRef(field_name, field_name_len));
                break;
            default:
                /// Xml/Raw are rejected by constructor validation.
                __builtin_unreachable();
        }
    }
    catch (Exception & e)
    {
        /// Report a truncated input with a clearer message.
        if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF)
            throwUnexpectedEof();
        throw;
    }
}
|
||||||
|
|
||||||
|
/// Returns true if all rows have been read i.e. there are only suffix and spaces (if ignore_spaces == true) before EOF.
/// Otherwise returns false
bool TemplateRowInputFormat::checkForSuffix()
{
    /// Checkpoint lets us rewind the buffer if this turns out not to be the suffix.
    PeekableReadBufferCheckpoint checkpoint{buf};
    bool suffix_found = false;
    /// The suffix is everything after the ${data} placeholder.
    size_t last_successfully_parsed_idx = format_data_idx + 1;
    try
    {
        suffix_found = tryReadPrefixOrSuffix<bool>(last_successfully_parsed_idx, format.columnsCount());
    }
    catch (const Exception & e)
    {
        /// Parse-class errors just mean "not the suffix"; anything else is fatal.
        if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF &&
            e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE &&
            e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING)
            throw;
    }

    if (unlikely(suffix_found))
    {
        /// The suffix only counts if nothing but (optional) whitespace follows it.
        skipSpaces();
        if (buf.eof())
            return true;
    }

    /// Not the suffix — restore the read position so the next row can be parsed.
    buf.rollbackToCheckpoint();
    return false;
}
|
||||||
|
|
||||||
|
/// Re-parses the failed region and writes a human-readable explanation to `out`:
/// first why the input did not match the suffix, then (if possible) a
/// field-by-field re-parse of the next row. Returns false as soon as further
/// diagnostics would be meaningless.
bool TemplateRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out)
{
    out << "Suffix does not match: ";
    size_t last_successfully_parsed_idx = format_data_idx + 1;
    /// Remember the row start to detect buffer invalidation by the suffix probe.
    const ReadBuffer::Position row_begin_pos = buf.position();
    bool caught = false;
    try
    {
        PeekableReadBufferCheckpoint checkpoint{buf, true};
        tryReadPrefixOrSuffix<void>(last_successfully_parsed_idx, format.columnsCount());
    }
    catch (Exception & e)
    {
        out << e.message() << " Near column " << last_successfully_parsed_idx;
        caught = true;
    }
    if (!caught)
    {
        /// The suffix itself parsed, so the problem is trailing data after it.
        out << " There is some data after suffix (EOF expected, got ";
        verbosePrintString(buf.position(), std::min(buf.buffer().end(), buf.position() + 16), out);
        out << "). ";
    }
    out << " Format string (from format_schema): \n" << format.dump() << "\n";

    if (row_begin_pos != buf.position())
    {
        /// Pointers to buffer memory were invalidated during checking for suffix
        out << "\nCannot print more diagnostic info.";
        return false;
    }

    out << "\nUsing format string (from format_schema_rows): " << row_format.dump() << "\n";
    out << "\nTrying to parse next row, because suffix does not match:\n";
    try
    {
        if (likely(row_num != 1))
            assertString(settings.template_settings.row_between_delimiter, buf);
    }
    catch (const DB::Exception &)
    {
        writeErrorStringForWrongDelimiter(out, "delimiter between rows", settings.template_settings.row_between_delimiter);

        return false;
    }
    for (size_t i = 0; i < row_format.columnsCount(); ++i)
    {
        skipSpaces();
        try
        {
            assertString(row_format.delimiters[i], buf);
        }
        catch (const DB::Exception &)
        {
            writeErrorStringForWrongDelimiter(out, "delimiter before field " + std::to_string(i), row_format.delimiters[i]);
            return false;
        }

        skipSpaces();
        if (row_format.format_idx_to_column_idx[i])
        {
            /// Bound placeholder: re-parse into the real column with diagnostics.
            auto & header = getPort().getHeader();
            size_t col_idx = *row_format.format_idx_to_column_idx[i];
            if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx],
                                                        *columns[col_idx], out, i))
            {
                out << "Maybe it's not possible to deserialize field " + std::to_string(i) +
                       " as " + ParsedTemplateFormatString::formatToString(row_format.formats[i]);
                return false;
            }
        }
        else
        {
            /// Unbound placeholder: parse into a throwaway Nothing column.
            static const String skipped_column_str = "<SKIPPED COLUMN>";
            static const DataTypePtr skipped_column_type = std::make_shared<DataTypeNothing>();
            static const MutableColumnPtr skipped_column = skipped_column_type->createColumn();
            if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, i))
                return false;
        }
    }

    skipSpaces();
    try
    {
        assertString(row_format.delimiters.back(), buf);
    }
    catch (const DB::Exception &)
    {
        writeErrorStringForWrongDelimiter(out, "delimiter after last field", row_format.delimiters.back());
        return false;
    }

    return true;
}
|
||||||
|
|
||||||
|
/// Emits a diagnostic line of the form
/// "ERROR: There is no <description>: expected <delim>, got <actual input>".
void TemplateRowInputFormat::writeErrorStringForWrongDelimiter(WriteBuffer & out, const String & description, const String & delim)
{
    out << "ERROR: There is no " << description << ": expected ";

    const char * delim_begin = delim.data();
    verbosePrintString(delim_begin, delim_begin + delim.size(), out);

    out << ", got ";
    if (buf.eof())
    {
        out << "<End of stream>";
    }
    else
    {
        /// Show a bit more than the expected delimiter's length, bounded by the buffer end.
        auto show_end = std::min(buf.position() + delim.size() + 10, buf.buffer().end());
        verbosePrintString(buf.position(), show_end, out);
    }
    out << '\n';
}
|
||||||
|
|
||||||
|
/// Diagnostic hook: parses (or skips) one field while recording the buffer
/// positions before and after, so the caller can show exactly what was consumed.
/// NOTE(review): "Filed" is a typo, but the name overrides a base-class virtual
/// (declared in RowInputFormatWithDiagnosticInfo) and must match it.
void TemplateRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
                                                 ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos)
{
    prev_pos = buf.position();
    if (row_format.format_idx_to_column_idx[file_column])
        deserializeField(*type, column, row_format.formats[file_column]);
    else
        skipField(row_format.formats[file_column]);
    curr_pos = buf.position();
}
|
||||||
|
|
||||||
|
/// Diagnostic hook: this format never classifies trailing bytes as garbage.
bool TemplateRowInputFormat::isGarbageAfterField(size_t, ReadBuffer::Position)
{
    /// Garbage will be considered as wrong delimiter
    return false;
}
|
||||||
|
|
||||||
|
bool TemplateRowInputFormat::allowSyncAfterError() const
|
||||||
|
{
|
||||||
|
return !row_format.delimiters.back().empty() || !settings.template_settings.row_between_delimiter.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// After a parse error, advances the buffer to what looks like the beginning of
/// the next row (or to EOF / the resultset suffix), so parsing can resume.
void TemplateRowInputFormat::syncAfterError()
{
    bool at_beginning_of_row_or_eof = false;
    while (!at_beginning_of_row_or_eof)
    {
        /// Find the end-of-row delimiter first.
        skipToNextDelimiterOrEof(row_format.delimiters.back());
        if (buf.eof())
        {
            end_of_stream = true;
            return;
        }
        buf.ignore(row_format.delimiters.back().size());

        /// The row may have been the last one — probe for the suffix.
        skipSpaces();
        if (checkForSuffix())
            return;

        bool last_delimiter_in_row_found = !row_format.delimiters.back().empty();

        if (last_delimiter_in_row_found && checkString(settings.template_settings.row_between_delimiter, buf))
            at_beginning_of_row_or_eof = true;
        else
            skipToNextDelimiterOrEof(settings.template_settings.row_between_delimiter);

        if (buf.eof())
            at_beginning_of_row_or_eof = end_of_stream = true;
    }
    /// It can happen that buf.position() is not at the beginning of row
    /// if some delimiters is similar to row_format.delimiters.back() and row_between_delimiter.
    /// It will cause another parsing error.
}
|
||||||
|
|
||||||
|
/// Searches for delimiter in input stream and sets buffer position to the beginning of delimiter (if found) or EOF (if not)
void TemplateRowInputFormat::skipToNextDelimiterOrEof(const String & delimiter)
{
    if (delimiter.empty())
        return;

    while (!buf.eof())
    {
        /// Fast scan for the delimiter's first byte within the current buffer.
        void * pos = memchr(buf.position(), delimiter[0], buf.available());
        if (!pos)
        {
            /// Not in this buffer — consume it all; eof() will refill.
            buf.position() += buf.available();
            continue;
        }

        buf.position() = static_cast<ReadBuffer::Position>(pos);

        /// Check the full delimiter; the checkpoint lets us rewind if it is a
        /// false match (checkString may read past the current buffer).
        PeekableReadBufferCheckpoint checkpoint{buf};
        if (checkString(delimiter, buf))
            return;

        buf.rollbackToCheckpoint();
        ++buf.position();
    }
}
|
||||||
|
|
||||||
|
void TemplateRowInputFormat::throwUnexpectedEof()
|
||||||
|
{
|
||||||
|
throw Exception("Unexpected EOF while parsing row " + std::to_string(row_num) + ". "
|
||||||
|
"Maybe last row has wrong format or input doesn't contain specified suffix before EOF.",
|
||||||
|
ErrorCodes::CANNOT_READ_ALL_DATA);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Registers both input-format variants in the factory:
/// "Template" (exact whitespace) and "TemplateIgnoreSpaces".
void registerInputFormatProcessorTemplate(FormatFactory & factory)
{
    for (bool ignore_spaces : {false, true})
    {
        factory.registerInputFormatProcessor(ignore_spaces ? "TemplateIgnoreSpaces" : "Template", [ignore_spaces](
            ReadBuffer & buf,
            const Block & sample,
            const Context &,
            IRowInputFormat::Params params,
            const FormatSettings & settings)
        {
            return std::make_shared<TemplateRowInputFormat>(sample, buf, params, settings, ignore_spaces);
        });
    }
}
|
||||||
|
|
||||||
|
}
|
61
dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h
Normal file
61
dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <Core/Block.h>
|
||||||
|
#include <Processors/Formats/RowInputFormatWithDiagnosticInfo.h>
|
||||||
|
#include <Formats/FormatSettings.h>
|
||||||
|
#include <Formats/ParsedTemplateFormatString.h>
|
||||||
|
#include <IO/ReadHelpers.h>
|
||||||
|
#include <IO/PeekableReadBuffer.h>
|
||||||
|
|
||||||
|
|
||||||
|
namespace DB
|
||||||
|
{
|
||||||
|
|
||||||
|
class TemplateRowInputFormat : public RowInputFormatWithDiagnosticInfo
|
||||||
|
{
|
||||||
|
using ColumnFormat = ParsedTemplateFormatString::ColumnFormat;
|
||||||
|
public:
|
||||||
|
TemplateRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_,
|
||||||
|
const FormatSettings & settings_, bool ignore_spaces_);
|
||||||
|
|
||||||
|
String getName() const override { return "TemplateRowInputFormat"; }
|
||||||
|
|
||||||
|
bool readRow(MutableColumns & columns, RowReadExtension & extra) override;
|
||||||
|
|
||||||
|
void readPrefix() override;
|
||||||
|
|
||||||
|
bool allowSyncAfterError() const override;
|
||||||
|
void syncAfterError() override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
void deserializeField(const IDataType & type, IColumn & column, ColumnFormat col_format);
|
||||||
|
void skipField(ColumnFormat col_format);
|
||||||
|
inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(buf); }
|
||||||
|
|
||||||
|
template <typename ReturnType = void>
|
||||||
|
ReturnType tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end);
|
||||||
|
bool checkForSuffix();
|
||||||
|
[[noreturn]] void throwUnexpectedEof();
|
||||||
|
|
||||||
|
bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override;
|
||||||
|
void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column, ReadBuffer::Position & prev_pos,
|
||||||
|
ReadBuffer::Position & curr_pos) override;
|
||||||
|
bool isGarbageAfterField(size_t after_col_idx, ReadBuffer::Position pos) override;
|
||||||
|
void writeErrorStringForWrongDelimiter(WriteBuffer & out, const String & description, const String & delim);
|
||||||
|
|
||||||
|
void skipToNextDelimiterOrEof(const String & delimiter);
|
||||||
|
|
||||||
|
private:
|
||||||
|
PeekableReadBuffer buf;
|
||||||
|
DataTypes data_types;
|
||||||
|
|
||||||
|
FormatSettings settings;
|
||||||
|
ParsedTemplateFormatString format;
|
||||||
|
ParsedTemplateFormatString row_format;
|
||||||
|
const bool ignore_spaces;
|
||||||
|
|
||||||
|
size_t format_data_idx;
|
||||||
|
bool end_of_stream = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
167
dbms/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp
Normal file
167
dbms/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp
Normal file
@ -0,0 +1,167 @@
|
|||||||
|
#include <Processors/Formats/RowInputFormatWithDiagnosticInfo.h>
|
||||||
|
#include <Formats/verbosePrintString.h>
|
||||||
|
#include <IO/Operators.h>
|
||||||
|
#include <IO/WriteBufferFromString.h>
|
||||||
|
|
||||||
|
|
||||||
|
namespace DB
|
||||||
|
{
|
||||||
|
|
||||||
|
namespace ErrorCodes
{
    /// Defined centrally elsewhere; referenced by the diagnostic-info helpers.
    extern const int LOGICAL_ERROR;
}
|
||||||
|
|
||||||
|
/// Simply forwards to IRowInputFormat; diagnostic counters keep their
/// in-class default values until updateDiagnosticInfo() is first called.
DB::RowInputFormatWithDiagnosticInfo::RowInputFormatWithDiagnosticInfo(const Block & header_, ReadBuffer & in_, const Params & params_)
    : IRowInputFormat(header_, in_, params_)
{
}
|
||||||
|
|
||||||
|
/// Called once per row before parsing it: records where the current and the
/// previous row start, so getDiagnosticInfo() can later rewind and re-parse
/// them to produce an error report.
void DB::RowInputFormatWithDiagnosticInfo::updateDiagnosticInfo()
{
    ++row_num;

    /// Total bytes consumed before the start of the buffer, remembered for the
    /// previous and for the current row (used to detect buffer turnover).
    bytes_read_at_start_of_buffer_on_prev_row = bytes_read_at_start_of_buffer_on_current_row;
    bytes_read_at_start_of_buffer_on_current_row = in.count() - in.offset();

    /// Offsets of the row starts within the buffer.
    offset_of_prev_row = offset_of_current_row;
    offset_of_current_row = in.offset();
}
|
||||||
|
|
||||||
|
String DB::RowInputFormatWithDiagnosticInfo::getDiagnosticInfo()
|
||||||
|
{
|
||||||
|
if (in.eof()) /// Buffer has gone, cannot extract information about what has been parsed.
|
||||||
|
return {};
|
||||||
|
|
||||||
|
WriteBufferFromOwnString out;
|
||||||
|
|
||||||
|
auto & header = getPort().getHeader();
|
||||||
|
MutableColumns columns = header.cloneEmptyColumns();
|
||||||
|
|
||||||
|
/// It is possible to display detailed diagnostics only if the last and next to last rows are still in the read buffer.
|
||||||
|
size_t bytes_read_at_start_of_buffer = in.count() - in.offset();
|
||||||
|
if (bytes_read_at_start_of_buffer != bytes_read_at_start_of_buffer_on_prev_row)
|
||||||
|
{
|
||||||
|
out << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n";
|
||||||
|
return out.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
max_length_of_column_name = 0;
|
||||||
|
for (size_t i = 0; i < header.columns(); ++i)
|
||||||
|
if (header.safeGetByPosition(i).name.size() > max_length_of_column_name)
|
||||||
|
max_length_of_column_name = header.safeGetByPosition(i).name.size();
|
||||||
|
|
||||||
|
max_length_of_data_type_name = 0;
|
||||||
|
for (size_t i = 0; i < header.columns(); ++i)
|
||||||
|
if (header.safeGetByPosition(i).type->getName().size() > max_length_of_data_type_name)
|
||||||
|
max_length_of_data_type_name = header.safeGetByPosition(i).type->getName().size();
|
||||||
|
|
||||||
|
/// Roll back the cursor to the beginning of the previous or current row and parse all over again. But now we derive detailed information.
|
||||||
|
|
||||||
|
if (offset_of_prev_row <= in.buffer().size())
|
||||||
|
{
|
||||||
|
in.position() = in.buffer().begin() + offset_of_prev_row;
|
||||||
|
|
||||||
|
out << "\nRow " << (row_num - 1) << ":\n";
|
||||||
|
if (!parseRowAndPrintDiagnosticInfo(columns, out))
|
||||||
|
return out.str();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (in.buffer().size() < offset_of_current_row)
|
||||||
|
{
|
||||||
|
out << "Could not print diagnostic info because parsing of data hasn't started.\n";
|
||||||
|
return out.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
in.position() = in.buffer().begin() + offset_of_current_row;
|
||||||
|
}
|
||||||
|
|
||||||
|
out << "\nRow " << row_num << ":\n";
|
||||||
|
parseRowAndPrintDiagnosticInfo(columns, out);
|
||||||
|
out << "\n";
|
||||||
|
|
||||||
|
return out.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
bool RowInputFormatWithDiagnosticInfo::deserializeFieldAndPrintDiagnosticInfo(const String & col_name,
|
||||||
|
const DataTypePtr & type,
|
||||||
|
IColumn & column,
|
||||||
|
WriteBuffer & out,
|
||||||
|
size_t file_column)
|
||||||
|
{
|
||||||
|
out << "Column " << file_column << ", " << std::string((file_column < 10 ? 2 : file_column < 100 ? 1 : 0), ' ')
|
||||||
|
<< "name: " << alignedName(col_name, max_length_of_column_name)
|
||||||
|
<< "type: " << alignedName(type->getName(), max_length_of_data_type_name);
|
||||||
|
|
||||||
|
auto prev_position = in.position();
|
||||||
|
auto curr_position = in.position();
|
||||||
|
std::exception_ptr exception;
|
||||||
|
|
||||||
|
try
|
||||||
|
{
|
||||||
|
tryDeserializeFiled(type, column, file_column, prev_position, curr_position);
|
||||||
|
}
|
||||||
|
catch (...)
|
||||||
|
{
|
||||||
|
exception = std::current_exception();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (curr_position < prev_position)
|
||||||
|
throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);
|
||||||
|
|
||||||
|
if (isNativeNumber(type) || isDateOrDateTime(type))
|
||||||
|
{
|
||||||
|
/// An empty string instead of a value.
|
||||||
|
if (curr_position == prev_position)
|
||||||
|
{
|
||||||
|
out << "ERROR: text ";
|
||||||
|
verbosePrintString(prev_position, std::min(prev_position + 10, in.buffer().end()), out);
|
||||||
|
out << " is not like " << type->getName() << "\n";
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
out << "parsed text: ";
|
||||||
|
verbosePrintString(prev_position, curr_position, out);
|
||||||
|
|
||||||
|
if (exception)
|
||||||
|
{
|
||||||
|
if (type->getName() == "DateTime")
|
||||||
|
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
|
||||||
|
else if (type->getName() == "Date")
|
||||||
|
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
|
||||||
|
else
|
||||||
|
out << "ERROR\n";
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
out << "\n";
|
||||||
|
|
||||||
|
if (type->haveMaximumSizeOfValue())
|
||||||
|
{
|
||||||
|
if (isGarbageAfterField(file_column, curr_position))
|
||||||
|
{
|
||||||
|
out << "ERROR: garbage after " << type->getName() << ": ";
|
||||||
|
verbosePrintString(curr_position, std::min(curr_position + 10, in.buffer().end()), out);
|
||||||
|
out << "\n";
|
||||||
|
|
||||||
|
if (type->getName() == "DateTime")
|
||||||
|
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
|
||||||
|
else if (type->getName() == "Date")
|
||||||
|
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
String RowInputFormatWithDiagnosticInfo::alignedName(const String & name, size_t max_length) const
|
||||||
|
{
|
||||||
|
size_t spaces_count = max_length >= name.size() ? max_length - name.size() : 0;
|
||||||
|
return name + ", " + std::string(spaces_count, ' ');
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,46 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <Core/Block.h>
|
||||||
|
#include <Processors/Formats/IRowInputFormat.h>
|
||||||
|
#include <IO/ReadBuffer.h>
|
||||||
|
#include <limits>
|
||||||
|
|
||||||
|
|
||||||
|
namespace DB
|
||||||
|
{
|
||||||
|
|
||||||
|
class RowInputFormatWithDiagnosticInfo : public IRowInputFormat
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
RowInputFormatWithDiagnosticInfo(const Block & header_, ReadBuffer & in_, const Params & params_);
|
||||||
|
|
||||||
|
String getDiagnosticInfo() override;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
void updateDiagnosticInfo();
|
||||||
|
bool deserializeFieldAndPrintDiagnosticInfo(const String & col_name, const DataTypePtr & type, IColumn & column,
|
||||||
|
WriteBuffer & out, size_t file_column);
|
||||||
|
String alignedName(const String & name, size_t max_length) const;
|
||||||
|
|
||||||
|
virtual bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) = 0;
|
||||||
|
virtual void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
|
||||||
|
ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) = 0;
|
||||||
|
virtual bool isGarbageAfterField(size_t after_input_pos_idx, ReadBuffer::Position pos) = 0;
|
||||||
|
|
||||||
|
/// For convenient diagnostics in case of an error.
|
||||||
|
size_t row_num = 0;
|
||||||
|
|
||||||
|
private:
|
||||||
|
/// How many bytes were read, not counting those still in the buffer.
|
||||||
|
size_t bytes_read_at_start_of_buffer_on_current_row = 0;
|
||||||
|
size_t bytes_read_at_start_of_buffer_on_prev_row = 0;
|
||||||
|
|
||||||
|
size_t offset_of_current_row = std::numeric_limits<size_t>::max();
|
||||||
|
size_t offset_of_prev_row = std::numeric_limits<size_t>::max();
|
||||||
|
|
||||||
|
/// For alignment of diagnostic info.
|
||||||
|
size_t max_length_of_column_name = 0;
|
||||||
|
size_t max_length_of_data_type_name = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
@ -21,21 +21,13 @@ MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream(
|
|||||||
header(header_), sync(sync_), skip_offsets(skip_offsets_),
|
header(header_), sync(sync_), skip_offsets(skip_offsets_),
|
||||||
already_written_offset_columns(already_written_offset_columns_)
|
already_written_offset_columns(already_written_offset_columns_)
|
||||||
{
|
{
|
||||||
}
|
|
||||||
|
|
||||||
void MergedColumnOnlyOutputStream::write(const Block & block)
|
|
||||||
{
|
|
||||||
if (!initialized)
|
|
||||||
{
|
|
||||||
column_streams.clear();
|
|
||||||
serialization_states.clear();
|
|
||||||
serialization_states.reserve(header.columns());
|
serialization_states.reserve(header.columns());
|
||||||
WrittenOffsetColumns tmp_offset_columns;
|
WrittenOffsetColumns tmp_offset_columns;
|
||||||
IDataType::SerializeBinaryBulkSettings settings;
|
IDataType::SerializeBinaryBulkSettings settings;
|
||||||
|
|
||||||
for (const auto & column_name : header.getNames())
|
for (const auto & column_name : header.getNames())
|
||||||
{
|
{
|
||||||
const auto & col = block.getByName(column_name);
|
const auto & col = header.getByName(column_name);
|
||||||
|
|
||||||
const auto columns = storage.getColumns();
|
const auto columns = storage.getColumns();
|
||||||
addStreams(part_path, col.name, *col.type, columns.getCodecOrDefault(col.name, codec), 0, skip_offsets);
|
addStreams(part_path, col.name, *col.type, columns.getCodecOrDefault(col.name, codec), 0, skip_offsets);
|
||||||
@ -45,10 +37,10 @@ void MergedColumnOnlyOutputStream::write(const Block & block)
|
|||||||
}
|
}
|
||||||
|
|
||||||
initSkipIndices();
|
initSkipIndices();
|
||||||
|
}
|
||||||
|
|
||||||
initialized = true;
|
void MergedColumnOnlyOutputStream::write(const Block & block)
|
||||||
}
|
{
|
||||||
|
|
||||||
std::set<String> skip_indexes_column_names_set;
|
std::set<String> skip_indexes_column_names_set;
|
||||||
for (const auto & index : skip_indices)
|
for (const auto & index : skip_indices)
|
||||||
std::copy(index->columns.cbegin(), index->columns.cend(),
|
std::copy(index->columns.cbegin(), index->columns.cend(),
|
||||||
@ -68,7 +60,6 @@ void MergedColumnOnlyOutputStream::write(const Block & block)
|
|||||||
if (!rows)
|
if (!rows)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
|
||||||
size_t new_index_offset = 0;
|
size_t new_index_offset = 0;
|
||||||
size_t new_current_mark = 0;
|
size_t new_current_mark = 0;
|
||||||
WrittenOffsetColumns offset_columns = already_written_offset_columns;
|
WrittenOffsetColumns offset_columns = already_written_offset_columns;
|
||||||
@ -106,7 +97,8 @@ MergeTreeData::DataPart::Checksums MergedColumnOnlyOutputStream::writeSuffixAndG
|
|||||||
serialize_settings.getter = createStreamGetter(column.name, already_written_offset_columns, skip_offsets);
|
serialize_settings.getter = createStreamGetter(column.name, already_written_offset_columns, skip_offsets);
|
||||||
column.type->serializeBinaryBulkStateSuffix(serialize_settings, serialization_states[i]);
|
column.type->serializeBinaryBulkStateSuffix(serialize_settings, serialization_states[i]);
|
||||||
|
|
||||||
if (with_final_mark)
|
/// We wrote at least one row
|
||||||
|
if (with_final_mark && (index_offset != 0 || current_mark != 0))
|
||||||
writeFinalMark(column.name, column.type, offset_columns, skip_offsets, serialize_settings.path);
|
writeFinalMark(column.name, column.type, offset_columns, skip_offsets, serialize_settings.path);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -125,7 +117,6 @@ MergeTreeData::DataPart::Checksums MergedColumnOnlyOutputStream::writeSuffixAndG
|
|||||||
|
|
||||||
column_streams.clear();
|
column_streams.clear();
|
||||||
serialization_states.clear();
|
serialization_states.clear();
|
||||||
initialized = false;
|
|
||||||
|
|
||||||
return checksums;
|
return checksums;
|
||||||
}
|
}
|
||||||
|
@ -28,7 +28,6 @@ public:
|
|||||||
private:
|
private:
|
||||||
Block header;
|
Block header;
|
||||||
|
|
||||||
bool initialized = false;
|
|
||||||
bool sync;
|
bool sync;
|
||||||
bool skip_offsets;
|
bool skip_offsets;
|
||||||
|
|
||||||
|
@ -34,7 +34,8 @@ set the following environment variables:
|
|||||||
|
|
||||||
### Running with runner script
|
### Running with runner script
|
||||||
|
|
||||||
The only requirement is fresh docker configured docker.
|
The only requirement is fresh configured docker and
|
||||||
|
docker pull yandex/clickhouse-integration-tests-runner
|
||||||
|
|
||||||
Notes:
|
Notes:
|
||||||
* If you want to run integration tests without `sudo` you have to add your user to docker group `sudo usermod -aG docker $USER`. [More information](https://docs.docker.com/install/linux/linux-postinstall/) about docker configuration.
|
* If you want to run integration tests without `sudo` you have to add your user to docker group `sudo usermod -aG docker $USER`. [More information](https://docs.docker.com/install/linux/linux-postinstall/) about docker configuration.
|
||||||
|
@ -723,7 +723,8 @@ class ClickHouseInstance:
|
|||||||
os.mkdir(config_d_dir)
|
os.mkdir(config_d_dir)
|
||||||
os.mkdir(users_d_dir)
|
os.mkdir(users_d_dir)
|
||||||
|
|
||||||
shutil.copy(p.join(HELPERS_DIR, 'common_instance_config.xml'), config_d_dir)
|
# The file is named with 0_ prefix to be processed before other configuration overloads.
|
||||||
|
shutil.copy(p.join(HELPERS_DIR, '0_common_instance_config.xml'), config_d_dir)
|
||||||
|
|
||||||
# Generate and write macros file
|
# Generate and write macros file
|
||||||
macros = self.macros.copy()
|
macros = self.macros.copy()
|
||||||
|
@ -1,4 +0,0 @@
|
|||||||
*
|
|
||||||
!.gitignore
|
|
||||||
!source.tsv
|
|
||||||
!dictionary_preset*
|
|
@ -1,411 +0,0 @@
|
|||||||
import pytest
|
|
||||||
import os
|
|
||||||
import time
|
|
||||||
|
|
||||||
from helpers.cluster import ClickHouseCluster
|
|
||||||
from helpers.test_tools import TSV, assert_eq_with_retry
|
|
||||||
from generate_dictionaries import generate_structure, generate_dictionaries, DictionaryTestTable
|
|
||||||
|
|
||||||
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
|
|
||||||
|
|
||||||
cluster = None
|
|
||||||
instance = None
|
|
||||||
test_table = None
|
|
||||||
|
|
||||||
|
|
||||||
def get_status(dictionary_name):
|
|
||||||
return instance.query("SELECT status FROM system.dictionaries WHERE name='" + dictionary_name + "'").rstrip("\n")
|
|
||||||
|
|
||||||
|
|
||||||
def get_last_exception(dictionary_name):
|
|
||||||
return instance.query("SELECT last_exception FROM system.dictionaries WHERE name='" + dictionary_name + "'").rstrip("\n").replace("\\'", "'")
|
|
||||||
|
|
||||||
|
|
||||||
def get_loading_start_time(dictionary_name):
|
|
||||||
s = instance.query("SELECT loading_start_time FROM system.dictionaries WHERE name='" + dictionary_name + "'").rstrip("\n")
|
|
||||||
if s == "0000-00-00 00:00:00":
|
|
||||||
return None
|
|
||||||
return time.strptime(s, "%Y-%m-%d %H:%M:%S")
|
|
||||||
|
|
||||||
|
|
||||||
def get_loading_duration(dictionary_name):
|
|
||||||
return float(instance.query("SELECT loading_duration FROM system.dictionaries WHERE name='" + dictionary_name + "'"))
|
|
||||||
|
|
||||||
|
|
||||||
def replace_in_file_in_container(file_name, what, replace_with):
|
|
||||||
instance.exec_in_container('sed -i "s/' + what + '/' + replace_with + '/g" ' + file_name)
|
|
||||||
|
|
||||||
|
|
||||||
def setup_module(module):
|
|
||||||
global cluster
|
|
||||||
global instance
|
|
||||||
global test_table
|
|
||||||
|
|
||||||
structure = generate_structure()
|
|
||||||
dictionary_files = generate_dictionaries(os.path.join(SCRIPT_DIR, 'configs/dictionaries'), structure)
|
|
||||||
|
|
||||||
cluster = ClickHouseCluster(__file__, base_configs_dir=os.path.join(SCRIPT_DIR, 'configs'))
|
|
||||||
instance = cluster.add_instance('instance', main_configs=dictionary_files)
|
|
||||||
test_table = DictionaryTestTable(os.path.join(SCRIPT_DIR, 'configs/dictionaries/source.tsv'))
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
|
||||||
def started_cluster():
|
|
||||||
try:
|
|
||||||
cluster.start()
|
|
||||||
instance.query("CREATE DATABASE IF NOT EXISTS dict ENGINE=Dictionary")
|
|
||||||
test_table.create_clickhouse_source(instance)
|
|
||||||
for line in TSV(instance.query('select name from system.dictionaries')).lines:
|
|
||||||
print line,
|
|
||||||
|
|
||||||
# Create table `test.small_dict_source`
|
|
||||||
instance.query('''
|
|
||||||
drop table if exists test.small_dict_source;
|
|
||||||
create table test.small_dict_source (id UInt64, a String, b Int32, c Float64) engine=Log;
|
|
||||||
insert into test.small_dict_source values (0, 'water', 10, 1), (1, 'air', 40, 0.01), (2, 'earth', 100, 1.7);
|
|
||||||
''')
|
|
||||||
|
|
||||||
yield cluster
|
|
||||||
|
|
||||||
finally:
|
|
||||||
cluster.shutdown()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(params=[
|
|
||||||
# name, keys, use_parent
|
|
||||||
('clickhouse_hashed', ('id',), True),
|
|
||||||
('clickhouse_flat', ('id',), True),
|
|
||||||
('clickhouse_complex_integers_key_hashed', ('key0', 'key1'), False),
|
|
||||||
('clickhouse_complex_mixed_key_hashed', ('key0_str', 'key1'), False),
|
|
||||||
('clickhouse_range_hashed', ('id', 'StartDate', 'EndDate'), False),
|
|
||||||
],
|
|
||||||
ids=['clickhouse_hashed', 'clickhouse_flat',
|
|
||||||
'clickhouse_complex_integers_key_hashed',
|
|
||||||
'clickhouse_complex_mixed_key_hashed',
|
|
||||||
'clickhouse_range_hashed']
|
|
||||||
)
|
|
||||||
def dictionary_structure(started_cluster, request):
|
|
||||||
return request.param
|
|
||||||
|
|
||||||
|
|
||||||
def test_select_all(dictionary_structure):
|
|
||||||
name, keys, use_parent = dictionary_structure
|
|
||||||
query = instance.query
|
|
||||||
|
|
||||||
structure = test_table.get_structure_for_keys(keys, use_parent)
|
|
||||||
query('''
|
|
||||||
DROP TABLE IF EXISTS test.{0}
|
|
||||||
'''.format(name))
|
|
||||||
|
|
||||||
create_query = "CREATE TABLE test.{0} ({1}) engine = Dictionary({0})".format(name, structure)
|
|
||||||
TSV(query(create_query))
|
|
||||||
|
|
||||||
result = TSV(query('select * from test.{0}'.format(name)))
|
|
||||||
|
|
||||||
diff = test_table.compare_by_keys(keys, result.lines, use_parent, add_not_found_rows=True)
|
|
||||||
print test_table.process_diff(diff)
|
|
||||||
assert not diff
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(params=[
|
|
||||||
# name, keys, use_parent
|
|
||||||
('clickhouse_cache', ('id',), True),
|
|
||||||
('clickhouse_complex_integers_key_cache', ('key0', 'key1'), False),
|
|
||||||
('clickhouse_complex_mixed_key_cache', ('key0_str', 'key1'), False)
|
|
||||||
],
|
|
||||||
ids=['clickhouse_cache', 'clickhouse_complex_integers_key_cache', 'clickhouse_complex_mixed_key_cache']
|
|
||||||
)
|
|
||||||
def cached_dictionary_structure(started_cluster, request):
|
|
||||||
return request.param
|
|
||||||
|
|
||||||
|
|
||||||
def test_select_all_from_cached(cached_dictionary_structure):
|
|
||||||
name, keys, use_parent = cached_dictionary_structure
|
|
||||||
query = instance.query
|
|
||||||
|
|
||||||
structure = test_table.get_structure_for_keys(keys, use_parent)
|
|
||||||
query('''
|
|
||||||
DROP TABLE IF EXISTS test.{0}
|
|
||||||
'''.format(name))
|
|
||||||
|
|
||||||
create_query = "CREATE TABLE test.{0} ({1}) engine = Dictionary({0})".format(name, structure)
|
|
||||||
TSV(query(create_query))
|
|
||||||
|
|
||||||
for i in range(4):
|
|
||||||
result = TSV(query('select * from test.{0}'.format(name)))
|
|
||||||
diff = test_table.compare_by_keys(keys, result.lines, use_parent, add_not_found_rows=False)
|
|
||||||
print test_table.process_diff(diff)
|
|
||||||
assert not diff
|
|
||||||
|
|
||||||
key = []
|
|
||||||
for key_name in keys:
|
|
||||||
if key_name.endswith('str'):
|
|
||||||
key.append("'" + str(i) + "'")
|
|
||||||
else:
|
|
||||||
key.append(str(i))
|
|
||||||
if len(key) == 1:
|
|
||||||
key = 'toUInt64(' + str(i) + ')'
|
|
||||||
else:
|
|
||||||
key = str('(' + ','.join(key) + ')')
|
|
||||||
query("select dictGetUInt8('{0}', 'UInt8_', {1})".format(name, key))
|
|
||||||
|
|
||||||
result = TSV(query('select * from test.{0}'.format(name)))
|
|
||||||
diff = test_table.compare_by_keys(keys, result.lines, use_parent, add_not_found_rows=True)
|
|
||||||
print test_table.process_diff(diff)
|
|
||||||
assert not diff
|
|
||||||
|
|
||||||
|
|
||||||
def test_null_value(started_cluster):
|
|
||||||
query = instance.query
|
|
||||||
|
|
||||||
assert TSV(query("select dictGetUInt8('clickhouse_cache', 'UInt8_', toUInt64(12121212))")) == TSV("1")
|
|
||||||
assert TSV(query("select dictGetString('clickhouse_cache', 'String_', toUInt64(12121212))")) == TSV("implicit-default")
|
|
||||||
assert TSV(query("select dictGetDate('clickhouse_cache', 'Date_', toUInt64(12121212))")) == TSV("2015-11-25")
|
|
||||||
|
|
||||||
# Check, that empty null_value interprets as default value
|
|
||||||
assert TSV(query("select dictGetUInt64('clickhouse_cache', 'UInt64_', toUInt64(12121212))")) == TSV("0")
|
|
||||||
assert TSV(query("select dictGetDateTime('clickhouse_cache', 'DateTime_', toUInt64(12121212))")) == TSV("0000-00-00 00:00:00")
|
|
||||||
|
|
||||||
|
|
||||||
def test_dictionary_dependency(started_cluster):
|
|
||||||
query = instance.query
|
|
||||||
|
|
||||||
# dictionaries_lazy_load == false, so these dictionary are not loaded.
|
|
||||||
assert get_status('dep_x') == 'NOT_LOADED'
|
|
||||||
assert get_status('dep_y') == 'NOT_LOADED'
|
|
||||||
assert get_status('dep_z') == 'NOT_LOADED'
|
|
||||||
|
|
||||||
# Dictionary 'dep_x' depends on 'dep_z', which depends on 'dep_y'.
|
|
||||||
# So they all should be loaded at once.
|
|
||||||
assert query("SELECT dictGetString('dep_x', 'a', toUInt64(1))") == "air\n"
|
|
||||||
assert get_status('dep_x') == 'LOADED'
|
|
||||||
assert get_status('dep_y') == 'LOADED'
|
|
||||||
assert get_status('dep_z') == 'LOADED'
|
|
||||||
|
|
||||||
# Other dictionaries should work too.
|
|
||||||
assert query("SELECT dictGetString('dep_y', 'a', toUInt64(1))") == "air\n"
|
|
||||||
assert query("SELECT dictGetString('dep_z', 'a', toUInt64(1))") == "air\n"
|
|
||||||
|
|
||||||
assert query("SELECT dictGetString('dep_x', 'a', toUInt64(3))") == "XX\n"
|
|
||||||
assert query("SELECT dictGetString('dep_y', 'a', toUInt64(3))") == "YY\n"
|
|
||||||
assert query("SELECT dictGetString('dep_z', 'a', toUInt64(3))") == "ZZ\n"
|
|
||||||
|
|
||||||
# Update the source table.
|
|
||||||
query("insert into test.small_dict_source values (3, 'fire', 30, 8)")
|
|
||||||
|
|
||||||
# Wait for dictionaries to be reloaded.
|
|
||||||
assert_eq_with_retry(instance, "SELECT dictHas('dep_y', toUInt64(3))", "1", sleep_time = 2, retry_count = 10)
|
|
||||||
assert query("SELECT dictGetString('dep_x', 'a', toUInt64(3))") == "XX\n"
|
|
||||||
assert query("SELECT dictGetString('dep_y', 'a', toUInt64(3))") == "fire\n"
|
|
||||||
assert query("SELECT dictGetString('dep_z', 'a', toUInt64(3))") == "ZZ\n"
|
|
||||||
|
|
||||||
# dep_x and dep_z are updated only when there `intDiv(count(), 4)` is changed.
|
|
||||||
query("insert into test.small_dict_source values (4, 'ether', 404, 0.001)")
|
|
||||||
assert_eq_with_retry(instance, "SELECT dictHas('dep_x', toUInt64(4))", "1", sleep_time = 2, retry_count = 10)
|
|
||||||
assert query("SELECT dictGetString('dep_x', 'a', toUInt64(3))") == "fire\n"
|
|
||||||
assert query("SELECT dictGetString('dep_y', 'a', toUInt64(3))") == "fire\n"
|
|
||||||
assert query("SELECT dictGetString('dep_z', 'a', toUInt64(3))") == "fire\n"
|
|
||||||
assert query("SELECT dictGetString('dep_x', 'a', toUInt64(4))") == "ether\n"
|
|
||||||
assert query("SELECT dictGetString('dep_y', 'a', toUInt64(4))") == "ether\n"
|
|
||||||
assert query("SELECT dictGetString('dep_z', 'a', toUInt64(4))") == "ether\n"
|
|
||||||
|
|
||||||
|
|
||||||
def test_reload_while_loading(started_cluster):
|
|
||||||
query = instance.query
|
|
||||||
|
|
||||||
# dictionaries_lazy_load == false, so this dictionary is not loaded.
|
|
||||||
assert get_status('longload') == "NOT_LOADED"
|
|
||||||
assert get_loading_duration('longload') == 0
|
|
||||||
|
|
||||||
# It's not possible to get a value from the dictionary within 1.0 second, so the following query fails by timeout.
|
|
||||||
assert query("SELECT dictGetInt32('longload', 'a', toUInt64(5))", timeout = 1, ignore_error = True) == ""
|
|
||||||
|
|
||||||
# The dictionary is now loading.
|
|
||||||
assert get_status('longload') == "LOADING"
|
|
||||||
start_time, duration = get_loading_start_time('longload'), get_loading_duration('longload')
|
|
||||||
assert duration > 0
|
|
||||||
|
|
||||||
time.sleep(0.5) # Still loading.
|
|
||||||
assert get_status('longload') == "LOADING"
|
|
||||||
prev_start_time, prev_duration = start_time, duration
|
|
||||||
start_time, duration = get_loading_start_time('longload'), get_loading_duration('longload')
|
|
||||||
assert start_time == prev_start_time
|
|
||||||
assert duration >= prev_duration
|
|
||||||
|
|
||||||
# SYSTEM RELOAD DICTIONARY should restart loading.
|
|
||||||
query("SYSTEM RELOAD DICTIONARY 'longload'")
|
|
||||||
assert get_status('longload') == "LOADING"
|
|
||||||
prev_start_time, prev_duration = start_time, duration
|
|
||||||
start_time, duration = get_loading_start_time('longload'), get_loading_duration('longload')
|
|
||||||
assert start_time > prev_start_time
|
|
||||||
assert duration < prev_duration
|
|
||||||
|
|
||||||
time.sleep(0.5) # Still loading.
|
|
||||||
assert get_status('longload') == "LOADING"
|
|
||||||
prev_start_time, prev_duration = start_time, duration
|
|
||||||
start_time, duration = get_loading_start_time('longload'), get_loading_duration('longload')
|
|
||||||
assert start_time == prev_start_time
|
|
||||||
assert duration >= prev_duration
|
|
||||||
|
|
||||||
# SYSTEM RELOAD DICTIONARIES should restart loading again.
|
|
||||||
query("SYSTEM RELOAD DICTIONARIES")
|
|
||||||
assert get_status('longload') == "LOADING"
|
|
||||||
prev_start_time, prev_duration = start_time, duration
|
|
||||||
start_time, duration = get_loading_start_time('longload'), get_loading_duration('longload')
|
|
||||||
assert start_time > prev_start_time
|
|
||||||
assert duration < prev_duration
|
|
||||||
|
|
||||||
# Changing the configuration file should restart loading one more time.
|
|
||||||
replace_in_file_in_container('/etc/clickhouse-server/config.d/dictionary_preset_longload.xml', 'sleep 100', 'sleep 0')
|
|
||||||
time.sleep(5) # Configuration files are reloaded once in 5 seconds.
|
|
||||||
|
|
||||||
# This time loading should finish quickly.
|
|
||||||
assert get_status('longload') == "LOADED"
|
|
||||||
assert query("SELECT dictGetInt32('longload', 'a', toUInt64(5))") == "6\n"
|
|
||||||
|
|
||||||
|
|
||||||
def test_reload_after_loading(started_cluster):
|
|
||||||
query = instance.query
|
|
||||||
|
|
||||||
assert query("SELECT dictGetInt32('cmd', 'a', toUInt64(7))") == "8\n"
|
|
||||||
assert query("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "10\n"
|
|
||||||
|
|
||||||
# Change the dictionaries' data.
|
|
||||||
replace_in_file_in_container('/etc/clickhouse-server/config.d/dictionary_preset_cmd.xml', '8', '81')
|
|
||||||
replace_in_file_in_container('/etc/clickhouse-server/config.d/dictionary_preset_file.txt', '10', '101')
|
|
||||||
|
|
||||||
# SYSTEM RELOAD 'name' reloads only the specified dictionary.
|
|
||||||
query("SYSTEM RELOAD DICTIONARY 'cmd'")
|
|
||||||
assert query("SELECT dictGetInt32('cmd', 'a', toUInt64(7))") == "81\n"
|
|
||||||
assert query("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "10\n"
|
|
||||||
|
|
||||||
query("SYSTEM RELOAD DICTIONARY 'file'")
|
|
||||||
assert query("SELECT dictGetInt32('cmd', 'a', toUInt64(7))") == "81\n"
|
|
||||||
assert query("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "101\n"
|
|
||||||
|
|
||||||
# SYSTEM RELOAD DICTIONARIES reloads all loaded dictionaries.
|
|
||||||
replace_in_file_in_container('/etc/clickhouse-server/config.d/dictionary_preset_cmd.xml', '81', '82')
|
|
||||||
replace_in_file_in_container('/etc/clickhouse-server/config.d/dictionary_preset_file.txt', '101', '102')
|
|
||||||
query("SYSTEM RELOAD DICTIONARIES")
|
|
||||||
assert query("SELECT dictGetInt32('cmd', 'a', toUInt64(7))") == "82\n"
|
|
||||||
assert query("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "102\n"
|
|
||||||
|
|
||||||
# Configuration files are reloaded and lifetimes are checked automatically once in 5 seconds.
|
|
||||||
replace_in_file_in_container('/etc/clickhouse-server/config.d/dictionary_preset_cmd.xml', '82', '83')
|
|
||||||
replace_in_file_in_container('/etc/clickhouse-server/config.d/dictionary_preset_file.txt', '102', '103')
|
|
||||||
time.sleep(5)
|
|
||||||
assert query("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "103\n"
|
|
||||||
assert query("SELECT dictGetInt32('cmd', 'a', toUInt64(7))") == "83\n"
|
|
||||||
|
|
||||||
|
|
||||||
def test_reload_after_fail_by_system_reload(started_cluster):
|
|
||||||
query = instance.query
|
|
||||||
|
|
||||||
# dictionaries_lazy_load == false, so this dictionary is not loaded.
|
|
||||||
assert get_status("no_file") == "NOT_LOADED"
|
|
||||||
|
|
||||||
# We expect an error because the file source doesn't exist.
|
|
||||||
expected_error = "No such file"
|
|
||||||
assert expected_error in instance.query_and_get_error("SELECT dictGetInt32('no_file', 'a', toUInt64(9))")
|
|
||||||
assert get_status("no_file") == "FAILED"
|
|
||||||
|
|
||||||
# SYSTEM RELOAD should not change anything now, the status is still FAILED.
|
|
||||||
query("SYSTEM RELOAD DICTIONARY 'no_file'")
|
|
||||||
assert expected_error in instance.query_and_get_error("SELECT dictGetInt32('no_file', 'a', toUInt64(9))")
|
|
||||||
assert get_status("no_file") == "FAILED"
|
|
||||||
|
|
||||||
# Creating the file source makes the dictionary able to load.
|
|
||||||
instance.copy_file_to_container(os.path.join(SCRIPT_DIR, "configs/dictionaries/dictionary_preset_file.txt"), "/etc/clickhouse-server/config.d/dictionary_preset_no_file.txt")
|
|
||||||
query("SYSTEM RELOAD DICTIONARY 'no_file'")
|
|
||||||
query("SELECT dictGetInt32('no_file', 'a', toUInt64(9))") == "10\n"
|
|
||||||
assert get_status("no_file") == "LOADED"
|
|
||||||
|
|
||||||
# Removing the file source should not spoil the loaded dictionary.
|
|
||||||
instance.exec_in_container("rm /etc/clickhouse-server/config.d/dictionary_preset_no_file.txt")
|
|
||||||
query("SYSTEM RELOAD DICTIONARY 'no_file'")
|
|
||||||
query("SELECT dictGetInt32('no_file', 'a', toUInt64(9))") == "10\n"
|
|
||||||
assert get_status("no_file") == "LOADED"
|
|
||||||
|
|
||||||
|
|
||||||
def test_reload_after_fail_by_timer(started_cluster):
|
|
||||||
query = instance.query
|
|
||||||
|
|
||||||
# dictionaries_lazy_load == false, so this dictionary is not loaded.
|
|
||||||
assert get_status("no_file_2") == "NOT_LOADED"
|
|
||||||
|
|
||||||
# We expect an error because the file source doesn't exist.
|
|
||||||
expected_error = "No such file"
|
|
||||||
assert expected_error in instance.query_and_get_error("SELECT dictGetInt32('no_file_2', 'a', toUInt64(9))")
|
|
||||||
assert get_status("no_file_2") == "FAILED"
|
|
||||||
|
|
||||||
# Passed time should not change anything now, the status is still FAILED.
|
|
||||||
time.sleep(6);
|
|
||||||
assert expected_error in instance.query_and_get_error("SELECT dictGetInt32('no_file_2', 'a', toUInt64(9))")
|
|
||||||
assert get_status("no_file_2") == "FAILED"
|
|
||||||
|
|
||||||
# Creating the file source makes the dictionary able to load.
|
|
||||||
instance.copy_file_to_container(os.path.join(SCRIPT_DIR, "configs/dictionaries/dictionary_preset_file.txt"), "/etc/clickhouse-server/config.d/dictionary_preset_no_file_2.txt")
|
|
||||||
time.sleep(6);
|
|
||||||
query("SELECT dictGetInt32('no_file_2', 'a', toUInt64(9))") == "10\n"
|
|
||||||
assert get_status("no_file_2") == "LOADED"
|
|
||||||
|
|
||||||
# Removing the file source should not spoil the loaded dictionary.
|
|
||||||
instance.exec_in_container("rm /etc/clickhouse-server/config.d/dictionary_preset_no_file_2.txt")
|
|
||||||
time.sleep(6);
|
|
||||||
query("SELECT dictGetInt32('no_file_2', 'a', toUInt64(9))") == "10\n"
|
|
||||||
assert get_status("no_file_2") == "LOADED"
|
|
||||||
|
|
||||||
|
|
||||||
def test_reload_after_fail_in_cache_dictionary(started_cluster):
|
|
||||||
query = instance.query
|
|
||||||
query_and_get_error = instance.query_and_get_error
|
|
||||||
|
|
||||||
# Can't get a value from the cache dictionary because the source (table `test.xypairs`) doesn't respond.
|
|
||||||
expected_error = "Table test.xypairs doesn't exist"
|
|
||||||
assert expected_error in query_and_get_error("SELECT dictGetUInt64('cache_xypairs', 'y', toUInt64(1))")
|
|
||||||
assert get_status("cache_xypairs") == "LOADED"
|
|
||||||
assert expected_error in get_last_exception("cache_xypairs")
|
|
||||||
|
|
||||||
# Create table `test.xypairs`.
|
|
||||||
query('''
|
|
||||||
drop table if exists test.xypairs;
|
|
||||||
create table test.xypairs (x UInt64, y UInt64) engine=Log;
|
|
||||||
insert into test.xypairs values (1, 56), (3, 78);
|
|
||||||
''')
|
|
||||||
|
|
||||||
# Cache dictionary now works.
|
|
||||||
assert_eq_with_retry(instance, "SELECT dictGet('cache_xypairs', 'y', toUInt64(1))", "56", ignore_error=True)
|
|
||||||
query("SELECT dictGet('cache_xypairs', 'y', toUInt64(2))") == "0"
|
|
||||||
assert get_last_exception("cache_xypairs") == ""
|
|
||||||
|
|
||||||
# Drop table `test.xypairs`.
|
|
||||||
query('drop table if exists test.xypairs')
|
|
||||||
|
|
||||||
# Values are cached so we can get them.
|
|
||||||
query("SELECT dictGet('cache_xypairs', 'y', toUInt64(1))") == "56"
|
|
||||||
query("SELECT dictGet('cache_xypairs', 'y', toUInt64(2))") == "0"
|
|
||||||
assert get_last_exception("cache_xypairs") == ""
|
|
||||||
|
|
||||||
# But we can't get a value from the source table which isn't cached.
|
|
||||||
assert expected_error in query_and_get_error("SELECT dictGetUInt64('cache_xypairs', 'y', toUInt64(3))")
|
|
||||||
assert expected_error in get_last_exception("cache_xypairs")
|
|
||||||
|
|
||||||
# Passed time should not spoil the cache.
|
|
||||||
time.sleep(5);
|
|
||||||
query("SELECT dictGet('cache_xypairs', 'y', toUInt64(1))") == "56"
|
|
||||||
query("SELECT dictGet('cache_xypairs', 'y', toUInt64(2))") == "0"
|
|
||||||
assert expected_error in query_and_get_error("SELECT dictGetUInt64('cache_xypairs', 'y', toUInt64(3))")
|
|
||||||
assert expected_error in get_last_exception("cache_xypairs")
|
|
||||||
|
|
||||||
# Create table `test.xypairs` again with changed values.
|
|
||||||
query('''
|
|
||||||
drop table if exists test.xypairs;
|
|
||||||
create table test.xypairs (x UInt64, y UInt64) engine=Log;
|
|
||||||
insert into test.xypairs values (1, 57), (3, 79);
|
|
||||||
''')
|
|
||||||
|
|
||||||
# The cache dictionary returns new values now.
|
|
||||||
assert_eq_with_retry(instance, "SELECT dictGet('cache_xypairs', 'y', toUInt64(1))", "57")
|
|
||||||
query("SELECT dictGet('cache_xypairs', 'y', toUInt64(2))") == "0"
|
|
||||||
query("SELECT dictGet('cache_xypairs', 'y', toUInt64(3))") == "79"
|
|
||||||
assert get_last_exception("cache_xypairs") == ""
|
|
@ -0,0 +1,30 @@
|
|||||||
|
<?xml version="1.0"?>
|
||||||
|
<yandex>
|
||||||
|
<logger>
|
||||||
|
<level>trace</level>
|
||||||
|
<log>/var/log/clickhouse-server/clickhouse-server.log</log>
|
||||||
|
<errorlog>/var/log/clickhouse-server/clickhouse-server.err.log</errorlog>
|
||||||
|
<size>1000M</size>
|
||||||
|
<count>10</count>
|
||||||
|
</logger>
|
||||||
|
|
||||||
|
<tcp_port>9000</tcp_port>
|
||||||
|
<listen_host>127.0.0.1</listen_host>
|
||||||
|
|
||||||
|
<openSSL>
|
||||||
|
<client>
|
||||||
|
<cacheSessions>true</cacheSessions>
|
||||||
|
<verificationMode>none</verificationMode>
|
||||||
|
<invalidCertificateHandler>
|
||||||
|
<name>AcceptCertificateHandler</name>
|
||||||
|
</invalidCertificateHandler>
|
||||||
|
</client>
|
||||||
|
</openSSL>
|
||||||
|
|
||||||
|
<max_concurrent_queries>500</max_concurrent_queries>
|
||||||
|
<mark_cache_size>5368709120</mark_cache_size>
|
||||||
|
<path>./clickhouse/</path>
|
||||||
|
<users_config>users.xml</users_config>
|
||||||
|
|
||||||
|
<dictionaries_config>/etc/clickhouse-server/config.d/*.xml</dictionaries_config>
|
||||||
|
</yandex>
|
@ -8,7 +8,7 @@
|
|||||||
<user>default</user>
|
<user>default</user>
|
||||||
<password></password>
|
<password></password>
|
||||||
<db>test</db>
|
<db>test</db>
|
||||||
<table>small_dict_source</table>
|
<table>elements</table>
|
||||||
</clickhouse>
|
</clickhouse>
|
||||||
</source>
|
</source>
|
||||||
<lifetime>5</lifetime>
|
<lifetime>5</lifetime>
|
@ -0,0 +1,76 @@
|
|||||||
|
import pytest
|
||||||
|
import os
|
||||||
|
from helpers.cluster import ClickHouseCluster
|
||||||
|
from helpers.test_tools import assert_eq_with_retry
|
||||||
|
|
||||||
|
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
|
||||||
|
DICTIONARY_FILES = ['configs/dictionaries/dep_x.xml', 'configs/dictionaries/dep_y.xml', 'configs/dictionaries/dep_z.xml']
|
||||||
|
|
||||||
|
cluster = ClickHouseCluster(__file__, base_configs_dir=os.path.join(SCRIPT_DIR, 'configs'))
|
||||||
|
instance = cluster.add_instance('instance', main_configs=DICTIONARY_FILES)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
|
||||||
|
def started_cluster():
|
||||||
|
try:
|
||||||
|
cluster.start()
|
||||||
|
|
||||||
|
instance.query('''
|
||||||
|
CREATE DATABASE IF NOT EXISTS dict ENGINE=Dictionary;
|
||||||
|
CREATE DATABASE IF NOT EXISTS test;
|
||||||
|
DROP TABLE IF EXISTS test.elements;
|
||||||
|
CREATE TABLE test.elements (id UInt64, a String, b Int32, c Float64) ENGINE=Log;
|
||||||
|
INSERT INTO test.elements VALUES (0, 'water', 10, 1), (1, 'air', 40, 0.01), (2, 'earth', 100, 1.7);
|
||||||
|
''')
|
||||||
|
|
||||||
|
yield cluster
|
||||||
|
|
||||||
|
finally:
|
||||||
|
cluster.shutdown()
|
||||||
|
|
||||||
|
|
||||||
|
def get_status(dictionary_name):
|
||||||
|
return instance.query("SELECT status FROM system.dictionaries WHERE name='" + dictionary_name + "'").rstrip("\n")
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_data(started_cluster):
|
||||||
|
query = instance.query
|
||||||
|
|
||||||
|
# dictionaries_lazy_load == false, so these dictionary are not loaded.
|
||||||
|
assert get_status('dep_x') == 'NOT_LOADED'
|
||||||
|
assert get_status('dep_y') == 'NOT_LOADED'
|
||||||
|
assert get_status('dep_z') == 'NOT_LOADED'
|
||||||
|
|
||||||
|
# Dictionary 'dep_x' depends on 'dep_z', which depends on 'dep_y'.
|
||||||
|
# So they all should be loaded at once.
|
||||||
|
assert query("SELECT dictGetString('dep_x', 'a', toUInt64(1))") == "air\n"
|
||||||
|
assert get_status('dep_x') == 'LOADED'
|
||||||
|
assert get_status('dep_y') == 'LOADED'
|
||||||
|
assert get_status('dep_z') == 'LOADED'
|
||||||
|
|
||||||
|
# Other dictionaries should work too.
|
||||||
|
assert query("SELECT dictGetString('dep_y', 'a', toUInt64(1))") == "air\n"
|
||||||
|
assert query("SELECT dictGetString('dep_z', 'a', toUInt64(1))") == "air\n"
|
||||||
|
|
||||||
|
assert query("SELECT dictGetString('dep_x', 'a', toUInt64(3))") == "XX\n"
|
||||||
|
assert query("SELECT dictGetString('dep_y', 'a', toUInt64(3))") == "YY\n"
|
||||||
|
assert query("SELECT dictGetString('dep_z', 'a', toUInt64(3))") == "ZZ\n"
|
||||||
|
|
||||||
|
# Update the source table.
|
||||||
|
query("INSERT INTO test.elements VALUES (3, 'fire', 30, 8)")
|
||||||
|
|
||||||
|
# Wait for dictionaries to be reloaded.
|
||||||
|
assert_eq_with_retry(instance, "SELECT dictHas('dep_y', toUInt64(3))", "1", sleep_time = 2, retry_count = 10)
|
||||||
|
assert query("SELECT dictGetString('dep_x', 'a', toUInt64(3))") == "XX\n"
|
||||||
|
assert query("SELECT dictGetString('dep_y', 'a', toUInt64(3))") == "fire\n"
|
||||||
|
assert query("SELECT dictGetString('dep_z', 'a', toUInt64(3))") == "ZZ\n"
|
||||||
|
|
||||||
|
# dep_x and dep_z are updated only when there `intDiv(count(), 4)` is changed.
|
||||||
|
query("INSERT INTO test.elements VALUES (4, 'ether', 404, 0.001)")
|
||||||
|
assert_eq_with_retry(instance, "SELECT dictHas('dep_x', toUInt64(4))", "1", sleep_time = 2, retry_count = 10)
|
||||||
|
assert query("SELECT dictGetString('dep_x', 'a', toUInt64(3))") == "fire\n"
|
||||||
|
assert query("SELECT dictGetString('dep_y', 'a', toUInt64(3))") == "fire\n"
|
||||||
|
assert query("SELECT dictGetString('dep_z', 'a', toUInt64(3))") == "fire\n"
|
||||||
|
assert query("SELECT dictGetString('dep_x', 'a', toUInt64(4))") == "ether\n"
|
||||||
|
assert query("SELECT dictGetString('dep_y', 'a', toUInt64(4))") == "ether\n"
|
||||||
|
assert query("SELECT dictGetString('dep_z', 'a', toUInt64(4))") == "ether\n"
|
@ -0,0 +1,113 @@
|
|||||||
|
<yandex>
|
||||||
|
<dictionary>
|
||||||
|
<name>cache</name>
|
||||||
|
|
||||||
|
<source>
|
||||||
|
<clickhouse>
|
||||||
|
<host>localhost</host>
|
||||||
|
<port>9000</port>
|
||||||
|
<user>default</user>
|
||||||
|
<password></password>
|
||||||
|
<db>test</db>
|
||||||
|
<table>source</table>
|
||||||
|
</clickhouse>
|
||||||
|
</source>
|
||||||
|
|
||||||
|
<lifetime>0</lifetime>
|
||||||
|
|
||||||
|
<layout>
|
||||||
|
<cache><size_in_cells>128</size_in_cells></cache>
|
||||||
|
</layout>
|
||||||
|
|
||||||
|
<structure>
|
||||||
|
<id>
|
||||||
|
<name>id</name>
|
||||||
|
</id>
|
||||||
|
|
||||||
|
<attribute>
|
||||||
|
<name>UInt8_</name>
|
||||||
|
<type>UInt8</type>
|
||||||
|
<null_value>1</null_value>
|
||||||
|
</attribute>
|
||||||
|
|
||||||
|
<attribute>
|
||||||
|
<name>UInt16_</name>
|
||||||
|
<type>UInt16</type>
|
||||||
|
<null_value>1</null_value>
|
||||||
|
</attribute>
|
||||||
|
|
||||||
|
<attribute>
|
||||||
|
<name>UInt32_</name>
|
||||||
|
<type>UInt32</type>
|
||||||
|
<null_value>1</null_value>
|
||||||
|
</attribute>
|
||||||
|
|
||||||
|
<attribute>
|
||||||
|
<name>UInt64_</name>
|
||||||
|
<type>UInt64</type>
|
||||||
|
<null_value></null_value>
|
||||||
|
</attribute>
|
||||||
|
|
||||||
|
<attribute>
|
||||||
|
<name>Int8_</name>
|
||||||
|
<type>Int8</type>
|
||||||
|
<null_value>-1</null_value>
|
||||||
|
</attribute>
|
||||||
|
|
||||||
|
<attribute>
|
||||||
|
<name>Int16_</name>
|
||||||
|
<type>Int16</type>
|
||||||
|
<null_value>-1</null_value>
|
||||||
|
</attribute>
|
||||||
|
|
||||||
|
<attribute>
|
||||||
|
<name>Int32_</name>
|
||||||
|
<type>Int32</type>
|
||||||
|
<null_value>-1</null_value>
|
||||||
|
</attribute>
|
||||||
|
|
||||||
|
<attribute>
|
||||||
|
<name>Int64_</name>
|
||||||
|
<type>Int64</type>
|
||||||
|
<null_value>-1</null_value>
|
||||||
|
</attribute>
|
||||||
|
|
||||||
|
<attribute>
|
||||||
|
<name>Float32_</name>
|
||||||
|
<type>Float32</type>
|
||||||
|
<null_value>2.71828</null_value>
|
||||||
|
</attribute>
|
||||||
|
|
||||||
|
<attribute>
|
||||||
|
<name>Float64_</name>
|
||||||
|
<type>Float64</type>
|
||||||
|
<null_value>2.71828</null_value>
|
||||||
|
</attribute>
|
||||||
|
|
||||||
|
<attribute>
|
||||||
|
<name>String_</name>
|
||||||
|
<type>String</type>
|
||||||
|
<null_value>implicit-default</null_value>
|
||||||
|
</attribute>
|
||||||
|
|
||||||
|
<attribute>
|
||||||
|
<name>Date_</name>
|
||||||
|
<type>Date</type>
|
||||||
|
<null_value>2015-11-25</null_value>
|
||||||
|
</attribute>
|
||||||
|
|
||||||
|
<attribute>
|
||||||
|
<name>DateTime_</name>
|
||||||
|
<type>DateTime</type>
|
||||||
|
<null_value></null_value>
|
||||||
|
</attribute>
|
||||||
|
|
||||||
|
<attribute>
|
||||||
|
<name>Parent</name>
|
||||||
|
<type>UInt64</type>
|
||||||
|
<hierarchical>true</hierarchical>
|
||||||
|
<null_value>0</null_value>
|
||||||
|
</attribute>
|
||||||
|
</structure>
|
||||||
|
</dictionary>
|
||||||
|
</yandex>
|
@ -0,0 +1,23 @@
|
|||||||
|
<?xml version="1.0"?>
|
||||||
|
<yandex>
|
||||||
|
<profiles>
|
||||||
|
<default>
|
||||||
|
</default>
|
||||||
|
</profiles>
|
||||||
|
|
||||||
|
<users>
|
||||||
|
<default>
|
||||||
|
<password></password>
|
||||||
|
<networks incl="networks" replace="replace">
|
||||||
|
<ip>::/0</ip>
|
||||||
|
</networks>
|
||||||
|
<profile>default</profile>
|
||||||
|
<quota>default</quota>
|
||||||
|
</default>
|
||||||
|
</users>
|
||||||
|
|
||||||
|
<quotas>
|
||||||
|
<default>
|
||||||
|
</default>
|
||||||
|
</quotas>
|
||||||
|
</yandex>
|
45
dbms/tests/integration/test_dictionaries_null_value/test.py
Normal file
45
dbms/tests/integration/test_dictionaries_null_value/test.py
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
import pytest
|
||||||
|
import os
|
||||||
|
from helpers.cluster import ClickHouseCluster
|
||||||
|
from helpers.test_tools import TSV, assert_eq_with_retry
|
||||||
|
|
||||||
|
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
|
||||||
|
DICTIONARY_FILES = ['configs/dictionaries/cache.xml']
|
||||||
|
|
||||||
|
cluster = ClickHouseCluster(__file__, base_configs_dir=os.path.join(SCRIPT_DIR, 'configs'))
|
||||||
|
instance = cluster.add_instance('instance', main_configs=DICTIONARY_FILES)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
|
||||||
|
def started_cluster():
|
||||||
|
try:
|
||||||
|
cluster.start()
|
||||||
|
|
||||||
|
instance.query('''
|
||||||
|
CREATE DATABASE IF NOT EXISTS test;
|
||||||
|
DROP TABLE IF EXISTS test.source;
|
||||||
|
CREATE TABLE test.source (id UInt64, key0 UInt8, key0_str String, key1 UInt8,
|
||||||
|
StartDate Date, EndDate Date,
|
||||||
|
UInt8_ UInt8, UInt16_ UInt16, UInt32_ UInt32, UInt64_ UInt64,
|
||||||
|
Int8_ Int8, Int16_ Int16, Int32_ Int32, Int64_ Int64,
|
||||||
|
Float32_ Float32, Float64_ Float64,
|
||||||
|
String_ String,
|
||||||
|
Date_ Date, DateTime_ DateTime, Parent UInt64) ENGINE=Log;
|
||||||
|
''')
|
||||||
|
|
||||||
|
yield cluster
|
||||||
|
|
||||||
|
finally:
|
||||||
|
cluster.shutdown()
|
||||||
|
|
||||||
|
|
||||||
|
def test_null_value(started_cluster):
|
||||||
|
query = instance.query
|
||||||
|
|
||||||
|
assert query("select dictGetUInt8('cache', 'UInt8_', toUInt64(12121212))") == "1\n"
|
||||||
|
assert query("select dictGetString('cache', 'String_', toUInt64(12121212))") == "implicit-default\n"
|
||||||
|
assert query("select dictGetDate('cache', 'Date_', toUInt64(12121212))") == "2015-11-25\n"
|
||||||
|
|
||||||
|
# Check, that empty null_value interprets as default value
|
||||||
|
assert query("select dictGetUInt64('cache', 'UInt64_', toUInt64(12121212))") == "0\n"
|
||||||
|
assert query("select dictGetDateTime('cache', 'DateTime_', toUInt64(12121212))") == "0000-00-00 00:00:00\n"
|
@ -0,0 +1,30 @@
|
|||||||
|
<?xml version="1.0"?>
|
||||||
|
<yandex>
|
||||||
|
<logger>
|
||||||
|
<level>trace</level>
|
||||||
|
<log>/var/log/clickhouse-server/clickhouse-server.log</log>
|
||||||
|
<errorlog>/var/log/clickhouse-server/clickhouse-server.err.log</errorlog>
|
||||||
|
<size>1000M</size>
|
||||||
|
<count>10</count>
|
||||||
|
</logger>
|
||||||
|
|
||||||
|
<tcp_port>9000</tcp_port>
|
||||||
|
<listen_host>127.0.0.1</listen_host>
|
||||||
|
|
||||||
|
<openSSL>
|
||||||
|
<client>
|
||||||
|
<cacheSessions>true</cacheSessions>
|
||||||
|
<verificationMode>none</verificationMode>
|
||||||
|
<invalidCertificateHandler>
|
||||||
|
<name>AcceptCertificateHandler</name>
|
||||||
|
</invalidCertificateHandler>
|
||||||
|
</client>
|
||||||
|
</openSSL>
|
||||||
|
|
||||||
|
<max_concurrent_queries>500</max_concurrent_queries>
|
||||||
|
<mark_cache_size>5368709120</mark_cache_size>
|
||||||
|
<path>./clickhouse/</path>
|
||||||
|
<users_config>users.xml</users_config>
|
||||||
|
|
||||||
|
<dictionaries_config>/etc/clickhouse-server/config.d/*.xml</dictionaries_config>
|
||||||
|
</yandex>
|
3
dbms/tests/integration/test_dictionaries_select_all/configs/dictionaries/.gitignore
vendored
Normal file
3
dbms/tests/integration/test_dictionaries_select_all/configs/dictionaries/.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
*
|
||||||
|
!.gitignore
|
||||||
|
!source.tsv
|
@ -0,0 +1,23 @@
|
|||||||
|
<?xml version="1.0"?>
|
||||||
|
<yandex>
|
||||||
|
<profiles>
|
||||||
|
<default>
|
||||||
|
</default>
|
||||||
|
</profiles>
|
||||||
|
|
||||||
|
<users>
|
||||||
|
<default>
|
||||||
|
<password></password>
|
||||||
|
<networks incl="networks" replace="replace">
|
||||||
|
<ip>::/0</ip>
|
||||||
|
</networks>
|
||||||
|
<profile>default</profile>
|
||||||
|
<quota>default</quota>
|
||||||
|
</default>
|
||||||
|
</users>
|
||||||
|
|
||||||
|
<quotas>
|
||||||
|
<default>
|
||||||
|
</default>
|
||||||
|
</quotas>
|
||||||
|
</yandex>
|
@ -12,13 +12,6 @@ types = [
|
|||||||
'Date', 'DateTime'
|
'Date', 'DateTime'
|
||||||
]
|
]
|
||||||
|
|
||||||
explicit_defaults = [
|
|
||||||
'42', '42', '42', '42',
|
|
||||||
'-42', '-42', '-42', '-42',
|
|
||||||
'1.5', '1.6',
|
|
||||||
"'explicit-default'",
|
|
||||||
"'2015-01-01'", "'2015-01-01 00:00:00'"
|
|
||||||
]
|
|
||||||
|
|
||||||
implicit_defaults = [
|
implicit_defaults = [
|
||||||
'1', '1', '1', '',
|
'1', '1', '1', '',
|
||||||
@ -182,9 +175,6 @@ def generate_dictionaries(path, structure):
|
|||||||
|
|
||||||
file_names = []
|
file_names = []
|
||||||
|
|
||||||
# Add ready dictionaries.
|
|
||||||
file_names.extend(glob.glob(os.path.join(path, "*dictionary_preset*")))
|
|
||||||
|
|
||||||
# Generate dictionaries.
|
# Generate dictionaries.
|
||||||
for (name, key_idx, has_parent), (source, layout) in zip(structure, sources_and_layouts):
|
for (name, key_idx, has_parent), (source, layout) in zip(structure, sources_and_layouts):
|
||||||
filename = os.path.join(path, 'dictionary_%s.xml' % name)
|
filename = os.path.join(path, 'dictionary_%s.xml' % name)
|
122
dbms/tests/integration/test_dictionaries_select_all/test.py
Normal file
122
dbms/tests/integration/test_dictionaries_select_all/test.py
Normal file
@ -0,0 +1,122 @@
|
|||||||
|
import pytest
|
||||||
|
import os
|
||||||
|
from helpers.cluster import ClickHouseCluster
|
||||||
|
from helpers.test_tools import TSV, assert_eq_with_retry
|
||||||
|
from generate_dictionaries import generate_structure, generate_dictionaries, DictionaryTestTable
|
||||||
|
|
||||||
|
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
|
||||||
|
|
||||||
|
cluster = None
|
||||||
|
instance = None
|
||||||
|
test_table = None
|
||||||
|
|
||||||
|
|
||||||
|
def setup_module(module):
|
||||||
|
global cluster
|
||||||
|
global instance
|
||||||
|
global test_table
|
||||||
|
|
||||||
|
structure = generate_structure()
|
||||||
|
dictionary_files = generate_dictionaries(os.path.join(SCRIPT_DIR, 'configs/dictionaries'), structure)
|
||||||
|
|
||||||
|
cluster = ClickHouseCluster(__file__, base_configs_dir=os.path.join(SCRIPT_DIR, 'configs'))
|
||||||
|
instance = cluster.add_instance('instance', main_configs=dictionary_files)
|
||||||
|
test_table = DictionaryTestTable(os.path.join(SCRIPT_DIR, 'configs/dictionaries/source.tsv'))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
|
||||||
|
def started_cluster():
|
||||||
|
try:
|
||||||
|
cluster.start()
|
||||||
|
test_table.create_clickhouse_source(instance)
|
||||||
|
for line in TSV(instance.query('select name from system.dictionaries')).lines:
|
||||||
|
print line,
|
||||||
|
|
||||||
|
yield cluster
|
||||||
|
|
||||||
|
finally:
|
||||||
|
cluster.shutdown()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(params=[
|
||||||
|
# name, keys, use_parent
|
||||||
|
('clickhouse_hashed', ('id',), True),
|
||||||
|
('clickhouse_flat', ('id',), True),
|
||||||
|
('clickhouse_complex_integers_key_hashed', ('key0', 'key1'), False),
|
||||||
|
('clickhouse_complex_mixed_key_hashed', ('key0_str', 'key1'), False),
|
||||||
|
('clickhouse_range_hashed', ('id', 'StartDate', 'EndDate'), False),
|
||||||
|
],
|
||||||
|
ids=['clickhouse_hashed', 'clickhouse_flat',
|
||||||
|
'clickhouse_complex_integers_key_hashed',
|
||||||
|
'clickhouse_complex_mixed_key_hashed',
|
||||||
|
'clickhouse_range_hashed']
|
||||||
|
)
|
||||||
|
def dictionary_structure(started_cluster, request):
|
||||||
|
return request.param
|
||||||
|
|
||||||
|
|
||||||
|
def test_select_all(dictionary_structure):
|
||||||
|
name, keys, use_parent = dictionary_structure
|
||||||
|
query = instance.query
|
||||||
|
|
||||||
|
structure = test_table.get_structure_for_keys(keys, use_parent)
|
||||||
|
query('''
|
||||||
|
DROP TABLE IF EXISTS test.{0}
|
||||||
|
'''.format(name))
|
||||||
|
|
||||||
|
create_query = "CREATE TABLE test.{0} ({1}) engine = Dictionary({0})".format(name, structure)
|
||||||
|
TSV(query(create_query))
|
||||||
|
|
||||||
|
result = TSV(query('select * from test.{0}'.format(name)))
|
||||||
|
|
||||||
|
diff = test_table.compare_by_keys(keys, result.lines, use_parent, add_not_found_rows=True)
|
||||||
|
print test_table.process_diff(diff)
|
||||||
|
assert not diff
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(params=[
|
||||||
|
# name, keys, use_parent
|
||||||
|
('clickhouse_cache', ('id',), True),
|
||||||
|
('clickhouse_complex_integers_key_cache', ('key0', 'key1'), False),
|
||||||
|
('clickhouse_complex_mixed_key_cache', ('key0_str', 'key1'), False)
|
||||||
|
],
|
||||||
|
ids=['clickhouse_cache', 'clickhouse_complex_integers_key_cache', 'clickhouse_complex_mixed_key_cache']
|
||||||
|
)
|
||||||
|
def cached_dictionary_structure(started_cluster, request):
|
||||||
|
return request.param
|
||||||
|
|
||||||
|
|
||||||
|
def test_select_all_from_cached(cached_dictionary_structure):
|
||||||
|
name, keys, use_parent = cached_dictionary_structure
|
||||||
|
query = instance.query
|
||||||
|
|
||||||
|
structure = test_table.get_structure_for_keys(keys, use_parent)
|
||||||
|
query('''
|
||||||
|
DROP TABLE IF EXISTS test.{0}
|
||||||
|
'''.format(name))
|
||||||
|
|
||||||
|
create_query = "CREATE TABLE test.{0} ({1}) engine = Dictionary({0})".format(name, structure)
|
||||||
|
TSV(query(create_query))
|
||||||
|
|
||||||
|
for i in range(4):
|
||||||
|
result = TSV(query('select * from test.{0}'.format(name)))
|
||||||
|
diff = test_table.compare_by_keys(keys, result.lines, use_parent, add_not_found_rows=False)
|
||||||
|
print test_table.process_diff(diff)
|
||||||
|
assert not diff
|
||||||
|
|
||||||
|
key = []
|
||||||
|
for key_name in keys:
|
||||||
|
if key_name.endswith('str'):
|
||||||
|
key.append("'" + str(i) + "'")
|
||||||
|
else:
|
||||||
|
key.append(str(i))
|
||||||
|
if len(key) == 1:
|
||||||
|
key = 'toUInt64(' + str(i) + ')'
|
||||||
|
else:
|
||||||
|
key = str('(' + ','.join(key) + ')')
|
||||||
|
query("select dictGetUInt8('{0}', 'UInt8_', {1})".format(name, key))
|
||||||
|
|
||||||
|
result = TSV(query('select * from test.{0}'.format(name)))
|
||||||
|
diff = test_table.compare_by_keys(keys, result.lines, use_parent, add_not_found_rows=True)
|
||||||
|
print test_table.process_diff(diff)
|
||||||
|
assert not diff
|
@ -0,0 +1,30 @@
|
|||||||
|
<?xml version="1.0"?>
|
||||||
|
<yandex>
|
||||||
|
<logger>
|
||||||
|
<level>trace</level>
|
||||||
|
<log>/var/log/clickhouse-server/clickhouse-server.log</log>
|
||||||
|
<errorlog>/var/log/clickhouse-server/clickhouse-server.err.log</errorlog>
|
||||||
|
<size>1000M</size>
|
||||||
|
<count>10</count>
|
||||||
|
</logger>
|
||||||
|
|
||||||
|
<tcp_port>9000</tcp_port>
|
||||||
|
<listen_host>127.0.0.1</listen_host>
|
||||||
|
|
||||||
|
<openSSL>
|
||||||
|
<client>
|
||||||
|
<cacheSessions>true</cacheSessions>
|
||||||
|
<verificationMode>none</verificationMode>
|
||||||
|
<invalidCertificateHandler>
|
||||||
|
<name>AcceptCertificateHandler</name>
|
||||||
|
</invalidCertificateHandler>
|
||||||
|
</client>
|
||||||
|
</openSSL>
|
||||||
|
|
||||||
|
<max_concurrent_queries>500</max_concurrent_queries>
|
||||||
|
<mark_cache_size>5368709120</mark_cache_size>
|
||||||
|
<path>./clickhouse/</path>
|
||||||
|
<users_config>users.xml</users_config>
|
||||||
|
|
||||||
|
<dictionaries_config>/etc/clickhouse-server/config.d/*.xml</dictionaries_config>
|
||||||
|
</yandex>
|
@ -1,7 +1,7 @@
|
|||||||
<?xml version="1.0"?>
|
<?xml version="1.0"?>
|
||||||
<yandex>
|
<yandex>
|
||||||
<dictionary>
|
<dictionary>
|
||||||
<name>cmd</name>
|
<name>executable</name>
|
||||||
<source>
|
<source>
|
||||||
<executable>
|
<executable>
|
||||||
<command>echo '7\t8';</command>
|
<command>echo '7\t8';</command>
|
@ -4,7 +4,7 @@
|
|||||||
<name>file</name>
|
<name>file</name>
|
||||||
<source>
|
<source>
|
||||||
<file>
|
<file>
|
||||||
<path>/etc/clickhouse-server/config.d/dictionary_preset_file.txt</path>
|
<path>/etc/clickhouse-server/config.d/file.txt</path>
|
||||||
<format>TabSeparated</format>
|
<format>TabSeparated</format>
|
||||||
</file>
|
</file>
|
||||||
</source>
|
</source>
|
||||||
@ -21,7 +21,7 @@
|
|||||||
<name>no_file</name>
|
<name>no_file</name>
|
||||||
<source>
|
<source>
|
||||||
<file>
|
<file>
|
||||||
<path>/etc/clickhouse-server/config.d/dictionary_preset_no_file.txt</path>
|
<path>/etc/clickhouse-server/config.d/no_file.txt</path>
|
||||||
<format>TabSeparated</format>
|
<format>TabSeparated</format>
|
||||||
</file>
|
</file>
|
||||||
</source>
|
</source>
|
||||||
@ -38,7 +38,7 @@
|
|||||||
<name>no_file_2</name>
|
<name>no_file_2</name>
|
||||||
<source>
|
<source>
|
||||||
<file>
|
<file>
|
||||||
<path>/etc/clickhouse-server/config.d/dictionary_preset_no_file_2.txt</path>
|
<path>/etc/clickhouse-server/config.d/no_file_2.txt</path>
|
||||||
<format>TabSeparated</format>
|
<format>TabSeparated</format>
|
||||||
</file>
|
</file>
|
||||||
</source>
|
</source>
|
@ -1,7 +1,7 @@
|
|||||||
<?xml version="1.0"?>
|
<?xml version="1.0"?>
|
||||||
<yandex>
|
<yandex>
|
||||||
<dictionary>
|
<dictionary>
|
||||||
<name>longload</name>
|
<name>slow</name>
|
||||||
<source>
|
<source>
|
||||||
<executable>
|
<executable>
|
||||||
<command>sleep 100 && echo '5\t6';</command>
|
<command>sleep 100 && echo '5\t6';</command>
|
@ -0,0 +1,23 @@
|
|||||||
|
<?xml version="1.0"?>
|
||||||
|
<yandex>
|
||||||
|
<profiles>
|
||||||
|
<default>
|
||||||
|
</default>
|
||||||
|
</profiles>
|
||||||
|
|
||||||
|
<users>
|
||||||
|
<default>
|
||||||
|
<password></password>
|
||||||
|
<networks incl="networks" replace="replace">
|
||||||
|
<ip>::/0</ip>
|
||||||
|
</networks>
|
||||||
|
<profile>default</profile>
|
||||||
|
<quota>default</quota>
|
||||||
|
</default>
|
||||||
|
</users>
|
||||||
|
|
||||||
|
<quotas>
|
||||||
|
<default>
|
||||||
|
</default>
|
||||||
|
</quotas>
|
||||||
|
</yandex>
|
@ -0,0 +1,246 @@
|
|||||||
|
import pytest
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from helpers.cluster import ClickHouseCluster
|
||||||
|
from helpers.test_tools import assert_eq_with_retry
|
||||||
|
|
||||||
|
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
|
||||||
|
DICTIONARY_FILES = ['configs/dictionaries/cache_xypairs.xml', 'configs/dictionaries/executable.xml', 'configs/dictionaries/file.xml', 'configs/dictionaries/file.txt', 'configs/dictionaries/slow.xml']
|
||||||
|
|
||||||
|
cluster = ClickHouseCluster(__file__, base_configs_dir=os.path.join(SCRIPT_DIR, 'configs'))
|
||||||
|
instance = cluster.add_instance('instance', main_configs=DICTIONARY_FILES)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
|
||||||
|
def started_cluster():
|
||||||
|
try:
|
||||||
|
cluster.start()
|
||||||
|
instance.query("CREATE DATABASE IF NOT EXISTS test")
|
||||||
|
|
||||||
|
yield cluster
|
||||||
|
|
||||||
|
finally:
|
||||||
|
cluster.shutdown()
|
||||||
|
|
||||||
|
|
||||||
|
def get_status(dictionary_name):
|
||||||
|
return instance.query("SELECT status FROM system.dictionaries WHERE name='" + dictionary_name + "'").rstrip("\n")
|
||||||
|
|
||||||
|
|
||||||
|
def get_last_exception(dictionary_name):
|
||||||
|
return instance.query("SELECT last_exception FROM system.dictionaries WHERE name='" + dictionary_name + "'").rstrip("\n").replace("\\'", "'")
|
||||||
|
|
||||||
|
|
||||||
|
def get_loading_start_time(dictionary_name):
|
||||||
|
s = instance.query("SELECT loading_start_time FROM system.dictionaries WHERE name='" + dictionary_name + "'").rstrip("\n")
|
||||||
|
if s == "0000-00-00 00:00:00":
|
||||||
|
return None
|
||||||
|
return time.strptime(s, "%Y-%m-%d %H:%M:%S")
|
||||||
|
|
||||||
|
|
||||||
|
def get_loading_duration(dictionary_name):
|
||||||
|
return float(instance.query("SELECT loading_duration FROM system.dictionaries WHERE name='" + dictionary_name + "'"))
|
||||||
|
|
||||||
|
|
||||||
|
def replace_in_file_in_container(file_name, what, replace_with):
|
||||||
|
instance.exec_in_container('sed -i "s/' + what + '/' + replace_with + '/g" ' + file_name)
|
||||||
|
|
||||||
|
|
||||||
|
def test_reload_while_loading(started_cluster):
|
||||||
|
query = instance.query
|
||||||
|
|
||||||
|
# dictionaries_lazy_load == false, so this dictionary is not loaded.
|
||||||
|
assert get_status('slow') == "NOT_LOADED"
|
||||||
|
assert get_loading_duration('slow') == 0
|
||||||
|
|
||||||
|
# It's not possible to get a value from the dictionary within 1.0 second, so the following query fails by timeout.
|
||||||
|
assert query("SELECT dictGetInt32('slow', 'a', toUInt64(5))", timeout = 1, ignore_error = True) == ""
|
||||||
|
|
||||||
|
# The dictionary is now loading.
|
||||||
|
assert get_status('slow') == "LOADING"
|
||||||
|
start_time, duration = get_loading_start_time('slow'), get_loading_duration('slow')
|
||||||
|
assert duration > 0
|
||||||
|
|
||||||
|
time.sleep(0.5) # Still loading.
|
||||||
|
assert get_status('slow') == "LOADING"
|
||||||
|
prev_start_time, prev_duration = start_time, duration
|
||||||
|
start_time, duration = get_loading_start_time('slow'), get_loading_duration('slow')
|
||||||
|
assert start_time == prev_start_time
|
||||||
|
assert duration >= prev_duration
|
||||||
|
|
||||||
|
# SYSTEM RELOAD DICTIONARY should restart loading.
|
||||||
|
query("SYSTEM RELOAD DICTIONARY 'slow'")
|
||||||
|
assert get_status('slow') == "LOADING"
|
||||||
|
prev_start_time, prev_duration = start_time, duration
|
||||||
|
start_time, duration = get_loading_start_time('slow'), get_loading_duration('slow')
|
||||||
|
assert start_time > prev_start_time
|
||||||
|
assert duration < prev_duration
|
||||||
|
|
||||||
|
time.sleep(0.5) # Still loading.
|
||||||
|
assert get_status('slow') == "LOADING"
|
||||||
|
prev_start_time, prev_duration = start_time, duration
|
||||||
|
start_time, duration = get_loading_start_time('slow'), get_loading_duration('slow')
|
||||||
|
assert start_time == prev_start_time
|
||||||
|
assert duration >= prev_duration
|
||||||
|
|
||||||
|
# SYSTEM RELOAD DICTIONARIES should restart loading again.
|
||||||
|
query("SYSTEM RELOAD DICTIONARIES")
|
||||||
|
assert get_status('slow') == "LOADING"
|
||||||
|
prev_start_time, prev_duration = start_time, duration
|
||||||
|
start_time, duration = get_loading_start_time('slow'), get_loading_duration('slow')
|
||||||
|
assert start_time > prev_start_time
|
||||||
|
assert duration < prev_duration
|
||||||
|
|
||||||
|
# Changing the configuration file should restart loading one more time.
|
||||||
|
replace_in_file_in_container('/etc/clickhouse-server/config.d/slow.xml', 'sleep 100', 'sleep 0')
|
||||||
|
time.sleep(5) # Configuration files are reloaded once in 5 seconds.
|
||||||
|
|
||||||
|
# This time loading should finish quickly.
|
||||||
|
assert get_status('slow') == "LOADED"
|
||||||
|
assert query("SELECT dictGetInt32('slow', 'a', toUInt64(5))") == "6\n"
|
||||||
|
|
||||||
|
|
||||||
|
def test_reload_after_loading(started_cluster):
    """Changed dictionary source data must be picked up by SYSTEM RELOAD
    DICTIONARY, by SYSTEM RELOAD DICTIONARIES, and by the periodic
    configuration check (which runs once in 5 seconds)."""
    q = instance.query

    # Baseline: both dictionaries serve their original values.
    assert q("SELECT dictGetInt32('executable', 'a', toUInt64(7))") == "8\n"
    assert q("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "10\n"

    # Change the dictionaries' data.
    replace_in_file_in_container('/etc/clickhouse-server/config.d/executable.xml', '8', '81')
    replace_in_file_in_container('/etc/clickhouse-server/config.d/file.txt', '10', '101')

    # SYSTEM RELOAD 'name' reloads only the specified dictionary;
    # the other one must still return the stale value.
    q("SYSTEM RELOAD DICTIONARY 'executable'")
    assert q("SELECT dictGetInt32('executable', 'a', toUInt64(7))") == "81\n"
    assert q("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "10\n"

    q("SYSTEM RELOAD DICTIONARY 'file'")
    assert q("SELECT dictGetInt32('executable', 'a', toUInt64(7))") == "81\n"
    assert q("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "101\n"

    # SYSTEM RELOAD DICTIONARIES reloads all loaded dictionaries at once.
    replace_in_file_in_container('/etc/clickhouse-server/config.d/executable.xml', '81', '82')
    replace_in_file_in_container('/etc/clickhouse-server/config.d/file.txt', '101', '102')
    q("SYSTEM RELOAD DICTIONARIES")
    assert q("SELECT dictGetInt32('executable', 'a', toUInt64(7))") == "82\n"
    assert q("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "102\n"

    # Configuration files are reloaded and lifetimes are checked automatically
    # once in 5 seconds, so after sleeping the new values must be visible
    # without any explicit reload.
    replace_in_file_in_container('/etc/clickhouse-server/config.d/executable.xml', '82', '83')
    replace_in_file_in_container('/etc/clickhouse-server/config.d/file.txt', '102', '103')
    time.sleep(5)
    assert q("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "103\n"
    assert q("SELECT dictGetInt32('executable', 'a', toUInt64(7))") == "83\n"
def test_reload_after_fail_by_system_reload(started_cluster):
    """A dictionary whose source is missing must stay FAILED across SYSTEM
    RELOAD attempts, load successfully once the source appears, and stay
    LOADED even after the source is removed again.

    Bug fix: two `query(...) == "10\n"` comparisons discarded their result
    because the `assert` keyword was missing, so those checks never ran.
    """
    query = instance.query

    # dictionaries_lazy_load == false, so this dictionary is not loaded.
    assert get_status("no_file") == "NOT_LOADED"

    # We expect an error because the file source doesn't exist.
    expected_error = "No such file"
    assert expected_error in instance.query_and_get_error("SELECT dictGetInt32('no_file', 'a', toUInt64(9))")
    assert get_status("no_file") == "FAILED"

    # SYSTEM RELOAD should not change anything now, the status is still FAILED.
    query("SYSTEM RELOAD DICTIONARY 'no_file'")
    assert expected_error in instance.query_and_get_error("SELECT dictGetInt32('no_file', 'a', toUInt64(9))")
    assert get_status("no_file") == "FAILED"

    # Creating the file source makes the dictionary able to load.
    instance.copy_file_to_container(os.path.join(SCRIPT_DIR, "configs/dictionaries/file.txt"), "/etc/clickhouse-server/config.d/no_file.txt")
    query("SYSTEM RELOAD DICTIONARY 'no_file'")
    # Was a bare comparison (result discarded) — the `assert` was missing.
    assert query("SELECT dictGetInt32('no_file', 'a', toUInt64(9))") == "10\n"
    assert get_status("no_file") == "LOADED"

    # Removing the file source should not spoil the loaded dictionary.
    instance.exec_in_container("rm /etc/clickhouse-server/config.d/no_file.txt")
    query("SYSTEM RELOAD DICTIONARY 'no_file'")
    # Was a bare comparison (result discarded) — the `assert` was missing.
    assert query("SELECT dictGetInt32('no_file', 'a', toUInt64(9))") == "10\n"
    assert get_status("no_file") == "LOADED"
def test_reload_after_fail_by_timer(started_cluster):
    """Same scenario as test_reload_after_fail_by_system_reload, but driven by
    the periodic lifetime/config check (once in 5 seconds) instead of explicit
    SYSTEM RELOAD statements.

    Bug fix: two `query(...) == "10\n"` comparisons discarded their result
    because the `assert` keyword was missing, so those checks never ran.
    Also removed non-idiomatic trailing semicolons.
    """
    query = instance.query

    # dictionaries_lazy_load == false, so this dictionary is not loaded.
    assert get_status("no_file_2") == "NOT_LOADED"

    # We expect an error because the file source doesn't exist.
    expected_error = "No such file"
    assert expected_error in instance.query_and_get_error("SELECT dictGetInt32('no_file_2', 'a', toUInt64(9))")
    assert get_status("no_file_2") == "FAILED"

    # Passed time should not change anything now, the status is still FAILED.
    time.sleep(6)
    assert expected_error in instance.query_and_get_error("SELECT dictGetInt32('no_file_2', 'a', toUInt64(9))")
    assert get_status("no_file_2") == "FAILED"

    # Creating the file source makes the dictionary able to load.
    instance.copy_file_to_container(os.path.join(SCRIPT_DIR, "configs/dictionaries/file.txt"), "/etc/clickhouse-server/config.d/no_file_2.txt")
    time.sleep(6)
    # Was a bare comparison (result discarded) — the `assert` was missing.
    assert query("SELECT dictGetInt32('no_file_2', 'a', toUInt64(9))") == "10\n"
    assert get_status("no_file_2") == "LOADED"

    # Removing the file source should not spoil the loaded dictionary.
    instance.exec_in_container("rm /etc/clickhouse-server/config.d/no_file_2.txt")
    time.sleep(6)
    # Was a bare comparison (result discarded) — the `assert` was missing.
    assert query("SELECT dictGetInt32('no_file_2', 'a', toUInt64(9))") == "10\n"
    assert get_status("no_file_2") == "LOADED"
def test_reload_after_fail_in_cache_dictionary(started_cluster):
    """A cache dictionary must keep serving cached values while its source
    table is absent, report the source error only for uncached keys, and pick
    up new values once the source table is recreated.

    Bug fix: seven `query(...) == ...` comparisons discarded their result
    because the `assert` keyword was missing, so those checks never ran.
    Moreover, several expected values lacked the trailing newline that
    instance.query() returns (every asserted comparison elsewhere in this file
    uses "...\n"), so the comparisons were False even when behavior was
    correct — fixed to the newline-terminated form. Also removed a trailing
    semicolon.
    """
    query = instance.query
    query_and_get_error = instance.query_and_get_error

    # Can't get a value from the cache dictionary because the source (table `test.xypairs`) doesn't respond.
    expected_error = "Table test.xypairs doesn't exist"
    assert expected_error in query_and_get_error("SELECT dictGetUInt64('cache_xypairs', 'y', toUInt64(1))")
    assert get_status("cache_xypairs") == "LOADED"
    assert expected_error in get_last_exception("cache_xypairs")

    # Create table `test.xypairs`.
    query('''
        DROP TABLE IF EXISTS test.xypairs;
        CREATE TABLE test.xypairs (x UInt64, y UInt64) ENGINE=Log;
        INSERT INTO test.xypairs VALUES (1, 56), (3, 78);
        ''')

    # Cache dictionary now works.
    assert_eq_with_retry(instance, "SELECT dictGet('cache_xypairs', 'y', toUInt64(1))", "56", ignore_error=True)
    # Was a bare comparison with a missing "\n" — `assert` added, value fixed.
    assert query("SELECT dictGet('cache_xypairs', 'y', toUInt64(2))") == "0\n"
    assert get_last_exception("cache_xypairs") == ""

    # Drop table `test.xypairs`.
    query('DROP TABLE test.xypairs')

    # Values are cached so we can get them.
    assert query("SELECT dictGet('cache_xypairs', 'y', toUInt64(1))") == "56\n"
    assert query("SELECT dictGet('cache_xypairs', 'y', toUInt64(2))") == "0\n"
    assert get_last_exception("cache_xypairs") == ""

    # But we can't get a value from the source table which isn't cached.
    assert expected_error in query_and_get_error("SELECT dictGetUInt64('cache_xypairs', 'y', toUInt64(3))")
    assert expected_error in get_last_exception("cache_xypairs")

    # Passed time should not spoil the cache.
    time.sleep(5)
    assert query("SELECT dictGet('cache_xypairs', 'y', toUInt64(1))") == "56\n"
    assert query("SELECT dictGet('cache_xypairs', 'y', toUInt64(2))") == "0\n"
    assert expected_error in query_and_get_error("SELECT dictGetUInt64('cache_xypairs', 'y', toUInt64(3))")
    assert expected_error in get_last_exception("cache_xypairs")

    # Create table `test.xypairs` again with changed values.
    query('''
        CREATE TABLE test.xypairs (x UInt64, y UInt64) ENGINE=Log;
        INSERT INTO test.xypairs VALUES (1, 57), (3, 79);
        ''')

    # The cache dictionary returns new values now.
    assert_eq_with_retry(instance, "SELECT dictGet('cache_xypairs', 'y', toUInt64(1))", "57")
    assert query("SELECT dictGet('cache_xypairs', 'y', toUInt64(2))") == "0\n"
    assert query("SELECT dictGet('cache_xypairs', 'y', toUInt64(3))") == "79\n"
    assert get_last_exception("cache_xypairs") == ""
@ -0,0 +1,4 @@
|
|||||||
|
<?xml version="1.0"?>
|
||||||
|
<yandex>
|
||||||
|
<timezone>America/Los_Angeles</timezone>
|
||||||
|
</yandex>
|
17
dbms/tests/integration/test_timezone_config/test.py
Normal file
17
dbms/tests/integration/test_timezone_config/test.py
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
import pytest
|
||||||
|
|
||||||
|
from helpers.cluster import ClickHouseCluster
|
||||||
|
|
||||||
|
cluster = ClickHouseCluster(__file__)
|
||||||
|
node = cluster.add_instance('node', main_configs=['configs/config.xml'])
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
def start_cluster():
    """Module-scoped fixture: bring the cluster up once for all tests in this
    file and shut it down when the module finishes, even on failure."""
    try:
        cluster.start()
        yield cluster
    finally:
        # Always tear down, regardless of whether cluster.start() or a test raised.
        cluster.shutdown()
def test_check_timezone_config(start_cluster):
    """The <timezone> element from configs/config.xml must affect how the
    server renders DateTime values."""
    rendered = node.query("SELECT toDateTime(1111111111)")
    # assumes the configured timezone is America/Los_Angeles — the epoch second
    # 1111111111 then renders as this local time; TODO confirm against config.xml
    assert rendered == "2005-03-17 17:58:31\n"
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user