Merge branch 'master' of github.com:yandex/ClickHouse

This commit is contained in:
BayoNet 2019-09-03 11:04:51 +03:00
commit 8b55348a45
126 changed files with 3991 additions and 2676 deletions

1
.gitignore vendored
View File

@ -90,7 +90,6 @@ dbms/src/Core/tests/field
dbms/src/Core/tests/rvo_test dbms/src/Core/tests/rvo_test
dbms/src/Core/tests/string_pool dbms/src/Core/tests/string_pool
dbms/src/DataStreams/tests/aggregating_stream dbms/src/DataStreams/tests/aggregating_stream
dbms/src/DataStreams/tests/block_row_transforms
dbms/src/DataStreams/tests/block_tab_separated_streams dbms/src/DataStreams/tests/block_tab_separated_streams
dbms/src/DataStreams/tests/collapsing_sorted_stream dbms/src/DataStreams/tests/collapsing_sorted_stream
dbms/src/DataStreams/tests/expression_stream dbms/src/DataStreams/tests/expression_stream

View File

@ -21,6 +21,7 @@
#include <Common/StringUtils/StringUtils.h> #include <Common/StringUtils/StringUtils.h>
#include <common/phdr_cache.h> #include <common/phdr_cache.h>
#include <ext/scope_guard.h>
/// Universal executable for various clickhouse applications /// Universal executable for various clickhouse applications
@ -130,8 +131,19 @@ bool isClickhouseApp(const std::string & app_suffix, std::vector<char *> & argv)
} }
/// This makes it possible to implement an assert that forbids initialization of a class in static constructors.
/// Usage:
///
/// extern bool inside_main;
/// class C { C() { assert(inside_main); } };
bool inside_main = false;
int main(int argc_, char ** argv_) int main(int argc_, char ** argv_)
{ {
inside_main = true;
SCOPE_EXIT({ inside_main = false; });
/// Reset new handler to default (that throws std::bad_alloc) /// Reset new handler to default (that throws std::bad_alloc)
/// It is needed because LLVM library clobbers it. /// It is needed because LLVM library clobbers it.
std::set_new_handler(nullptr); std::set_new_handler(nullptr);

View File

@ -447,6 +447,8 @@ namespace ErrorCodes
extern const int QUERY_IS_NOT_SUPPORTED_IN_LIVE_VIEW = 470; extern const int QUERY_IS_NOT_SUPPORTED_IN_LIVE_VIEW = 470;
extern const int SETTINGS_ARE_NOT_SUPPORTED = 471; extern const int SETTINGS_ARE_NOT_SUPPORTED = 471;
extern const int READONLY_SETTING = 472; extern const int READONLY_SETTING = 472;
extern const int DEADLOCK_AVOIDED = 473;
extern const int INVALID_TEMPLATE_FORMAT = 474;
extern const int KEEPER_EXCEPTION = 999; extern const int KEEPER_EXCEPTION = 999;
extern const int POCO_EXCEPTION = 1000; extern const int POCO_EXCEPTION = 1000;

View File

@ -4,6 +4,8 @@
#include <Common/CurrentMetrics.h> #include <Common/CurrentMetrics.h>
#include <Common/ProfileEvents.h> #include <Common/ProfileEvents.h>
#include <cassert>
namespace ProfileEvents namespace ProfileEvents
{ {
@ -29,6 +31,7 @@ namespace DB
namespace ErrorCodes namespace ErrorCodes
{ {
extern const int LOGICAL_ERROR; extern const int LOGICAL_ERROR;
extern const int DEADLOCK_AVOIDED;
} }
@ -37,7 +40,6 @@ class RWLockImpl::LockHolderImpl
RWLock parent; RWLock parent;
GroupsContainer::iterator it_group; GroupsContainer::iterator it_group;
ClientsContainer::iterator it_client; ClientsContainer::iterator it_client;
ThreadToHolder::key_type thread_id;
QueryIdToHolder::key_type query_id; QueryIdToHolder::key_type query_id;
CurrentMetrics::Increment active_client_increment; CurrentMetrics::Increment active_client_increment;
@ -53,6 +55,44 @@ public:
}; };
namespace
{
/// Global information about all read locks that a query has. It is needed to avoid some types of deadlocks.
/// Every method takes the internal mutex, so the counters are only read or modified under it.
class QueryLockInfo
{
private:
    std::mutex mutex;
    /// query_id -> number of read locks registered via add(). Entries that drop to zero are erased.
    std::map<std::string, size_t> queries;

public:
    /// Registers one more read lock for the query.
    void add(const String & query_id)
    {
        std::lock_guard lock(mutex);
        ++queries[query_id];
    }

    /// Unregisters one read lock of the query. Each call is expected to be paired with an earlier add().
    void remove(const String & query_id)
    {
        std::lock_guard lock(mutex);
        auto it = queries.find(query_id);
        assert(it != queries.end());

        /// assert() expands to nothing when NDEBUG is defined, so guard explicitly:
        /// dereferencing end() in a release build would be undefined behaviour.
        if (it == queries.end())
            return;

        if (--it->second == 0)
            queries.erase(it);
    }

    /// Throws DEADLOCK_AVOIDED if the query currently has at least one registered read lock.
    void check(const String & query_id)
    {
        std::lock_guard lock(mutex);
        if (queries.count(query_id))
            throw Exception("Possible deadlock avoided. Client should retry.", ErrorCodes::DEADLOCK_AVOIDED);
    }
};
QueryLockInfo all_read_locks;
}
RWLockImpl::LockHolder RWLockImpl::getLock(RWLockImpl::Type type, const String & query_id) RWLockImpl::LockHolder RWLockImpl::getLock(RWLockImpl::Type type, const String & query_id)
{ {
Stopwatch watch(CLOCK_MONOTONIC_COARSE); Stopwatch watch(CLOCK_MONOTONIC_COARSE);
@ -69,34 +109,48 @@ RWLockImpl::LockHolder RWLockImpl::getLock(RWLockImpl::Type type, const String &
GroupsContainer::iterator it_group; GroupsContainer::iterator it_group;
ClientsContainer::iterator it_client; ClientsContainer::iterator it_client;
/// This object is placed above unique_lock, because it may lock in destructor.
LockHolder res;
std::unique_lock lock(mutex); std::unique_lock lock(mutex);
/// Check if the same query is acquiring previously acquired lock /// Check if the same query is acquiring previously acquired lock
LockHolder existing_holder_ptr;
auto this_thread_id = std::this_thread::get_id();
auto it_thread = thread_to_holder.find(this_thread_id);
auto it_query = query_id_to_holder.end();
if (query_id != RWLockImpl::NO_QUERY) if (query_id != RWLockImpl::NO_QUERY)
it_query = query_id_to_holder.find(query_id); {
auto it_query = query_id_to_holder.find(query_id);
if (it_query != query_id_to_holder.end())
res = it_query->second.lock();
}
if (it_thread != thread_to_holder.end()) if (res)
existing_holder_ptr = it_thread->second.lock();
else if (it_query != query_id_to_holder.end())
existing_holder_ptr = it_query->second.lock();
if (existing_holder_ptr)
{ {
/// XXX: it means we can't upgrade lock from read to write - with proper waiting! /// XXX: it means we can't upgrade lock from read to write - with proper waiting!
if (type != Read || existing_holder_ptr->it_group->type != Read) if (type != Read || res->it_group->type != Read)
throw Exception("Attempt to acquire exclusive lock recursively", ErrorCodes::LOGICAL_ERROR); throw Exception("Attempt to acquire exclusive lock recursively", ErrorCodes::LOGICAL_ERROR);
else
return existing_holder_ptr; return res;
} }
/** If the query already has any active read lock and tries to acquire another read lock
* but it is not in front of the queue and has to wait, deadlock is possible:
*
* Example (four queries, two RWLocks - 'a' and 'b'):
*
* --> time -->
*
* q1: ra rb
* q2: wa
* q3: rb ra
* q4: wb
*
* We will throw an exception instead.
*/
if (type == Type::Write || queue.empty() || queue.back().type == Type::Write) if (type == Type::Write || queue.empty() || queue.back().type == Type::Write)
{ {
if (type == Type::Read && !queue.empty() && queue.back().type == Type::Write && query_id != RWLockImpl::NO_QUERY)
all_read_locks.check(query_id);
/// Create new group of clients /// Create new group of clients
it_group = queue.emplace(queue.end(), type); it_group = queue.emplace(queue.end(), type);
} }
@ -104,6 +158,9 @@ RWLockImpl::LockHolder RWLockImpl::getLock(RWLockImpl::Type type, const String &
{ {
/// Will append myself to last group /// Will append myself to last group
it_group = std::prev(queue.end()); it_group = std::prev(queue.end());
if (it_group != queue.begin() && query_id != RWLockImpl::NO_QUERY)
all_read_locks.check(query_id);
} }
/// Append myself to the end of chosen group /// Append myself to the end of chosen group
@ -120,17 +177,19 @@ RWLockImpl::LockHolder RWLockImpl::getLock(RWLockImpl::Type type, const String &
throw; throw;
} }
LockHolder res(new LockHolderImpl(shared_from_this(), it_group, it_client)); res.reset(new LockHolderImpl(shared_from_this(), it_group, it_client));
/// Wait a notification until we will be the only in the group. /// Wait a notification until we will be the only in the group.
it_group->cv.wait(lock, [&] () { return it_group == queue.begin(); }); it_group->cv.wait(lock, [&] () { return it_group == queue.begin(); });
/// Insert myself (weak_ptr to the holder) to threads set to implement recursive lock /// Insert myself (weak_ptr to the holder) to queries set to implement recursive lock
thread_to_holder.emplace(this_thread_id, res);
res->thread_id = this_thread_id;
if (query_id != RWLockImpl::NO_QUERY) if (query_id != RWLockImpl::NO_QUERY)
{
query_id_to_holder.emplace(query_id, res); query_id_to_holder.emplace(query_id, res);
if (type == Type::Read)
all_read_locks.add(query_id);
}
res->query_id = query_id; res->query_id = query_id;
finalize_metrics(); finalize_metrics();
@ -140,12 +199,14 @@ RWLockImpl::LockHolder RWLockImpl::getLock(RWLockImpl::Type type, const String &
RWLockImpl::LockHolderImpl::~LockHolderImpl() RWLockImpl::LockHolderImpl::~LockHolderImpl()
{ {
std::unique_lock lock(parent->mutex); std::lock_guard lock(parent->mutex);
/// Remove weak_ptrs to the holder, since there are no owners of the current lock /// Remove weak_ptrs to the holder, since there are no owners of the current lock
parent->thread_to_holder.erase(thread_id);
parent->query_id_to_holder.erase(query_id); parent->query_id_to_holder.erase(query_id);
if (*it_client == RWLockImpl::Read && query_id != RWLockImpl::NO_QUERY)
all_read_locks.remove(query_id);
/// Removes myself from client list of our group /// Removes myself from client list of our group
it_group->clients.erase(it_client); it_group->clients.erase(it_client);
@ -166,6 +227,7 @@ RWLockImpl::LockHolderImpl::LockHolderImpl(RWLock && parent_, RWLockImpl::Groups
: parent{std::move(parent_)}, it_group{it_group_}, it_client{it_client_}, : parent{std::move(parent_)}, it_group{it_group_}, it_client{it_client_},
active_client_increment{(*it_client == RWLockImpl::Read) ? CurrentMetrics::RWLockActiveReaders active_client_increment{(*it_client == RWLockImpl::Read) ? CurrentMetrics::RWLockActiveReaders
: CurrentMetrics::RWLockActiveWriters} : CurrentMetrics::RWLockActiveWriters}
{} {
}
} }

View File

@ -6,7 +6,6 @@
#include <vector> #include <vector>
#include <mutex> #include <mutex>
#include <condition_variable> #include <condition_variable>
#include <thread>
#include <map> #include <map>
#include <string> #include <string>
@ -19,7 +18,7 @@ using RWLock = std::shared_ptr<RWLockImpl>;
/// Implements shared lock with FIFO service /// Implements shared lock with FIFO service
/// Can be acquired recursively (several calls for the same query or the same OS thread) in Read mode /// Can be acquired recursively (several calls for the same query) in Read mode
/// ///
/// NOTE: it is important to allow acquiring the same lock in Read mode without waiting if it is already /// NOTE: it is important to allow acquiring the same lock in Read mode without waiting if it is already
/// acquired by another thread of the same query. Otherwise the following deadlock is possible: /// acquired by another thread of the same query. Otherwise the following deadlock is possible:
@ -55,7 +54,6 @@ private:
struct Group; struct Group;
using GroupsContainer = std::list<Group>; using GroupsContainer = std::list<Group>;
using ClientsContainer = std::list<Type>; using ClientsContainer = std::list<Type>;
using ThreadToHolder = std::map<std::thread::id, std::weak_ptr<LockHolderImpl>>;
using QueryIdToHolder = std::map<String, std::weak_ptr<LockHolderImpl>>; using QueryIdToHolder = std::map<String, std::weak_ptr<LockHolderImpl>>;
/// Group of clients that should be executed concurrently /// Group of clients that should be executed concurrently
@ -73,7 +71,6 @@ private:
mutable std::mutex mutex; mutable std::mutex mutex;
GroupsContainer queue; GroupsContainer queue;
ThreadToHolder thread_to_holder;
QueryIdToHolder query_id_to_holder; QueryIdToHolder query_id_to_holder;
}; };

View File

@ -13,6 +13,14 @@
using namespace DB; using namespace DB;
namespace DB
{
namespace ErrorCodes
{
extern const int DEADLOCK_AVOIDED;
}
}
TEST(Common, RWLock_1) TEST(Common, RWLock_1)
{ {
@ -94,7 +102,7 @@ TEST(Common, RWLock_Recursive)
{ {
for (int i = 0; i < 2 * cycles; ++i) for (int i = 0; i < 2 * cycles; ++i)
{ {
auto lock = fifo_lock->getLock(RWLockImpl::Write, RWLockImpl::NO_QUERY); auto lock = fifo_lock->getLock(RWLockImpl::Write, "q1");
auto sleep_for = std::chrono::duration<int, std::micro>(std::uniform_int_distribution<>(1, 100)(gen)); auto sleep_for = std::chrono::duration<int, std::micro>(std::uniform_int_distribution<>(1, 100)(gen));
std::this_thread::sleep_for(sleep_for); std::this_thread::sleep_for(sleep_for);
@ -105,17 +113,17 @@ TEST(Common, RWLock_Recursive)
{ {
for (int i = 0; i < cycles; ++i) for (int i = 0; i < cycles; ++i)
{ {
auto lock1 = fifo_lock->getLock(RWLockImpl::Read, RWLockImpl::NO_QUERY); auto lock1 = fifo_lock->getLock(RWLockImpl::Read, "q2");
auto sleep_for = std::chrono::duration<int, std::micro>(std::uniform_int_distribution<>(1, 100)(gen)); auto sleep_for = std::chrono::duration<int, std::micro>(std::uniform_int_distribution<>(1, 100)(gen));
std::this_thread::sleep_for(sleep_for); std::this_thread::sleep_for(sleep_for);
auto lock2 = fifo_lock->getLock(RWLockImpl::Read, RWLockImpl::NO_QUERY); auto lock2 = fifo_lock->getLock(RWLockImpl::Read, "q2");
EXPECT_ANY_THROW({fifo_lock->getLock(RWLockImpl::Write, RWLockImpl::NO_QUERY);}); EXPECT_ANY_THROW({fifo_lock->getLock(RWLockImpl::Write, "q2");});
} }
fifo_lock->getLock(RWLockImpl::Write, RWLockImpl::NO_QUERY); fifo_lock->getLock(RWLockImpl::Write, "q2");
}); });
t1.join(); t1.join();
@ -123,6 +131,74 @@ TEST(Common, RWLock_Recursive)
} }
/// Runs four threads that request two RWLocks in the order shown in the diagram below.
/// Requests are staggered with 100 ms sleeps. A read request that fails with DEADLOCK_AVOIDED
/// is accepted; any other exception is rethrown. The test passes if all four threads finish.
TEST(Common, RWLock_Deadlock)
{
static auto lock1 = RWLockImpl::create();
static auto lock2 = RWLockImpl::create();
/**
* q1: r1 r2
* q2: w1
* q3: r2 r1
* q4: w2
*/
/// q1: takes lock1 for reading immediately, then requests lock2 for reading after ~300 ms.
std::thread t1([&] ()
{
auto holder1 = lock1->getLock(RWLockImpl::Read, "q1");
usleep(100000);
usleep(100000);
usleep(100000);
try
{
auto holder2 = lock2->getLock(RWLockImpl::Read, "q1");
}
catch (const Exception & e)
{
/// Only the "deadlock avoided" outcome is tolerated here.
if (e.code() != ErrorCodes::DEADLOCK_AVOIDED)
throw;
}
});
/// q2: requests lock1 for writing after ~100 ms.
std::thread t2([&] ()
{
usleep(100000);
auto holder1 = lock1->getLock(RWLockImpl::Write, "q2");
});
/// q3: requests lock2 for reading after ~200 ms, then lock1 for reading ~200 ms later.
std::thread t3([&] ()
{
usleep(100000);
usleep(100000);
auto holder2 = lock2->getLock(RWLockImpl::Read, "q3");
usleep(100000);
usleep(100000);
try
{
auto holder1 = lock1->getLock(RWLockImpl::Read, "q3");
}
catch (const Exception & e)
{
/// Only the "deadlock avoided" outcome is tolerated here.
if (e.code() != ErrorCodes::DEADLOCK_AVOIDED)
throw;
}
});
/// q4: requests lock2 for writing after ~300 ms.
std::thread t4([&] ()
{
usleep(100000);
usleep(100000);
usleep(100000);
auto holder2 = lock2->getLock(RWLockImpl::Write, "q4");
});
/// If a real deadlock occurred, these joins would never return.
t1.join();
t2.join();
t3.join();
t4.join();
}
TEST(Common, RWLock_PerfTest_Readers) TEST(Common, RWLock_PerfTest_Readers)
{ {
constexpr int cycles = 100000; // 100k constexpr int cycles = 100000; // 100k

View File

@ -216,6 +216,8 @@ struct Settings : public SettingsCollection<Settings>
M(SettingMilliseconds, stream_flush_interval_ms, 7500, "Timeout for flushing data from streaming storages.") \ M(SettingMilliseconds, stream_flush_interval_ms, 7500, "Timeout for flushing data from streaming storages.") \
M(SettingMilliseconds, stream_poll_timeout_ms, 500, "Timeout for polling data from/to streaming storages.") \ M(SettingMilliseconds, stream_poll_timeout_ms, 500, "Timeout for polling data from/to streaming storages.") \
M(SettingString, format_schema, "", "Schema identifier (used by schema-based formats)") \ M(SettingString, format_schema, "", "Schema identifier (used by schema-based formats)") \
M(SettingString, format_schema_rows, "", "Row format string for Template format") \
M(SettingString, format_schema_rows_between_delimiter, "\n", "Delimiter between rows for Template format") \
M(SettingBool, insert_allow_materialized_columns, 0, "If setting is enabled, Allow materialized columns in INSERT.") \ M(SettingBool, insert_allow_materialized_columns, 0, "If setting is enabled, Allow materialized columns in INSERT.") \
M(SettingSeconds, http_connection_timeout, DEFAULT_HTTP_READ_BUFFER_CONNECTION_TIMEOUT, "HTTP connection timeout.") \ M(SettingSeconds, http_connection_timeout, DEFAULT_HTTP_READ_BUFFER_CONNECTION_TIMEOUT, "HTTP connection timeout.") \
M(SettingSeconds, http_send_timeout, DEFAULT_HTTP_READ_BUFFER_TIMEOUT, "HTTP send timeout") \ M(SettingSeconds, http_send_timeout, DEFAULT_HTTP_READ_BUFFER_TIMEOUT, "HTTP send timeout") \

View File

@ -1,178 +0,0 @@
#include <Common/Exception.h>
#include <IO/WriteHelpers.h>
#include <Formats/BlockInputStreamFromRowInputStream.h>
#include <common/logger_useful.h>
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_PARSE_INPUT_ASSERTION_FAILED;
extern const int CANNOT_PARSE_QUOTED_STRING;
extern const int CANNOT_PARSE_DATE;
extern const int CANNOT_PARSE_DATETIME;
extern const int CANNOT_READ_ARRAY_FROM_TEXT;
extern const int CANNOT_PARSE_NUMBER;
extern const int CANNOT_PARSE_UUID;
extern const int TOO_LARGE_STRING_SIZE;
extern const int CANNOT_READ_ALL_DATA;
extern const int INCORRECT_DATA;
extern const int INCORRECT_NUMBER_OF_COLUMNS;
}
/// Stores the row source, the sample block, the block/portion size limits and the per-row callback,
/// and copies the error tolerance limits (input_allow_errors_num / input_allow_errors_ratio) from the format settings.
BlockInputStreamFromRowInputStream::BlockInputStreamFromRowInputStream(
const RowInputStreamPtr & row_input_,
const Block & sample_,
UInt64 max_block_size_,
UInt64 rows_portion_size_,
FormatFactory::ReadCallback callback,
const FormatSettings & settings)
: row_input(row_input_)
, sample(sample_)
, max_block_size(max_block_size_)
, rows_portion_size(rows_portion_size_)
, read_virtual_columns_callback(callback)
, allow_errors_num(settings.input_allow_errors_num)
, allow_errors_ratio(settings.input_allow_errors_ratio)
{
}
/// Returns true if the error code belongs to the set of errors that are treated as
/// failures to parse the input data (as opposed to any other kind of failure).
static bool isParseError(int code)
{
    const int parse_error_codes[] =
    {
        ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED,
        ErrorCodes::CANNOT_PARSE_QUOTED_STRING,
        ErrorCodes::CANNOT_PARSE_DATE,
        ErrorCodes::CANNOT_PARSE_DATETIME,
        ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT,
        ErrorCodes::CANNOT_PARSE_NUMBER,
        ErrorCodes::CANNOT_PARSE_UUID,
        ErrorCodes::TOO_LARGE_STRING_SIZE,
        ErrorCodes::CANNOT_READ_ALL_DATA,
        ErrorCodes::INCORRECT_DATA,
    };

    for (const int candidate : parse_error_codes)
    {
        if (candidate == code)
            return true;
    }

    return false;
}
/// Reads up to max_block_size rows from the row input and returns them as one block.
/// Returns an empty block when no rows were accumulated.
/// Parse errors may be skipped within the configured limits (allow_errors_num / allow_errors_ratio);
/// a parse error that is not skipped is rethrown with the row number and diagnostic info appended.
Block BlockInputStreamFromRowInputStream::readImpl()
{
size_t num_columns = sample.columns();
MutableColumns columns = sample.cloneEmptyColumns();
block_missing_values.clear();
try
{
/// `batch` counts rows since the last limits check; `rows` counts loop iterations for this block.
for (size_t rows = 0, batch = 0; rows < max_block_size; ++rows, ++batch)
{
/// Every rows_portion_size rows (if non-zero), stop early when the time limit is hit or the stream is cancelled.
if (rows_portion_size && batch == rows_portion_size)
{
batch = 0;
if (!checkTimeLimit() || isCancelled())
break;
}
try
{
/// Incremented before the read attempt, so the attempt that finds no more rows is counted too.
++total_rows;
RowReadExtension info_;
if (!row_input->read(columns, info_))
break;
if (read_virtual_columns_callback)
read_virtual_columns_callback();
/// Mark, for the row just appended, every column that the input reported as not read.
for (size_t column_idx = 0; column_idx < info_.read_columns.size(); ++column_idx)
{
if (!info_.read_columns[column_idx])
{
size_t column_size = columns[column_idx]->size();
if (column_size == 0)
throw Exception("Unexpected empty column", ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS);
block_missing_values.setBit(column_idx, column_size - 1);
}
}
}
catch (Exception & e)
{
/// Logic for possible skipping of errors.
if (!isParseError(e.code()))
throw;
/// Error skipping is disabled when both limits are zero.
if (allow_errors_num == 0 && allow_errors_ratio == 0)
throw;
++num_errors;
Float32 current_error_ratio = static_cast<Float32>(num_errors) / total_rows;
/// Give up only when both the absolute and the relative limit are exceeded.
if (num_errors > allow_errors_num
&& current_error_ratio > allow_errors_ratio)
{
e.addMessage("(Already have " + toString(num_errors) + " errors"
" out of " + toString(total_rows) + " rows"
", which is " + toString(current_error_ratio) + " of all rows)");
throw;
}
if (!row_input->allowSyncAfterError())
{
e.addMessage("(Input format doesn't allow to skip errors)");
throw;
}
row_input->syncAfterError();
/// Truncate all columns in block to minimal size (remove values, that was appended to only part of columns).
size_t min_size = std::numeric_limits<size_t>::max();
for (size_t column_idx = 0; column_idx < num_columns; ++column_idx)
min_size = std::min(min_size, columns[column_idx]->size());
for (size_t column_idx = 0; column_idx < num_columns; ++column_idx)
{
auto & column = columns[column_idx];
if (column->size() > min_size)
column->popBack(column->size() - min_size);
}
}
}
}
catch (Exception & e)
{
/// Only parse errors get the row number and verbose diagnostics attached.
if (!isParseError(e.code()))
throw;
String verbose_diagnostic;
try
{
verbose_diagnostic = row_input->getDiagnosticInfo();
}
catch (...)
{
/// Error while trying to obtain verbose diagnostic. Ok to ignore.
}
e.addMessage("(at row " + toString(total_rows) + ")\n" + verbose_diagnostic);
throw;
}
/// No columns at all, or nothing was read into the first column: signal that with an empty block.
if (columns.empty() || columns[0]->empty())
return {};
return sample.cloneWithColumns(std::move(columns));
}
/// Logs how many rows were skipped because of errors (only when error skipping is enabled),
/// then forwards readSuffix() to the underlying row input.
void BlockInputStreamFromRowInputStream::readSuffix()
{
    const bool error_skipping_enabled = allow_errors_num > 0 || allow_errors_ratio > 0;

    if (error_skipping_enabled)
    {
        Logger * log = &Logger::get("BlockInputStreamFromRowInputStream");
        LOG_TRACE(log, "Skipped " << num_errors << " rows with errors while reading the input stream");
    }

    row_input->readSuffix();
}
}

View File

@ -1,62 +0,0 @@
#pragma once
#include <Core/Defines.h>
#include <DataStreams/IBlockInputStream.h>
#include <Formats/FormatFactory.h>
#include <Formats/FormatSettings.h>
#include <Formats/IRowInputStream.h>
namespace DB
{
/** Makes block-oriented stream on top of row-oriented stream.
* It is used to read data from text formats.
*
* Also controls over parsing errors and prints diagnostic information about them.
*/
class BlockInputStreamFromRowInputStream : public IBlockInputStream
{
public:
/// |sample| is a block with zero rows, that structure describes how to interpret values
/// |rows_portion_size| is a number of rows to read before break and check limits
/// |callback| is stored as the callback invoked after reading each row
/// |settings| supplies the error tolerance limits (input_allow_errors_num / input_allow_errors_ratio)
BlockInputStreamFromRowInputStream(
const RowInputStreamPtr & row_input_,
const Block & sample_,
UInt64 max_block_size_,
UInt64 rows_portion_size_,
FormatFactory::ReadCallback callback,
const FormatSettings & settings);
/// Forwards to the underlying row input.
void readPrefix() override { row_input->readPrefix(); }
void readSuffix() override;
String getName() const override { return "BlockInputStreamFromRowInputStream"; }
/// Gives access to the wrapped row input stream.
RowInputStreamPtr & getRowInput() { return row_input; }
/// The header is the sample block passed to the constructor.
Block getHeader() const override { return sample; }
/// Missing-value marks collected while reading the most recent block.
const BlockMissingValues & getMissingValues() const override { return block_missing_values; }
protected:
Block readImpl() override;
private:
RowInputStreamPtr row_input;
Block sample;
/// Upper bound on rows read per readImpl() call.
UInt64 max_block_size;
UInt64 rows_portion_size;
/// Callback used to setup virtual columns after reading each row.
FormatFactory::ReadCallback read_virtual_columns_callback;
BlockMissingValues block_missing_values;
/// Limits for skipping rows with parse errors; skipping is disabled when both are zero.
UInt64 allow_errors_num;
Float32 allow_errors_ratio;
/// Running counters over the lifetime of the stream (not reset between blocks).
size_t total_rows = 0;
size_t num_errors = 0;
};
}

View File

@ -1,573 +0,0 @@
#include <Core/Defines.h>
#include <IO/ConcatReadBuffer.h>
#include <IO/ReadHelpers.h>
#include <IO/Operators.h>
#include <Formats/verbosePrintString.h>
#include <Formats/CSVRowInputStream.h>
#include <Formats/FormatFactory.h>
#include <Formats/BlockInputStreamFromRowInputStream.h>
#include <DataTypes/DataTypeNullable.h>
namespace DB
{
namespace ErrorCodes
{
extern const int INCORRECT_DATA;
extern const int LOGICAL_ERROR;
}
/// Consumes one line terminator at the current position.
/// Accepted forms: \n (Unix), \r\n (DOS/Windows) and \n\r (Mac OS Classic).
/// Throws INCORRECT_DATA for a lone \r, or for any other character when the stream is not at its end.
static inline void skipEndOfLine(ReadBuffer & istr)
{
    const char first_char = *istr.position();

    if (first_char == '\n')
    {
        ++istr.position();
        /// Optional trailing CR of the "\n\r" form.
        if (!istr.eof() && *istr.position() == '\r')
            ++istr.position();
        return;
    }

    if (first_char == '\r')
    {
        ++istr.position();
        if (istr.eof() || *istr.position() != '\n')
            throw Exception("Cannot parse CSV format: found \\r (CR) not followed by \\n (LF)."
                " Line must end by \\n (LF) or \\r\\n (CR LF) or \\n\\r.", ErrorCodes::INCORRECT_DATA);
        ++istr.position();
        return;
    }

    if (!istr.eof())
        throw Exception("Expected end of line", ErrorCodes::INCORRECT_DATA);
}
/// Consumes what separates the current field from the next one.
/// Between columns this must be exactly the delimiter character.
/// After the last column it is an optional extra delimiter followed by end of line or end of stream.
static inline void skipDelimiter(ReadBuffer & istr, const char delimiter, bool is_last_column)
{
    if (!is_last_column)
    {
        assertChar(delimiter, istr);
        return;
    }

    if (istr.eof())
        return;

    /// we support the extra delimiter at the end of the line
    if (*istr.position() == delimiter)
    {
        ++istr.position();
        if (istr.eof())
            return;
    }

    skipEndOfLine(istr);
}
/// Advances the buffer past any run of spaces and tabs (the `whitespace` symbols allowed in CSV).
static inline void skipWhitespacesAndTabs(ReadBuffer & buf)
{
    while (!buf.eof())
    {
        const char current = *buf.position();
        if (current != ' ' && current != '\t')
            break;
        ++buf.position();
    }
}
/// Reads and discards one CSV row consisting of num_columns fields,
/// including the delimiters between them and the line end after the last one.
static void skipRow(ReadBuffer & istr, const FormatSettings::CSV & settings, size_t num_columns)
{
    String discarded;
    size_t remaining = num_columns;

    while (remaining > 0)
    {
        --remaining;

        skipWhitespacesAndTabs(istr);
        readCSVString(discarded, istr, settings);
        skipWhitespacesAndTabs(istr);

        /// The field is the last one exactly when nothing remains after it.
        skipDelimiter(istr, settings.delimiter, remaining == 0);
    }
}
/// Remembers the input buffer, header and settings, and precomputes per-column lookup data:
/// the data type of each header column, a name -> position index, and (optionally) helper nullable columns.
CSVRowInputStream::CSVRowInputStream(ReadBuffer & istr_, const Block & header_, bool with_names_, const FormatSettings & format_settings_)
: istr(istr_), header(header_), with_names(with_names_), format_settings(format_settings_)
{
const auto num_columns = header.columns();
data_types.resize(num_columns);
column_indexes_by_names.reserve(num_columns);
column_idx_to_nullable_column_idx.resize(num_columns);
for (size_t i = 0; i < num_columns; ++i)
{
const auto & column_info = header.getByPosition(i);
data_types[i] = column_info.type;
column_indexes_by_names.emplace(column_info.name, i);
/// If input_format_null_as_default=1 we need ColumnNullable of type DataTypeNullable(nested_type)
/// to parse value as nullable before inserting it in corresponding column of not-nullable type.
/// Constructing temporary column for each row is slow, so we prepare it here
if (format_settings.csv.null_as_default && !column_info.type->isNullable() && column_info.type->canBeInsideNullable())
{
/// The index must be recorded before the emplace_back calls below, while size() still
/// equals the position the new helper type/column is about to take.
column_idx_to_nullable_column_idx[i] = nullable_columns.size();
nullable_types.emplace_back(std::make_shared<DataTypeNullable>(column_info.type));
nullable_columns.emplace_back(nullable_types.back()->createColumn());
}
}
}
/// Maps the next column of the input file to a table column, looked up by name.
/// An unknown name is either recorded as "skip this field" (when skip_unknown_fields is set)
/// or reported with INCORRECT_DATA. A name that occurs twice is always an error.
void CSVRowInputStream::addInputColumn(const String & column_name)
{
    const auto found = column_indexes_by_names.find(column_name);

    if (found != column_indexes_by_names.end())
    {
        const auto column_index = found->second;

        if (read_columns[column_index])
            throw Exception("Duplicate field found while parsing CSV header: " + column_name, ErrorCodes::INCORRECT_DATA);

        read_columns[column_index] = true;
        column_indexes_for_input_fields.emplace_back(column_index);
        return;
    }

    if (!format_settings.skip_unknown_fields)
    {
        throw Exception(
            "Unknown field found in CSV header: '" + column_name + "' " +
            "at position " + std::to_string(column_indexes_for_input_fields.size()) +
            "\nSet the 'input_format_skip_unknown_fields' parameter explicitly to ignore and proceed",
            ErrorCodes::INCORRECT_DATA
        );
    }

    /// No table column for this file column: remember to skip it while reading rows.
    column_indexes_for_input_fields.push_back(std::nullopt);
}
/// Consumes everything that precedes the data rows: an optional BOM and, if the format has one,
/// the header row. Also builds the mapping from file columns to table columns.
void CSVRowInputStream::readPrefix()
{
/// In this format, we assume, that if first string field contain BOM as value, it will be written in quotes,
/// so BOM at beginning of stream cannot be confused with BOM in first string value, and it is safe to skip it.
skipBOMIfExists(istr);
if (with_names)
{
/// This CSV file has a header row with column names. Depending on the
/// settings, use it or skip it.
if (format_settings.with_names_use_header)
{
/// Look at the file header to see which columns we have there.
/// The missing columns are filled with defaults.
read_columns.assign(header.columns(), false);
/// Read names one by one for as long as each is followed by the delimiter.
do
{
String column_name;
skipWhitespacesAndTabs(istr);
readCSVString(column_name, istr, format_settings.csv);
skipWhitespacesAndTabs(istr);
addInputColumn(column_name);
}
while (checkChar(format_settings.csv.delimiter, istr));
/// Consume the end of the header line (treated as "after the last column").
skipDelimiter(istr, format_settings.csv.delimiter, true);
/// Remember whether any table column is absent from the file header.
for (size_t column = 0; column < read_columns.size(); column++)
{
if (!read_columns[column])
{
have_always_default_columns = true;
break;
}
}
return;
}
else
{
/// The header row is present but not used: skip it.
skipRow(istr, format_settings.csv, header.columns());
}
}
/// The default: map each column of the file to the column of the table with
/// the same index.
read_columns.assign(header.columns(), true);
column_indexes_for_input_fields.resize(header.columns());
for (size_t i = 0; i < column_indexes_for_input_fields.size(); ++i)
{
column_indexes_for_input_fields[i] = i;
}
}
/** If you change this function, don't forget to change its counterpart
* with extended error reporting: parseRowAndPrintDiagnosticInfo().
*
* Reads one row from the input and appends one value to every column.
* Returns false if the input is already at its end, true otherwise.
* ext.read_columns is filled only when at least one column got a placeholder default.
*/
bool CSVRowInputStream::read(MutableColumns & columns, RowReadExtension & ext)
{
if (istr.eof())
return false;
updateDiagnosticInfo();
/// Track whether we have to fill any columns in this row with default
/// values. If not, we return an empty column mask to the caller, so that
/// it doesn't have to check it.
bool have_default_columns = have_always_default_columns;
const auto delimiter = format_settings.csv.delimiter;
for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column)
{
/// Empty optional means this file column has no matching table column.
const auto & table_column = column_indexes_for_input_fields[file_column];
const bool is_last_file_column = file_column + 1 == column_indexes_for_input_fields.size();
if (table_column)
{
skipWhitespacesAndTabs(istr);
/// readField() reports whether a value was actually read for this column.
read_columns[*table_column] = readField(*columns[*table_column], data_types[*table_column],
is_last_file_column, *table_column);
if (!read_columns[*table_column])
have_default_columns = true;
skipWhitespacesAndTabs(istr);
}
else
{
/// We never read this column from the file, just skip it.
String tmp;
readCSVString(tmp, istr, format_settings.csv);
}
skipDelimiter(istr, delimiter, is_last_file_column);
}
if (have_default_columns)
{
for (size_t i = 0; i < read_columns.size(); i++)
{
if (!read_columns[i])
{
/// The column value for this row is going to be overwritten
/// with default by the caller, but the general assumption is
/// that the column size increases for each row, so we have
/// to insert something. Since we do not care about the exact
/// value, we do not have to use the default value specified by
/// the data type, and can just use IColumn::insertDefault().
columns[i]->insertDefault();
}
}
ext.read_columns = read_columns;
}
return true;
}
/// Builds a human-readable description of the most recently parsed rows by rewinding the buffer
/// and parsing them again with parseRowAndPrintDiagnosticInfo().
/// Returns an empty string if the buffer is at its end, or a short explanation if the rows
/// cannot be re-parsed. Note that this moves the read position of the buffer.
String CSVRowInputStream::getDiagnosticInfo()
{
if (istr.eof()) /// Buffer has gone, cannot extract information about what has been parsed.
return {};
WriteBufferFromOwnString out;
/// Scratch columns for the re-parse; they are local and are not returned to the caller.
MutableColumns columns = header.cloneEmptyColumns();
/// It is possible to display detailed diagnostics only if the last and next to last rows are still in the read buffer.
size_t bytes_read_at_start_of_buffer = istr.count() - istr.offset();
if (bytes_read_at_start_of_buffer != bytes_read_at_start_of_buffer_on_prev_row)
{
out << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n";
return out.str();
}
/// Longest column name and longest type name; passed on to the row printer
/// (which uses them to pad its output).
size_t max_length_of_column_name = 0;
for (size_t i = 0; i < header.columns(); ++i)
if (header.safeGetByPosition(i).name.size() > max_length_of_column_name)
max_length_of_column_name = header.safeGetByPosition(i).name.size();
size_t max_length_of_data_type_name = 0;
for (size_t i = 0; i < header.columns(); ++i)
if (header.safeGetByPosition(i).type->getName().size() > max_length_of_data_type_name)
max_length_of_data_type_name = header.safeGetByPosition(i).type->getName().size();
/// Roll back the cursor to the beginning of the previous or current row and parse all over again. But now we derive detailed information.
if (pos_of_prev_row)
{
istr.position() = pos_of_prev_row;
out << "\nRow " << (row_num - 1) << ":\n";
/// If the previous row itself cannot be re-parsed, stop after reporting it.
if (!parseRowAndPrintDiagnosticInfo(columns, out, max_length_of_column_name, max_length_of_data_type_name))
return out.str();
}
else
{
if (!pos_of_current_row)
{
out << "Could not print diagnostic info because parsing of data hasn't started.\n";
return out.str();
}
istr.position() = pos_of_current_row;
}
out << "\nRow " << row_num << ":\n";
parseRowAndPrintDiagnosticInfo(columns, out, max_length_of_column_name, max_length_of_data_type_name);
out << "\n";
return out.str();
}
/** gcc-7 generates wrong code with optimization level greater than 1.
 * See tests: dbms/src/IO/tests/write_int.cpp
 * and dbms/tests/queries/0_stateless/00898_parsing_bad_diagnostic_message.sh
 * This is a compiler bug. The bug is not present in gcc-8 and clang-8.
 * Nevertheless, we don't need high optimization of this function.
 *
 * Parses one CSV row starting at the current position of `istr` and writes a per-column report to `out`:
 * the file column number, the column name and type, and the text that was parsed,
 * or an "ERROR..." line describing the first problem that was found.
 * `max_length_of_column_name` and `max_length_of_data_type_name` are used only to pad names and types with spaces.
 *
 * Returns true if the whole row was processed without a reported problem;
 * returns false if processing stopped early (end of stream, or a problem was reported).
 */
bool OPTIMIZE(1) CSVRowInputStream::parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
{
const char delimiter = format_settings.csv.delimiter;
for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column)
{
/// Nothing to parse at all: the stream ended before the first field of the row.
if (file_column == 0 && istr.eof())
{
out << "<End of stream>\n";
return false;
}
/// The file column is mapped to a table column: parse the value and describe it.
if (column_indexes_for_input_fields[file_column].has_value())
{
const auto & table_column = *column_indexes_for_input_fields[file_column];
const auto & current_column_type = data_types[table_column];
const bool is_last_file_column =
file_column + 1 == column_indexes_for_input_fields.size();
/// An empty unquoted field: the cursor is already at the delimiter or, for the last column, at the end of the line/stream.
const bool at_delimiter = !istr.eof() && *istr.position() == delimiter;
const bool at_last_column_line_end = is_last_file_column
&& (istr.eof() || *istr.position() == '\n' || *istr.position() == '\r');
/// Column number is padded to three characters; name and type are padded to the widest name/type.
out << "Column " << file_column << ", " << std::string((file_column < 10 ? 2 : file_column < 100 ? 1 : 0), ' ')
<< "name: " << header.safeGetByPosition(table_column).name << ", " << std::string(max_length_of_column_name - header.safeGetByPosition(table_column).name.size(), ' ')
<< "type: " << current_column_type->getName() << ", " << std::string(max_length_of_data_type_name - current_column_type->getName().size(), ' ');
if (format_settings.csv.empty_as_default
&& (at_delimiter || at_last_column_line_end))
{
columns[table_column]->insertDefault();
}
else
{
/// [prev_position, curr_position) is the text consumed by readField, without surrounding spaces and tabs.
BufferBase::Position prev_position = istr.position();
BufferBase::Position curr_position = istr.position();
std::exception_ptr exception;
try
{
skipWhitespacesAndTabs(istr);
prev_position = istr.position();
readField(*columns[table_column], current_column_type, is_last_file_column, table_column);
curr_position = istr.position();
skipWhitespacesAndTabs(istr);
}
catch (...)
{
/// The exception is not rethrown: it only selects which ERROR message is printed below.
exception = std::current_exception();
}
if (curr_position < prev_position)
throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);
if (isNativeNumber(current_column_type) || isDateOrDateTime(current_column_type))
{
/// An empty string instead of a value.
if (curr_position == prev_position)
{
out << "ERROR: text ";
/// Show at most 10 bytes of the offending text, never past the end of the buffer.
verbosePrintString(prev_position, std::min(prev_position + 10, istr.buffer().end()), out);
out << " is not like " << current_column_type->getName() << "\n";
return false;
}
}
out << "parsed text: ";
verbosePrintString(prev_position, curr_position, out);
if (exception)
{
if (current_column_type->getName() == "DateTime")
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
else if (current_column_type->getName() == "Date")
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
else
out << "ERROR\n";
return false;
}
out << "\n";
/// For types reporting haveMaximumSizeOfValue(), the value must be followed by a line end or the delimiter.
if (current_column_type->haveMaximumSizeOfValue()
&& *curr_position != '\n' && *curr_position != '\r'
&& *curr_position != delimiter)
{
out << "ERROR: garbage after " << current_column_type->getName() << ": ";
verbosePrintString(curr_position, std::min(curr_position + 10, istr.buffer().end()), out);
out << "\n";
if (current_column_type->getName() == "DateTime")
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
else if (current_column_type->getName() == "Date")
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
return false;
}
}
}
else
{
/// The file column has no table column: print a placeholder and read the field into a temporary string.
static const String skipped_column_str = "<SKIPPED COLUMN>";
out << "Column " << file_column << ", " << std::string((file_column < 10 ? 2 : file_column < 100 ? 1 : 0), ' ')
<< "name: " << skipped_column_str << ", " << std::string(max_length_of_column_name - skipped_column_str.length(), ' ')
<< "type: " << skipped_column_str << ", " << std::string(max_length_of_data_type_name - skipped_column_str.length(), ' ');
String tmp;
readCSVString(tmp, istr, format_settings.csv);
}
/// Delimiters
if (file_column + 1 == column_indexes_for_input_fields.size())
{
/// After the last column: end of stream stops the diagnostics without an error message.
if (istr.eof())
return false;
/// we support the extra delimiter at the end of the line
if (*istr.position() == delimiter)
{
++istr.position();
if (istr.eof())
break;
}
if (!istr.eof() && *istr.position() != '\n' && *istr.position() != '\r')
{
out << "ERROR: There is no line feed. ";
verbosePrintString(istr.position(), istr.position() + 1, out);
out << " found instead.\n"
" It's like your file has more columns than expected.\n"
"And if your file have right number of columns, maybe it have unquoted string value with comma.\n";
return false;
}
skipEndOfLine(istr);
}
else
{
/// Between columns the delimiter is mandatory; a failed assertChar is turned into an explanatory message.
try
{
assertChar(delimiter, istr);
}
catch (const DB::Exception &)
{
if (*istr.position() == '\n' || *istr.position() == '\r')
{
out << "ERROR: Line feed found where delimiter (" << delimiter << ") is expected."
" It's like your file has less columns than expected.\n"
"And if your file have right number of columns, maybe it have unescaped quotes in values.\n";
}
else
{
out << "ERROR: There is no delimiter (" << delimiter << "). ";
verbosePrintString(istr.position(), istr.position() + 1, out);
out << " found instead.\n";
}
return false;
}
}
}
return true;
}
/// Resynchronizes the input by handing the read buffer to skipToNextLineOrEOF
/// (which, judging by its name, moves the cursor to the next line or to the end of the stream).
void CSVRowInputStream::syncAfterError()
{
skipToNextLineOrEOF(istr);
}
/// Remembers where the row that is about to be read starts, keeping the same data for the row before it.
/// getDiagnosticInfo() uses these values to rewind the buffer and re-parse the last two rows.
void CSVRowInputStream::updateDiagnosticInfo()
{
    /// What used to be the "current" row becomes the "previous" one.
    pos_of_prev_row = pos_of_current_row;
    bytes_read_at_start_of_buffer_on_prev_row = bytes_read_at_start_of_buffer_on_current_row;

    /// Snapshot of the cursor and of the number of bytes consumed before the buffer that is loaded now.
    pos_of_current_row = istr.position();
    bytes_read_at_start_of_buffer_on_current_row = istr.count() - istr.offset();

    ++row_num;
}
/** Reads the value of one CSV field from `istr`.
  *
  * column              - destination column; a value is appended to it only when true is returned.
  * type                - data type used to deserialize the value in the ordinary case.
  * is_last_file_column - whether the field is the last one in the row (then a line end or the end of stream
  *                       right at the cursor also counts as an empty field).
  * column_idx          - index into `column_idx_to_nullable_column_idx`.
  *
  * Returns true if a value was inserted into `column`, and false if nothing was inserted:
  *  - the field is empty and unquoted while `format_settings.csv.empty_as_default` is set;
  *  - the column has an entry in `column_idx_to_nullable_column_idx` and the parsed value is NULL.
  * NOTE(review): callers are presumably expected to insert the default value themselves when false is returned - confirm.
  */
bool CSVRowInputStream::readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column, size_t column_idx)
{
    /// The byte under the cursor may be inspected only if the stream is not exhausted, so the test is
    /// "not at eof AND delimiter under the cursor" (the same check as in parseRowAndPrintDiagnosticInfo).
    /// With "OR" every non-empty field was treated as empty, and the position was dereferenced at eof.
    const bool at_delimiter = !istr.eof() && *istr.position() == format_settings.csv.delimiter;
    const bool at_last_column_line_end = is_last_file_column
        && (istr.eof() || *istr.position() == '\n' || *istr.position() == '\r');

    if (format_settings.csv.empty_as_default
        && (at_delimiter || at_last_column_line_end))
    {
        /// Treat empty unquoted column value as default value, if
        /// specified in the settings. Tuple columns might seem
        /// problematic, because they are never quoted but still contain
        /// commas, which might be also used as delimiters. However,
        /// they do not contain empty unquoted fields, so this check
        /// works for tuples as well.
        return false;
    }
    else if (column_idx_to_nullable_column_idx[column_idx])
    {
        /// If value is null but type is not nullable then use default value instead.
        const size_t nullable_idx = *column_idx_to_nullable_column_idx[column_idx];
        auto & tmp_col = *nullable_columns[nullable_idx];

        /// Parse into the temporary nullable column first, to find out whether the value is NULL.
        nullable_types[nullable_idx]->deserializeAsTextCSV(tmp_col, istr, format_settings);
        Field value = tmp_col[0];
        tmp_col.popBack(1); /// do not store copy of values in memory

        if (value.isNull())
            return false;

        column.insert(value);
        return true;
    }
    else
    {
        /// Read the column normally.
        type->deserializeAsTextCSV(column, istr, format_settings);
        return true;
    }
}
/// Registers two input formats in the factory: "CSV" and "CSVWithNames".
/// They differ only in the `with_names` flag passed to CSVRowInputStream.
void registerInputFormatCSV(FormatFactory & factory)
{
    for (const bool with_names : {false, true})
    {
        const char * const format_name = with_names ? "CSVWithNames" : "CSV";

        /// Wraps the row-by-row CSV parser into a block input stream.
        auto creator = [with_names](
            ReadBuffer & buf,
            const Block & sample,
            const Context &,
            UInt64 max_block_size,
            UInt64 rows_portion_size,
            FormatFactory::ReadCallback callback,
            const FormatSettings & settings)
        {
            return std::make_shared<BlockInputStreamFromRowInputStream>(
                std::make_shared<CSVRowInputStream>(buf, sample, with_names, settings),
                sample, max_block_size, rows_portion_size, callback, settings);
        };

        factory.registerInputFormat(format_name, creator);
    }
}
}

View File

@ -1,83 +0,0 @@
#pragma once
#include <optional>
#include <unordered_map>
#include <Core/Block.h>
#include <Formats/IRowInputStream.h>
#include <Formats/FormatSettings.h>
namespace DB
{
class ReadBuffer;
/** A stream for inputting data in csv format.
 * Does not conform with https://tools.ietf.org/html/rfc4180 because it skips spaces and tabs between values.
 */
class CSVRowInputStream : public IRowInputStream
{
public:
/** with_names - in the first line the header with column names
 */
CSVRowInputStream(ReadBuffer & istr_, const Block & header_, bool with_names_, const FormatSettings & format_settings_);
bool read(MutableColumns & columns, RowReadExtension & ext) override;
void readPrefix() override;
/// This stream always reports that syncAfterError() may be used.
bool allowSyncAfterError() const override { return true; }
void syncAfterError() override;
/// Returns a textual description of the last rows that were read, for error messages.
std::string getDiagnosticInfo() override;
private:
ReadBuffer & istr;
Block header;
bool with_names;
DataTypes data_types;
const FormatSettings format_settings;
/// Column name -> column index.
using IndexesMap = std::unordered_map<String, size_t>;
IndexesMap column_indexes_by_names;
/// Maps indexes of columns in the input file to indexes of table columns
/// An empty optional marks a file column that has no table column.
using OptionalIndexes = std::vector<std::optional<size_t>>;
OptionalIndexes column_indexes_for_input_fields;
/// Tracks which columns we have read in a single read() call.
/// For columns that are never read, it is initialized to false when we
/// read the file header, and never changed afterwards.
/// For other columns, it is updated on each read() call.
std::vector<UInt8> read_columns;
/// Whether we have any columns that are not read from file at all,
/// and must be always initialized with defaults.
bool have_always_default_columns = false;
void addInputColumn(const String & column_name);
/// For convenient diagnostics in case of an error.
size_t row_num = 0;
/// How many bytes were read, not counting those that are still in the buffer.
size_t bytes_read_at_start_of_buffer_on_current_row = 0;
size_t bytes_read_at_start_of_buffer_on_prev_row = 0;
/// Positions inside the read buffer where the current and the previous rows start (nullptr until set).
char * pos_of_current_row = nullptr;
char * pos_of_prev_row = nullptr;
/// For setting input_format_null_as_default
DataTypes nullable_types;
MutableColumns nullable_columns;
/// Indexed by column; when set, holds an index into nullable_types / nullable_columns.
OptionalIndexes column_idx_to_nullable_column_idx;
/// Advances row_num and remembers buffer positions for getDiagnosticInfo().
void updateDiagnosticInfo();
/// Re-parses one row and writes a per-column report to `out`; returns false if it stopped early.
bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name);
/// Reads one field; returns false if no value was inserted into `column`.
bool readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column, size_t column_idx);
};
}

View File

@ -47,6 +47,9 @@ static FormatSettings getInputFormatSetting(const Settings & settings)
format_settings.date_time_input_format = settings.date_time_input_format; format_settings.date_time_input_format = settings.date_time_input_format;
format_settings.input_allow_errors_num = settings.input_format_allow_errors_num; format_settings.input_allow_errors_num = settings.input_format_allow_errors_num;
format_settings.input_allow_errors_ratio = settings.input_format_allow_errors_ratio; format_settings.input_allow_errors_ratio = settings.input_format_allow_errors_ratio;
format_settings.template_settings.format = settings.format_schema;
format_settings.template_settings.row_format = settings.format_schema_rows;
format_settings.template_settings.row_between_delimiter = settings.format_schema_rows_between_delimiter;
return format_settings; return format_settings;
} }
@ -63,6 +66,9 @@ static FormatSettings getOutputFormatSetting(const Settings & settings)
format_settings.pretty.max_rows = settings.output_format_pretty_max_rows; format_settings.pretty.max_rows = settings.output_format_pretty_max_rows;
format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width; format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width;
format_settings.pretty.color = settings.output_format_pretty_color; format_settings.pretty.color = settings.output_format_pretty_color;
format_settings.template_settings.format = settings.format_schema;
format_settings.template_settings.row_format = settings.format_schema_rows;
format_settings.template_settings.row_between_delimiter = settings.format_schema_rows_between_delimiter;
format_settings.write_statistics = settings.output_format_write_statistics; format_settings.write_statistics = settings.output_format_write_statistics;
format_settings.parquet.row_group_size = settings.output_format_parquet_row_group_size; format_settings.parquet.row_group_size = settings.output_format_parquet_row_group_size;
@ -220,8 +226,6 @@ void FormatFactory::registerOutputFormatProcessor(const String & name, OutputPro
void registerInputFormatNative(FormatFactory & factory); void registerInputFormatNative(FormatFactory & factory);
void registerOutputFormatNative(FormatFactory & factory); void registerOutputFormatNative(FormatFactory & factory);
void registerInputFormatTabSeparated(FormatFactory & factory);
void registerInputFormatCSV(FormatFactory & factory);
void registerInputFormatProcessorNative(FormatFactory & factory); void registerInputFormatProcessorNative(FormatFactory & factory);
void registerOutputFormatProcessorNative(FormatFactory & factory); void registerOutputFormatProcessorNative(FormatFactory & factory);
@ -242,6 +246,8 @@ void registerInputFormatProcessorORC(FormatFactory & factory);
void registerOutputFormatProcessorParquet(FormatFactory & factory); void registerOutputFormatProcessorParquet(FormatFactory & factory);
void registerInputFormatProcessorProtobuf(FormatFactory & factory); void registerInputFormatProcessorProtobuf(FormatFactory & factory);
void registerOutputFormatProcessorProtobuf(FormatFactory & factory); void registerOutputFormatProcessorProtobuf(FormatFactory & factory);
void registerInputFormatProcessorTemplate(FormatFactory & factory);
void registerOutputFormatProcessorTemplate(FormatFactory &factory);
/// Output only (presentational) formats. /// Output only (presentational) formats.
@ -267,8 +273,6 @@ FormatFactory::FormatFactory()
{ {
registerInputFormatNative(*this); registerInputFormatNative(*this);
registerOutputFormatNative(*this); registerOutputFormatNative(*this);
registerInputFormatTabSeparated(*this);
registerInputFormatCSV(*this);
registerOutputFormatProcessorJSONEachRowWithProgress(*this); registerOutputFormatProcessorJSONEachRowWithProgress(*this);
@ -292,6 +296,8 @@ FormatFactory::FormatFactory()
registerInputFormatProcessorORC(*this); registerInputFormatProcessorORC(*this);
registerInputFormatProcessorParquet(*this); registerInputFormatProcessorParquet(*this);
registerOutputFormatProcessorParquet(*this); registerOutputFormatProcessorParquet(*this);
registerInputFormatProcessorTemplate(*this);
registerOutputFormatProcessorTemplate(*this);
registerOutputFormatNull(*this); registerOutputFormatNull(*this);

View File

@ -50,6 +50,15 @@ struct FormatSettings
Values values; Values values;
/// Settings of the Template format.
struct Template
{
/// Taken from the `format_schema` setting.
String format;
/// Taken from the `format_schema_rows` setting.
String row_format;
/// Taken from the `format_schema_rows_between_delimiter` setting.
String row_between_delimiter;
};
Template template_settings;
bool skip_unknown_fields = false; bool skip_unknown_fields = false;
bool with_names_use_header = false; bool with_names_use_header = false;
bool write_statistics = true; bool write_statistics = true;

View File

@ -0,0 +1,217 @@
#include <Formats/ParsedTemplateFormatString.h>
#include <Formats/verbosePrintString.h>
#include <IO/ReadBufferFromMemory.h>
#include <IO/Operators.h>
namespace DB
{
namespace ErrorCodes
{
extern const int INVALID_TEMPLATE_FORMAT;
}
/// Parses `format_string` right away. Every DB::Exception leaving the constructor has the
/// INVALID_TEMPLATE_FORMAT code: exceptions with other codes are converted by throwInvalidFormat.
ParsedTemplateFormatString::ParsedTemplateFormatString(const String & format_string, const ColumnIdxGetter & idx_by_name)
{
    try
    {
        parse(format_string, idx_by_name);
    }
    catch (DB::Exception & e)
    {
        /// Already in the Template-specific form: let it through untouched.
        if (e.code() == ErrorCodes::INVALID_TEMPLATE_FORMAT)
            throw;

        /// Anything else is reported again, with the dump of what has been parsed so far.
        throwInvalidFormat(e.message(), columnsCount());
    }
}
/// Splits `format_string` into delimiters and column placeholders of the form ${name} or ${name:Format}
/// and appends the results to `delimiters`, `column_names`, `formats` and `format_idx_to_column_idx`.
/// `idx_by_name` is called once per placeholder to map the column name to an optional column index.
/// Syntax errors are reported through throwInvalidFormat.
void ParsedTemplateFormatString::parse(const String & format_string, const ColumnIdxGetter & idx_by_name)
{
/// Delimiter - plain text between placeholders; Column - right after "${"; Format - after ':' inside a placeholder.
enum ParserState
{
Delimiter,
Column,
Format
};
const char * pos = format_string.c_str();
const char * end = format_string.c_str() + format_string.size();
/// Start of the piece of text (delimiter or format name) that has not been stored yet.
const char * token_begin = pos;
ParserState state = Delimiter;
delimiters.emplace_back();
for (; *pos; ++pos)
{
switch (state)
{
case Delimiter:
if (*pos == '$')
{
/// Store the text accumulated before '$' and look at the next character.
delimiters.back().append(token_begin, pos - token_begin);
++pos;
if (*pos == '{')
{
token_begin = pos + 1;
state = Column;
}
else if (*pos == '$')
{
/// "$$": the second '$' starts the next piece of delimiter text, so it is kept as a literal '$'.
token_begin = pos;
}
else
throwInvalidFormat("at pos " + std::to_string(pos - format_string.c_str()) +
": expected '{' or '$' after '$'", columnsCount());
}
break;
case Column:
column_names.emplace_back();
/// Moves `pos` to the first character after the (possibly quoted) column name.
pos = readMayBeQuotedColumnNameInto(pos, end - pos, column_names.back());
if (*pos == ':')
state = Format;
else if (*pos == '}')
{
/// No format was given for this column.
formats.push_back(ColumnFormat::None);
delimiters.emplace_back();
state = Delimiter;
}
else
throwInvalidFormat("Expected ':' or '}' after column name: \"" + column_names.back() + "\"", columnsCount());
token_begin = pos + 1;
format_idx_to_column_idx.emplace_back(idx_by_name(column_names.back()));
break;
case Format:
if (*pos == '}')
{
/// The text between ':' and '}' is the name of the format.
formats.push_back(stringToFormat(String(token_begin, pos - token_begin)));
token_begin = pos + 1;
delimiters.emplace_back();
state = Delimiter;
}
}
}
/// The string ended inside a placeholder.
if (state != Delimiter)
throwInvalidFormat("Unbalanced parentheses", columnsCount());
/// The text after the last placeholder is the last delimiter.
delimiters.back().append(token_begin, pos - token_begin);
}
/// Converts the name of a field format, as written in a format string, to ColumnFormat.
/// An empty name means ColumnFormat::None. An unknown name is reported through throwInvalidFormat.
ParsedTemplateFormatString::ColumnFormat ParsedTemplateFormatString::stringToFormat(const String & col_format) const
{
    struct KnownFormat
    {
        const char * name;
        ColumnFormat format;
    };

    /// Names are compared exactly (case-sensitive).
    static constexpr KnownFormat known_formats[] =
    {
        {"", ColumnFormat::None},
        {"None", ColumnFormat::None},
        {"Escaped", ColumnFormat::Escaped},
        {"Quoted", ColumnFormat::Quoted},
        {"CSV", ColumnFormat::Csv},
        {"JSON", ColumnFormat::Json},
        {"XML", ColumnFormat::Xml},
        {"Raw", ColumnFormat::Raw},
    };

    for (const auto & known : known_formats)
        if (col_format == known.name)
            return known.format;

    throwInvalidFormat("Unknown field format " + col_format, columnsCount());
}
/// Number of column placeholders registered so far (the size of format_idx_to_column_idx).
size_t ParsedTemplateFormatString::columnsCount() const
{
return format_idx_to_column_idx.size();
}
/// Returns the name of the format for diagnostic messages.
/// NOTE(review): "Json" and "Xml" are spelled differently from the names accepted by stringToFormat
/// ("JSON", "XML"), so the result cannot always be fed back to stringToFormat - confirm this is intended.
String ParsedTemplateFormatString::formatToString(ParsedTemplateFormatString::ColumnFormat format)
{
    const char * name = nullptr;

    switch (format)
    {
        case ColumnFormat::None:    name = "None"; break;
        case ColumnFormat::Escaped: name = "Escaped"; break;
        case ColumnFormat::Quoted:  name = "Quoted"; break;
        case ColumnFormat::Csv:     name = "CSV"; break;
        case ColumnFormat::Json:    name = "Json"; break;
        case ColumnFormat::Xml:     name = "Xml"; break;
        case ColumnFormat::Raw:     name = "Raw"; break;
    }

    /// All enumerators are handled above.
    if (!name)
        __builtin_unreachable();

    return name;
}
const char * ParsedTemplateFormatString::readMayBeQuotedColumnNameInto(const char * pos, size_t size, String & s)
{
s.clear();
if (!size)
return pos;
ReadBufferFromMemory buf{pos, size};
if (*pos == '"')
readDoubleQuotedStringWithSQLStyle(s, buf);
else if (*pos == '`')
readBackQuotedStringWithSQLStyle(s, buf);
else if (isWordCharASCII(*pos))
{
size_t name_size = 1;
while (name_size < size && isWordCharASCII(*(pos + name_size)))
++name_size;
s = String{pos, name_size};
return pos + name_size;
}
return pos + buf.count();
}
/// Returns a human-readable description of everything that has been parsed so far:
/// "Delimiter 0", then for every column its name, the table column it is mapped to, its format
/// and the delimiter that follows it. Pieces that are missing (for example because parsing
/// stopped in the middle of a placeholder) are printed as "<ERROR>".
/// Safe to call on an object that has not parsed anything yet: delimiter 0 is then "<ERROR>" too.
String ParsedTemplateFormatString::dump() const
{
    WriteBufferFromOwnString res;

    /// Prints "Delimiter N: " followed by the delimiter text, or by "<ERROR>" if there is no such delimiter.
    /// The bounds check also covers delimiter 0: calling front() on an empty vector is undefined behaviour.
    auto print_delimiter = [&](size_t idx)
    {
        res << "Delimiter " << idx << ": ";
        if (delimiters.size() <= idx)
            res << "<ERROR>";
        else
            verbosePrintString(delimiters[idx].c_str(), delimiters[idx].c_str() + delimiters[idx].size(), res);
    };

    print_delimiter(0);

    /// The vectors may have different sizes if parsing failed half-way; describe the longest one.
    const size_t num_columns = std::max(formats.size(), format_idx_to_column_idx.size());
    for (size_t i = 0; i < num_columns; ++i)
    {
        res << "\nColumn " << i << ": \"";
        if (column_names.size() <= i)
            res << "<ERROR>";
        else if (column_names[i].empty())
            res << "<SKIPPED>";
        else
            res << column_names[i];

        res << "\" (mapped to table column ";
        if (format_idx_to_column_idx.size() <= i)
            res << "<ERROR>";
        else if (!format_idx_to_column_idx[i])
            res << "<SKIPPED>";
        else
            res << *format_idx_to_column_idx[i];

        res << "), Format " << (i < formats.size() ? formatToString(formats[i]) : "<ERROR>");

        res << "\n";
        print_delimiter(i + 1);
    }

    return res.str();
}
void ParsedTemplateFormatString::throwInvalidFormat(const String & message, size_t column) const
{
throw Exception("Invalid format string for Template: " + message + " (near column " + std::to_string(column) +
")" + ". Parsed format string:\n" + dump() + "\n",
ErrorCodes::INVALID_TEMPLATE_FORMAT);
}
}

View File

@ -0,0 +1,51 @@
#pragma once
#include <Core/Types.h>
#include <functional>
#include <optional>
namespace DB
{
/// A format string of the Template format, split into delimiters and column placeholders.
struct ParsedTemplateFormatString
{
    /// The way a value of a single column is written in the text.
    enum class ColumnFormat
    {
        None,
        Escaped,
        Quoted,
        Csv,
        Json,
        Xml,
        Raw
    };

    /// Maps the name of a column from a placeholder to an optional column index.
    using ColumnIdxGetter = std::function<std::optional<size_t>(const String &)>;

    /// Format string has syntax: "Delimiter0 ${ColumnName0:Format0} Delimiter1 ${ColumnName1:Format1} Delimiter2"
    /// The following vectors are filled with the corresponding values, so that
    /// delimiters.size() - 1 = formats.size() = format_idx_to_column_idx.size()
    /// If format_idx_to_column_idx[i] has no value, then TemplateRowInputFormat will skip i-th column.
    std::vector<String> delimiters;
    std::vector<ColumnFormat> formats;
    std::vector<std::optional<size_t>> format_idx_to_column_idx;

    /// For diagnostic info
    Strings column_names;

    ParsedTemplateFormatString() = default;

    /// Parses `format_string` immediately, see parse().
    ParsedTemplateFormatString(const String & format_string, const ColumnIdxGetter & idx_by_name);

    /// Appends the parsed pieces of `format_string` to the vectors above.
    void parse(const String & format_string, const ColumnIdxGetter & idx_by_name);

    /// Conversions between ColumnFormat and its textual name.
    ColumnFormat stringToFormat(const String & format) const;
    static String formatToString(ColumnFormat format);

    /// Reads a possibly quoted column name from `size` bytes at `pos` into `s`; returns the position after it.
    static const char * readMayBeQuotedColumnNameInto(const char * pos, size_t size, String & s);

    /// Number of column placeholders parsed so far.
    size_t columnsCount() const;

    /// Human-readable description of the parsed pieces, used in error messages.
    String dump() const;

    /// Throws an exception built from `message`, `column` and dump().
    [[noreturn]] void throwInvalidFormat(const String & message, size_t column) const;
};
}

View File

@ -1,504 +0,0 @@
#include <string>
#include <Core/Defines.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteBufferFromString.h>
#include <IO/Operators.h>
#include <Formats/TabSeparatedRowInputStream.h>
#include <Formats/verbosePrintString.h>
#include <Formats/FormatFactory.h>
#include <Formats/BlockInputStreamFromRowInputStream.h>
namespace DB
{
namespace ErrorCodes
{
extern const int INCORRECT_DATA;
extern const int LOGICAL_ERROR;
}
/// Reads one row of `num_columns` tab-separated fields and throws the values away.
/// Each field must be followed by a tab, except the last one, which must be followed by a line feed.
static void skipTSVRow(ReadBuffer & istr, const size_t num_columns)
{
    NullSink null_sink;

    size_t fields_left = num_columns;
    while (fields_left != 0)
    {
        --fields_left;

        readEscapedStringInto(null_sink, istr);

        const char expected_separator = (fields_left == 0) ? '\n' : '\t';
        assertChar(expected_separator, istr);
    }
}
/** Detects a common mistake - input with Windows line feeds.
  * Throws INCORRECT_DATA if a carriage return is found at the cursor or in the byte right before it.
  */
static void checkForCarriageReturn(ReadBuffer & istr)
{
    const char * const cursor = istr.position();

    if (cursor[0] != '\r')
    {
        /// The previous byte may be examined only if the cursor is not at the very beginning of the buffer.
        const bool has_previous_byte = cursor != istr.buffer().begin();
        if (!has_previous_byte || cursor[-1] != '\r')
            return;
    }

    throw Exception("\nYou have carriage return (\\r, 0x0D, ASCII 13) at end of first row."
        "\nIt's like your input data has DOS/Windows style line separators, that are illegal in TabSeparated format."
        " You must transform your file to Unix format."
        "\nBut if you really need carriage return at end of string value of last column, you need to escape it as \\r.",
        ErrorCodes::INCORRECT_DATA);
}
/// Remembers the type and the index of every column of `header_`.
/// No input field is mapped to a column yet: that is done later, when the stream prefix is read.
TabSeparatedRowInputStream::TabSeparatedRowInputStream(
    ReadBuffer & istr_, const Block & header_, bool with_names_, bool with_types_, const FormatSettings & format_settings_)
    : istr(istr_), header(header_), with_names(with_names_), with_types(with_types_), format_settings(format_settings_)
{
    const auto total_columns = header.columns();

    data_types.resize(total_columns);
    column_indexes_by_names.reserve(total_columns);

    for (size_t position = 0; position != total_columns; ++position)
    {
        const auto & described_column = header.getByPosition(position);

        data_types[position] = described_column.type;
        column_indexes_by_names.emplace(described_column.name, position);
    }

    column_indexes_for_input_fields.reserve(total_columns);

    /// Initially no column is marked as read.
    read_columns.assign(total_columns, false);
}
/// Maps input fields to the columns of the header one to one, in the same order,
/// and marks every column as read.
void TabSeparatedRowInputStream::setupAllColumnsByTableSchema()
{
    const auto total_columns = header.columns();

    read_columns.assign(total_columns, true);
    column_indexes_for_input_fields.resize(header.columns());

    /// i-th input field goes to i-th column.
    size_t next_index = 0;
    for (auto & mapped_index : column_indexes_for_input_fields)
    {
        mapped_index = next_index;
        ++next_index;
    }
}
/// Registers the next input field, whose name was read from the header row.
/// A name unknown to the table is either skipped (if `skip_unknown_fields` is set) or reported as INCORRECT_DATA.
/// A name that has already been registered is reported as INCORRECT_DATA.
void TabSeparatedRowInputStream::addInputColumn(const String & column_name)
{
    const auto found = column_indexes_by_names.find(column_name);
    const bool is_known = found != column_indexes_by_names.end();

    if (!is_known && format_settings.skip_unknown_fields)
    {
        /// The field stays in the file layout, but its values will not be stored anywhere.
        column_indexes_for_input_fields.push_back(std::nullopt);
        return;
    }

    if (!is_known)
        throw Exception(
            "Unknown field found in TSV header: '" + column_name + "' " +
            "at position " + std::to_string(column_indexes_for_input_fields.size()) +
            "\nSet the 'input_format_skip_unknown_fields' parameter explicitly to ignore and proceed",
            ErrorCodes::INCORRECT_DATA);

    const auto table_index = found->second;

    if (read_columns[table_index])
        throw Exception("Duplicate field found while parsing TSV header: " + column_name, ErrorCodes::INCORRECT_DATA);

    read_columns[table_index] = true;
    column_indexes_for_input_fields.emplace_back(table_index);
}
/// Inserts a default value into every column that is not read from the input
/// and reports the set of read columns through `row_read_extension`.
void TabSeparatedRowInputStream::fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension & row_read_extension)
{
    /// The list of unread columns is computed on the first row only and reused afterwards:
    /// the format guarantees that it does not change.
    const bool is_first_row = unlikely(row_num == 1);
    if (is_first_row)
    {
        columns_to_fill_with_default_values.clear();

        const size_t total_columns = read_columns.size();
        for (size_t position = 0; position != total_columns; ++position)
        {
            if (read_columns[position] == 0)
                columns_to_fill_with_default_values.push_back(position);
        }
    }

    for (const auto unread_index : columns_to_fill_with_default_values)
    {
        auto & destination = *columns[unread_index];
        data_types[unread_index]->insertDefaultInto(destination);
    }

    row_read_extension.read_columns = read_columns;
}
/// Reads the header rows, if the format has them, and decides which input field goes to which column.
void TabSeparatedRowInputStream::readPrefix()
{
    /// In this format, we assume that column name or type cannot contain BOM, so, if the format has a header,
    /// BOM at the beginning of the stream cannot be confused with a name or a type, and it is safe to skip it.
    const bool has_header_rows = with_names || with_types;
    if (has_header_rows)
        skipBOMIfExists(istr);

    const bool take_columns_from_header = with_names && format_settings.with_names_use_header;
    if (take_columns_from_header)
    {
        /// The row with names defines the set and the order of input fields.
        String column_name;
        for (;;)
        {
            readEscapedString(column_name, istr);
            addInputColumn(column_name);

            if (!checkChar('\t', istr))
                break;
        }

        if (!istr.eof())
        {
            checkForCarriageReturn(istr);
            assertChar('\n', istr);
        }
    }
    else
    {
        /// Input fields follow the table schema; the row with names, if there is one, is just skipped.
        setupAllColumnsByTableSchema();

        if (with_names)
            skipTSVRow(istr, column_indexes_for_input_fields.size());
    }

    /// The row with types is never interpreted.
    if (with_types)
        skipTSVRow(istr, column_indexes_for_input_fields.size());
}
/// Reads one row into `columns`. Returns false if the stream has ended, true otherwise.
/// Fields that are not mapped to a column are parsed and thrown away.
bool TabSeparatedRowInputStream::read(MutableColumns & columns, RowReadExtension & ext)
{
    if (istr.eof())
        return false;

    updateDiagnosticInfo();

    const size_t fields_in_row = column_indexes_for_input_fields.size();
    for (size_t field = 0; field != fields_in_row; ++field)
    {
        const auto & target_column = column_indexes_for_input_fields[field];

        if (!target_column)
        {
            NullSink null_sink;
            readEscapedStringInto(null_sink, istr);
        }
        else
        {
            data_types[*target_column]->deserializeAsTextEscaped(*columns[*target_column], istr, format_settings);
        }

        /// skip separators
        const bool is_last_field = field + 1 == fields_in_row;
        if (!is_last_field)
        {
            assertChar('\t', istr);
            continue;
        }

        /// The last row may end without a line feed.
        if (istr.eof())
            continue;

        if (unlikely(row_num == 1))
            checkForCarriageReturn(istr);

        assertChar('\n', istr);
    }

    fillUnreadColumnsWithDefaults(columns, ext);

    return true;
}
/// Returns a description of the last two rows (or of the only row) that were read, for error messages.
/// The rows are parsed again from the read buffer, so this works only while they are still in the buffer.
String TabSeparatedRowInputStream::getDiagnosticInfo()
{
    /// Buffer has gone, cannot extract information about what has been parsed.
    if (istr.eof())
        return {};

    WriteBufferFromOwnString out;
    MutableColumns columns = header.cloneEmptyColumns();

    /// It is possible to display detailed diagnostics only if the last and next to last lines are still in the read buffer.
    const size_t buffer_start_offset = istr.count() - istr.offset();
    if (buffer_start_offset != bytes_read_at_start_of_buffer_on_prev_row)
    {
        out << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n";
        return out.str();
    }

    /// Widths used to align names and types in the report.
    size_t widest_column_name = 0;
    size_t widest_type_name = 0;
    for (size_t i = 0; i < header.columns(); ++i)
    {
        const auto & described_column = header.safeGetByPosition(i);
        widest_column_name = std::max(widest_column_name, described_column.name.size());
        widest_type_name = std::max(widest_type_name, described_column.type->getName().size());
    }

    /// Roll back the cursor to the beginning of the previous or current line and parse all over again.
    /// But now we derive detailed information.
    if (pos_of_prev_row)
    {
        istr.position() = pos_of_prev_row;

        out << "\nRow " << (row_num - 1) << ":\n";
        if (!parseRowAndPrintDiagnosticInfo(columns, out, widest_column_name, widest_type_name))
            return out.str();
    }
    else if (pos_of_current_row)
    {
        istr.position() = pos_of_current_row;
    }
    else
    {
        out << "Could not print diagnostic info because parsing of data hasn't started.\n";
        return out.str();
    }

    out << "\nRow " << row_num << ":\n";
    parseRowAndPrintDiagnosticInfo(columns, out, widest_column_name, widest_type_name);
    out << "\n";

    return out.str();
}
/** gcc-7 generates wrong code with optimization level greater than 1.
 * See tests: dbms/src/IO/tests/write_int.cpp
 * and dbms/tests/queries/0_stateless/00898_parsing_bad_diagnostic_message.sh
 * This is a compiler bug. The bug is not present in gcc-8 and clang-8.
 * Nevertheless, we don't need high optimization of this function.
 *
 * Parses one tab-separated row starting at the current position of `istr` and writes a per-column report to `out`:
 * the input field number, the column name and type, and the text that was parsed,
 * or an "ERROR..." line describing the first problem that was found.
 * `max_length_of_column_name` and `max_length_of_data_type_name` are used only to pad names and types with spaces.
 *
 * Returns true if the whole row was processed without a reported problem;
 * returns false if processing stopped early (end of stream before the first field, or a problem was reported).
 */
bool OPTIMIZE(1) TabSeparatedRowInputStream::parseRowAndPrintDiagnosticInfo(
MutableColumns & columns, WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
{
for (size_t input_position = 0; input_position < column_indexes_for_input_fields.size(); ++input_position)
{
/// Nothing to parse at all: the stream ended before the first field of the row.
if (input_position == 0 && istr.eof())
{
out << "<End of stream>\n";
return false;
}
/// The input field is mapped to a column: parse the value and describe it.
if (column_indexes_for_input_fields[input_position].has_value())
{
const auto & column_index = *column_indexes_for_input_fields[input_position];
const auto & current_column_type = data_types[column_index];
/// Field number is padded to three characters; name and type are padded to the widest name/type.
out << "Column " << input_position << ", " << std::string((input_position < 10 ? 2 : input_position < 100 ? 1 : 0), ' ')
<< "name: " << header.safeGetByPosition(column_index).name << ", " << std::string(max_length_of_column_name - header.safeGetByPosition(column_index).name.size(), ' ')
<< "type: " << current_column_type->getName() << ", " << std::string(max_length_of_data_type_name - current_column_type->getName().size(), ' ');
/// [prev_position, curr_position) is the text consumed while parsing the value.
auto prev_position = istr.position();
std::exception_ptr exception;
try
{
current_column_type->deserializeAsTextEscaped(*columns[column_index], istr, format_settings);
}
catch (...)
{
/// The exception is not rethrown: it only selects which ERROR message is printed below.
exception = std::current_exception();
}
auto curr_position = istr.position();
if (curr_position < prev_position)
throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);
if (isNativeNumber(current_column_type) || isDateOrDateTime(current_column_type))
{
/// An empty string instead of a value.
if (curr_position == prev_position)
{
out << "ERROR: text ";
/// Show at most 10 bytes of the offending text, never past the end of the buffer.
verbosePrintString(prev_position, std::min(prev_position + 10, istr.buffer().end()), out);
out << " is not like " << current_column_type->getName() << "\n";
return false;
}
}
out << "parsed text: ";
verbosePrintString(prev_position, curr_position, out);
if (exception)
{
if (current_column_type->getName() == "DateTime")
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
else if (current_column_type->getName() == "Date")
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
else
out << "ERROR\n";
return false;
}
out << "\n";
/// For types reporting haveMaximumSizeOfValue(), the value must be followed by a line feed or a tab.
if (current_column_type->haveMaximumSizeOfValue())
{
if (*curr_position != '\n' && *curr_position != '\t')
{
out << "ERROR: garbage after " << current_column_type->getName() << ": ";
verbosePrintString(curr_position, std::min(curr_position + 10, istr.buffer().end()), out);
out << "\n";
if (current_column_type->getName() == "DateTime")
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
else if (current_column_type->getName() == "Date")
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
return false;
}
}
}
else
{
/// The input field has no column: print a placeholder and parse the field into a sink that discards it.
static const String skipped_column_str = "<SKIPPED COLUMN>";
out << "Column " << input_position << ", " << std::string((input_position < 10 ? 2 : input_position < 100 ? 1 : 0), ' ')
<< "name: " << skipped_column_str << ", " << std::string(max_length_of_column_name - skipped_column_str.length(), ' ')
<< "type: " << skipped_column_str << ", " << std::string(max_length_of_data_type_name - skipped_column_str.length(), ' ');
NullSink null_sink;
readEscapedStringInto(null_sink, istr);
}
/// Delimiters
if (input_position + 1 == column_indexes_for_input_fields.size())
{
/// After the last field a line feed is required, unless the stream has ended.
if (!istr.eof())
{
try
{
assertChar('\n', istr);
}
catch (const DB::Exception &)
{
/// Turn the failed assertChar into an explanatory message, depending on what was found instead.
if (*istr.position() == '\t')
{
out << "ERROR: Tab found where line feed is expected."
" It's like your file has more columns than expected.\n"
"And if your file have right number of columns, maybe it have unescaped tab in value.\n";
}
else if (*istr.position() == '\r')
{
out << "ERROR: Carriage return found where line feed is expected."
" It's like your file has DOS/Windows style line separators, that is illegal in TabSeparated format.\n";
}
else
{
out << "ERROR: There is no line feed. ";
verbosePrintString(istr.position(), istr.position() + 1, out);
out << " found instead.\n";
}
return false;
}
}
}
else
{
/// Between fields a tab is mandatory.
try
{
assertChar('\t', istr);
}
catch (const DB::Exception &)
{
if (*istr.position() == '\n')
{
out << "ERROR: Line feed found where tab is expected."
" It's like your file has less columns than expected.\n"
"And if your file have right number of columns, maybe it have unescaped backslash in value before tab, which cause tab has escaped.\n";
}
else if (*istr.position() == '\r')
{
out << "ERROR: Carriage return found where tab is expected.\n";
}
else
{
out << "ERROR: There is no tab. ";
verbosePrintString(istr.position(), istr.position() + 1, out);
out << " found instead.\n";
}
return false;
}
}
}
return true;
}
/// Called to resynchronize the stream after a parse error.
/// Delegates entirely to skipToUnescapedNextLineOrEOF() on the input buffer;
/// presumably this discards the rest of the broken row (helper is defined elsewhere - verify).
void TabSeparatedRowInputStream::syncAfterError()
{
skipToUnescapedNextLineOrEOF(istr);
}
/// Advances the per-row bookkeeping used for diagnostics:
/// the "current row" markers are shifted into the "previous row" slots and then re-sampled
/// from the input buffer, and the row counter is incremented.
void TabSeparatedRowInputStream::updateDiagnosticInfo()
{
    /// Position inside the buffer where the row starts.
    pos_of_prev_row = pos_of_current_row;
    pos_of_current_row = istr.position();

    /// Number of bytes that had been read before the data currently held in the buffer.
    bytes_read_at_start_of_buffer_on_prev_row = bytes_read_at_start_of_buffer_on_current_row;
    bytes_read_at_start_of_buffer_on_current_row = istr.count() - istr.offset();

    ++row_num;
}
/// Registers the TabSeparated family of input formats, together with their TSV aliases.
/// The three variants differ only in the two boolean header flags (with_names, with_types)
/// passed to TabSeparatedRowInputStream.
void registerInputFormatTabSeparated(FormatFactory & factory)
{
    /// with_names = false, with_types = false
    const auto create_plain = [](
        ReadBuffer & buf,
        const Block & sample,
        const Context &,
        UInt64 max_block_size,
        UInt64 rows_portion_size,
        FormatFactory::ReadCallback callback,
        const FormatSettings & settings)
    {
        return std::make_shared<BlockInputStreamFromRowInputStream>(
            std::make_shared<TabSeparatedRowInputStream>(buf, sample, false, false, settings),
            sample, max_block_size, rows_portion_size, callback, settings);
    };

    /// with_names = true, with_types = false
    const auto create_with_names = [](
        ReadBuffer & buf,
        const Block & sample,
        const Context &,
        UInt64 max_block_size,
        UInt64 rows_portion_size,
        FormatFactory::ReadCallback callback,
        const FormatSettings & settings)
    {
        return std::make_shared<BlockInputStreamFromRowInputStream>(
            std::make_shared<TabSeparatedRowInputStream>(buf, sample, true, false, settings),
            sample, max_block_size, rows_portion_size, callback, settings);
    };

    /// with_names = true, with_types = true
    const auto create_with_names_and_types = [](
        ReadBuffer & buf,
        const Block & sample,
        const Context &,
        UInt64 max_block_size,
        UInt64 rows_portion_size,
        FormatFactory::ReadCallback callback,
        const FormatSettings & settings)
    {
        return std::make_shared<BlockInputStreamFromRowInputStream>(
            std::make_shared<TabSeparatedRowInputStream>(buf, sample, true, true, settings),
            sample, max_block_size, rows_portion_size, callback, settings);
    };

    for (const char * format_name : {"TabSeparated", "TSV"})
        factory.registerInputFormat(format_name, create_plain);

    for (const char * format_name : {"TabSeparatedWithNames", "TSVWithNames"})
        factory.registerInputFormat(format_name, create_with_names);

    for (const char * format_name : {"TabSeparatedWithNamesAndTypes", "TSVWithNamesAndTypes"})
        factory.registerInputFormat(format_name, create_with_names_and_types);
}
}

View File

@ -1,73 +0,0 @@
#pragma once
#include <optional>
#include <unordered_map>
#include <Core/Block.h>
#include <Formats/FormatSettings.h>
#include <Formats/IRowInputStream.h>
namespace DB
{
class ReadBuffer;
/** A stream to input data in tsv format.
  * Implements the IRowInputStream interface on top of a ReadBuffer.
  * Supports resynchronization after a parse error (see allowSyncAfterError / syncAfterError)
  * and can produce textual diagnostics about rows that failed to parse.
*/
class TabSeparatedRowInputStream : public IRowInputStream
{
public:
/** with_names - the first line is the header with the names of the columns
* with_types - on the next line header with type names
*/
TabSeparatedRowInputStream(
ReadBuffer & istr_, const Block & header_, bool with_names_, bool with_types_, const FormatSettings & format_settings_);
/// Reads one row into `columns`; `ext` receives extra per-row information.
bool read(MutableColumns & columns, RowReadExtension & ext) override;
/// Called before the first row; presumably consumes the optional header lines - see the .cpp.
void readPrefix() override;
/// This format can always attempt to continue after a broken row.
bool allowSyncAfterError() const override { return true; }
void syncAfterError() override;
/// Returns a human-readable description of the parse failure.
std::string getDiagnosticInfo() override;
private:
/// Source of the text data (not owned).
ReadBuffer & istr;
/// Sample block describing the expected columns.
Block header;
bool with_names;
bool with_types;
/// Stored by value, so later changes to the caller's settings are not observed.
const FormatSettings format_settings;
DataTypes data_types;
/// Column name -> index.
using IndexesMap = std::unordered_map<String, size_t>;
IndexesMap column_indexes_by_names;
/// One entry per input field; an empty optional marks a field that is skipped
/// (the diagnostic code prints such fields as "<SKIPPED COLUMN>").
using OptionalIndexes = std::vector<std::optional<size_t>>;
OptionalIndexes column_indexes_for_input_fields;
/// Per-column flags; UInt8 is used instead of bool as the element type.
std::vector<UInt8> read_columns;
std::vector<size_t> columns_to_fill_with_default_values;
void addInputColumn(const String & column_name);
void setupAllColumnsByTableSchema();
void fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension& ext);
/// For convenient diagnostics in case of an error.
size_t row_num = 0;
/// How many bytes were read, not counting those still in the buffer.
size_t bytes_read_at_start_of_buffer_on_current_row = 0;
size_t bytes_read_at_start_of_buffer_on_prev_row = 0;
/// Positions inside the read buffer where the current and the previous row begin.
char * pos_of_current_row = nullptr;
char * pos_of_prev_row = nullptr;
/// Shifts the "current row" markers above into the "previous row" ones and re-samples them.
void updateDiagnosticInfo();
/// Re-parses one row, writing a per-column report to `out`.
/// Returns false as soon as an error in the row has been reported, true if the row parsed cleanly.
/// The two max_length_* arguments are used to pad the report into aligned columns.
bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name);
};
}

View File

@ -2,6 +2,3 @@ set(SRCS )
add_executable (tab_separated_streams tab_separated_streams.cpp ${SRCS}) add_executable (tab_separated_streams tab_separated_streams.cpp ${SRCS})
target_link_libraries (tab_separated_streams PRIVATE dbms) target_link_libraries (tab_separated_streams PRIVATE dbms)
add_executable (block_row_transforms block_row_transforms.cpp ${SRCS})
target_link_libraries (block_row_transforms PRIVATE dbms)

View File

@ -1,57 +0,0 @@
#include <string>
#include <iostream>
#include <fstream>
#include <Core/Block.h>
#include <Core/ColumnWithTypeAndName.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/WriteBufferFromFile.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h>
#include <Formats/TabSeparatedRowInputStream.h>
#include <Formats/BlockInputStreamFromRowInputStream.h>
#include <DataStreams/copyData.h>
#include <Processors/Formats/Impl/TabSeparatedRowOutputFormat.h>
#include <Processors/Formats/OutputStreamToOutputFormat.h>
int main(int, char **)
try
{
using namespace DB;
Block sample;
ColumnWithTypeAndName col1;
col1.name = "col1";
col1.type = std::make_shared<DataTypeUInt64>();
col1.column = col1.type->createColumn();
sample.insert(col1);
ColumnWithTypeAndName col2;
col2.name = "col2";
col2.type = std::make_shared<DataTypeString>();
col2.column = col2.type->createColumn();
sample.insert(col2);
ReadBufferFromFile in_buf("test_in");
WriteBufferFromFile out_buf("test_out");
FormatSettings format_settings;
RowInputStreamPtr row_input = std::make_shared<TabSeparatedRowInputStream>(in_buf, sample, false, false, format_settings);
BlockInputStreamFromRowInputStream block_input(row_input, sample, DEFAULT_INSERT_BLOCK_SIZE, 0, []{}, format_settings);
BlockOutputStreamPtr block_output = std::make_shared<OutputStreamToOutputFormat>(std::make_shared<TabSeparatedRowOutputFormat>(out_buf, sample, false, false, []{}, format_settings));
copyData(block_input, *block_output);
}
catch (const DB::Exception & e)
{
std::cerr << e.what() << ", " << e.displayText() << std::endl;
return 1;
}

View File

@ -9,12 +9,12 @@
#include <DataTypes/DataTypesNumber.h> #include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h> #include <DataTypes/DataTypeString.h>
#include <Formats/TabSeparatedRowInputStream.h> #include <Processors/Formats/Impl/TabSeparatedRowInputFormat.h>
#include <Formats/BlockInputStreamFromRowInputStream.h>
#include <DataStreams/copyData.h> #include <DataStreams/copyData.h>
#include <Processors/Formats/OutputStreamToOutputFormat.h> #include <Processors/Formats/OutputStreamToOutputFormat.h>
#include <Processors/Formats/Impl/TabSeparatedRowOutputFormat.h> #include <Processors/Formats/Impl/TabSeparatedRowOutputFormat.h>
#include <Processors/Formats/InputStreamFromInputFormat.h>
using namespace DB; using namespace DB;
@ -39,13 +39,15 @@ try
FormatSettings format_settings; FormatSettings format_settings;
RowInputStreamPtr row_input = std::make_shared<TabSeparatedRowInputStream>(in_buf, sample, false, false, format_settings); RowInputFormatParams params{DEFAULT_INSERT_BLOCK_SIZE, 0, 0, 0, []{}};
BlockInputStreamFromRowInputStream block_input(row_input, sample, DEFAULT_INSERT_BLOCK_SIZE, 0, []{}, format_settings);
InputFormatPtr input_format = std::make_shared<TabSeparatedRowInputFormat>(sample, in_buf, params, false, false, format_settings);
BlockInputStreamPtr block_input = std::make_shared<InputStreamFromInputFormat>(std::move(input_format));
BlockOutputStreamPtr block_output = std::make_shared<OutputStreamToOutputFormat>( BlockOutputStreamPtr block_output = std::make_shared<OutputStreamToOutputFormat>(
std::make_shared<TabSeparatedRowOutputFormat>(out_buf, sample, false, false, [] {}, format_settings)); std::make_shared<TabSeparatedRowOutputFormat>(out_buf, sample, false, false, [] {}, format_settings));
copyData(block_input, *block_output); copyData(*block_input, *block_output);
return 0; return 0;
} }
catch (...) catch (...)

View File

@ -332,7 +332,7 @@ UInt64 geohashesInBox(const GeohashesInBoxPreparedArgs & args, char * out)
} }
} }
if (items == 0 && args.items_count != 0) if (items == 0)
{ {
size_t l = geohashEncodeImpl(args.longitude_min, args.latitude_min, args.precision, out); size_t l = geohashEncodeImpl(args.longitude_min, args.latitude_min, args.precision, out);
out += l; out += l;

View File

@ -336,7 +336,9 @@ void FunctionArrayEnumerateRankedExtended<Derived>::executeMethodImpl(
/// Skipping offsets if no data in this array /// Skipping offsets if no data in this array
if (prev_off == off) if (prev_off == off)
{ {
want_clear = true;
if (depth_to_look > 2)
want_clear = true;
if (depth_to_look >= 2) if (depth_to_look >= 2)
{ {

View File

@ -0,0 +1,316 @@
#include <IO/PeekableReadBuffer.h>
namespace DB
{
/// Wraps `sub_buf_`. `start_size_` is forwarded to BufferWithOwnMemory, `unread_limit_` caps the size
/// of own memory. Initially the working buffer is the sub-buffer's one, at the same read offset.
PeekableReadBuffer::PeekableReadBuffer(ReadBuffer & sub_buf_, size_t start_size_ /*= DBMS_DEFAULT_BUFFER_SIZE*/,
                                       size_t unread_limit_ /* = default_limit*/)
    : BufferWithOwnMemory(start_size_), sub_buf(sub_buf_), unread_limit(unread_limit_)
{
    /// We can be considered padded only if the sub-buffer is padded too.
    padded &= sub_buf.isPadded();

    /// Read from sub-buffer
    BufferBase::set(sub_buf.buffer().begin(), sub_buf.buffer().size(), sub_buf.offset());

    checkStateCorrect();
}
/// Preserves the not-yet-consumed part of the sub-buffer in own memory and then asks the sub-buffer
/// to load its next portion of data.
/// If nothing has to be preserved (no data in own memory and nothing left after the checkpoint/position),
/// no copying happens and the sub-buffer is simply advanced.
/// May throw from resizeOwnMemoryIfNecessary() when the unread limit would be exceeded.
/// @returns the result of sub_buf.next().
bool PeekableReadBuffer::peekNext()
{
checkStateCorrect();
/// Number of bytes of the sub-buffer located before the point we start copying from.
size_t bytes_read = 0;
Position copy_from = pos;
size_t bytes_to_copy = sub_buf.available();
if (useSubbufferOnly())
{
/// Don't have to copy all data from sub-buffer if there is no data in own memory (checkpoint and pos are in sub-buffer)
if (checkpoint)
copy_from = checkpoint;
bytes_read = copy_from - sub_buf.buffer().begin();
bytes_to_copy = sub_buf.buffer().end() - copy_from; /// sub_buf.available();
if (!bytes_to_copy)
{
bytes += bytes_read;
sub_buf.position() = copy_from;
/// Both checkpoint and pos are at the end of sub-buffer. Just load next part of data.
bool res = sub_buf.next();
BufferBase::set(sub_buf.buffer().begin(), sub_buf.buffer().size(), sub_buf.offset());
/// Keep the checkpoint valid: it now refers to the start position in the freshly loaded data.
if (checkpoint)
checkpoint = pos;
checkStateCorrect();
return res;
}
}
/// May throw an exception
resizeOwnMemoryIfNecessary(bytes_to_copy);
if (useSubbufferOnly())
{
bytes += bytes_read;
sub_buf.position() = copy_from;
}
/// Save unread data from sub-buffer to own memory
memcpy(memory.data() + peeked_size, sub_buf.position(), bytes_to_copy);
/// If useSubbufferOnly() is false, then checkpoint is in own memory and it was updated in resizeOwnMemoryIfNecessary
/// Otherwise, checkpoint now at the beginning of own memory
if (checkpoint && useSubbufferOnly())
{
checkpoint = memory.data();
checkpoint_in_own_memory = true;
}
if (currentlyReadFromOwnMemory())
{
/// Update buffer size
BufferBase::set(memory.data(), peeked_size + bytes_to_copy, offset());
}
else
{
/// Switch to reading from own memory
size_t pos_offset = peeked_size + this->offset();
if (useSubbufferOnly())
{
/// Own memory was empty before this call, so it now holds exactly the copied bytes.
/// NOTE(review): with a checkpoint the position is placed at the end of the copied data,
/// without one at its beginning - confirm this is the intended asymmetry.
if (checkpoint)
pos_offset = bytes_to_copy;
else
pos_offset = 0;
}
BufferBase::set(memory.data(), peeked_size + bytes_to_copy, pos_offset);
}
/// Account for the copied bytes and mark them as consumed in the sub-buffer.
peeked_size += bytes_to_copy;
sub_buf.position() += bytes_to_copy;
checkStateCorrect();
return sub_buf.next();
}
void PeekableReadBuffer::setCheckpoint()
{
checkStateCorrect();
#ifndef NDEBUG
if (checkpoint)
throw DB::Exception("Does not support recursive checkpoints.", ErrorCodes::LOGICAL_ERROR);
#endif
checkpoint_in_own_memory = currentlyReadFromOwnMemory();
if (!checkpoint_in_own_memory)
{
/// Don't need to store unread data anymore
peeked_size = 0;
}
checkpoint = pos;
checkStateCorrect();
}
/// Forgets the current checkpoint. The read position is left where it is.
/// The presence of a checkpoint is verified in debug builds only.
void PeekableReadBuffer::dropCheckpoint()
{
    checkStateCorrect();
#ifndef NDEBUG
    if (checkpoint == nullptr)
        throw DB::Exception("There is no checkpoint", ErrorCodes::LOGICAL_ERROR);
#endif

    checkpoint = nullptr;
    checkpoint_in_own_memory = false;

    /// Don't need to store unread data anymore
    if (!currentlyReadFromOwnMemory())
        peeked_size = 0;

    checkStateCorrect();
}
/// Moves the read position back to the checkpoint. The checkpoint itself stays set.
/// Throws LOGICAL_ERROR if there is no checkpoint.
void PeekableReadBuffer::rollbackToCheckpoint()
{
    checkStateCorrect();

    if (!checkpoint)
        throw DB::Exception("There is no checkpoint", ErrorCodes::LOGICAL_ERROR);

    if (checkpointInOwnMemory() != currentlyReadFromOwnMemory())
    {
        /// Checkpoint is in own memory and pos is not. Switch to reading from own memory
        BufferBase::set(memory.data(), peeked_size, checkpoint - memory.data());
    }
    else
    {
        /// Checkpoint and position live in the same memory region: just move the pointer.
        pos = checkpoint;
    }

    checkStateCorrect();
}
/// Provides the next portion of data. In every case the working buffer ends up being
/// the sub-buffer's working buffer with the position at its beginning.
bool PeekableReadBuffer::nextImpl()
{
    /// FIXME wrong bytes count because it can read the same data again after rollbackToCheckpoint()
    /// However, changing bytes count on every call of next() (even after rollback) allows to determine if some pointers were invalidated.
    checkStateCorrect();

    bool has_data;
    if (checkpoint)
    {
        /// Data after the checkpoint must stay available:
        /// if we are not in own memory yet, peekNext() saves the sub-buffer contents before loading more.
        if (currentlyReadFromOwnMemory())
            has_data = sub_buf.hasPendingData() || sub_buf.next();
        else
            has_data = peekNext();
    }
    else if (useSubbufferOnly())
    {
        /// Load next data to sub_buf
        sub_buf.position() = pos;
        has_data = sub_buf.next();
    }
    else
    {
        /// All copied data have been read from own memory, continue reading from sub_buf
        peeked_size = 0;
        has_data = sub_buf.hasPendingData() || sub_buf.next();
    }

    /// Switch to reading from sub_buf (or just update it if already switched)
    BufferBase::set(sub_buf.buffer().begin(), sub_buf.buffer().size(), 0);

    checkStateCorrect();
    return has_data;
}
/// True when own memory holds no peeked data.
bool PeekableReadBuffer::useSubbufferOnly() const
{
    return peeked_size == 0;
}
/// Debug-only consistency check of the relations between the checkpoint, the read position,
/// the amount of peeked data and the size of own memory.
/// Throws LOGICAL_ERROR on the first violated invariant; compiles to nothing when NDEBUG is defined.
void PeekableReadBuffer::checkStateCorrect() const
{
#ifndef NDEBUG
    const auto fail = [](const char * message)
    {
        throw DB::Exception(message, ErrorCodes::LOGICAL_ERROR);
    };

    const bool reading_own_memory = currentlyReadFromOwnMemory();
    const bool own_memory_empty = (peeked_size == 0);

    if (!checkpoint)
    {
        if (!reading_own_memory && !own_memory_empty)
            fail("Own buffer is not empty");
    }
    else if (checkpointInOwnMemory())
    {
        if (own_memory_empty)
            fail("Checkpoint in empty own buffer");
        if (reading_own_memory && pos < checkpoint)
            fail("Current position in own buffer before checkpoint in own buffer");
    }
    else
    {
        if (!own_memory_empty)
            fail("Own buffer is not empty");
        if (reading_own_memory)
            fail("Current position in own buffer before checkpoint in subbuffer");
        if (pos < checkpoint)
            fail("Current position in subbuffer before checkpoint in subbuffer");
    }

    /// Checks that do not depend on the checkpoint.
    if (reading_own_memory && own_memory_empty)
        fail("Pos in empty own buffer");
    if (unread_limit < memory.size())
        fail("Size limit exceed");
#endif
}
/// Makes sure that `bytes_to_append` more bytes fit into own memory after the `peeked_size` bytes already there.
/// Either shifts the still-needed data to the beginning of own memory (dropping the already consumed prefix),
/// or grows own memory (doubling it, bounded by `unread_limit`).
/// Keeps `checkpoint` and `pos` valid if they point into own memory.
/// Throws MEMORY_LIMIT_EXCEEDED if the required size is greater than `unread_limit`.
/// @returns offset of the needed data in own memory (0 if the data was shifted to the beginning).
size_t PeekableReadBuffer::resizeOwnMemoryIfNecessary(size_t bytes_to_append)
{
    checkStateCorrect();
    const bool need_update_checkpoint = checkpointInOwnMemory();
    const bool need_update_pos = currentlyReadFromOwnMemory();

    /// Everything before this offset in own memory is not needed anymore.
    size_t offset = 0;
    if (need_update_checkpoint)
        offset = checkpoint - memory.data();
    else if (need_update_pos)
        offset = this->offset();

    const size_t new_size = peeked_size + bytes_to_append;
    if (memory.size() < new_size)
    {
        if (bytes_to_append < offset && 2 * (peeked_size - offset) <= memory.size())
        {
            /// Move unread data to the beginning of own memory instead of resize own memory
            peeked_size -= offset;
            memmove(memory.data(), memory.data() + offset, peeked_size);
            bytes += offset;

            if (need_update_checkpoint)
                checkpoint -= offset;
            if (need_update_pos)
                pos -= offset;

            checkStateCorrect();
            return 0;
        }
        else
        {
            if (unread_limit < new_size)
                throw DB::Exception("PeekableReadBuffer: Memory limit exceed", ErrorCodes::MEMORY_LIMIT_EXCEEDED);

            /// Must be taken before resize() invalidates `pos`.
            /// Only computed when `pos` really points into own memory: subtracting pointers
            /// that belong to different allocations is undefined behaviour.
            const size_t pos_offset = need_update_pos ? static_cast<size_t>(pos - memory.data()) : 0;

            size_t new_size_amortized = memory.size() * 2;
            if (new_size_amortized < new_size)
                new_size_amortized = new_size;
            else if (unread_limit < new_size_amortized)
                new_size_amortized = unread_limit;
            memory.resize(new_size_amortized);

            /// Own memory may have been reallocated: re-point everything that referred to it.
            if (need_update_checkpoint)
                checkpoint = memory.data() + offset;
            if (need_update_pos)
                BufferBase::set(memory.data(), peeked_size, pos_offset);
        }
    }

    checkStateCorrect();
    return offset;
}
/// If we are still reading directly from the sub-buffer's memory, hands the current read position
/// back to the sub-buffer. When the position is in own memory nothing is written back here.
PeekableReadBuffer::~PeekableReadBuffer()
{
if (!currentlyReadFromOwnMemory())
sub_buf.position() = pos;
}
/// Extracts the data that is stored in own memory but has not been read yet into a separate buffer.
/// After the call own memory is considered empty, the checkpoint (if any) is forgotten
/// and reading continues from the sub-buffer.
/// Returns a zero-sized buffer if the position is not in own memory.
std::shared_ptr<BufferWithOwnMemory<ReadBuffer>> PeekableReadBuffer::takeUnreadData()
{
    using OwnedBuffer = BufferWithOwnMemory<ReadBuffer>;

    checkStateCorrect();
    if (!currentlyReadFromOwnMemory())
        return std::make_shared<OwnedBuffer>(0);

    /// Copy everything between the current position and the end of peeked data.
    const size_t remaining = memory.data() + peeked_size - pos;
    auto result = std::make_shared<OwnedBuffer>(remaining);
    memcpy(result->buffer().begin(), pos, remaining);
    result->BufferBase::set(result->buffer().begin(), remaining, 0);

    /// Reset own state and switch back to the sub-buffer.
    peeked_size = 0;
    checkpoint = nullptr;
    checkpoint_in_own_memory = false;
    BufferBase::set(sub_buf.buffer().begin(), sub_buf.buffer().size(), sub_buf.offset());

    checkStateCorrect();
    return result;
}
/// The position is considered to be in own memory when our working buffer
/// does not start where the sub-buffer's working buffer starts.
bool PeekableReadBuffer::currentlyReadFromOwnMemory() const
{
    return sub_buf.buffer().begin() != working_buffer.begin();
}
/// Returns the stored flag telling whether the checkpoint pointer refers to own memory
/// (as opposed to the sub-buffer). The flag is false when there is no checkpoint.
bool PeekableReadBuffer::checkpointInOwnMemory() const
{
return checkpoint_in_own_memory;
}
/// Throws LOGICAL_ERROR if own memory still contains data that has not been read from this buffer:
/// such data was already extracted from the sub-buffer and would be lost on destruction.
/// Use takeUnreadData() beforehand to avoid that.
void PeekableReadBuffer::assertCanBeDestructed() const
{
    if (peeked_size && pos != memory.data() + peeked_size)
        throw DB::Exception("There are data, which were extracted from sub-buffer, but not from peekable buffer. "
                            "Cannot destruct peekable buffer correctly because the data will be lost. "
                            "Most likely it's a bug.", ErrorCodes::LOGICAL_ERROR);
}
}

View File

@ -0,0 +1,96 @@
#pragma once
#include <IO/ReadBuffer.h>
#include <IO/BufferWithOwnMemory.h>
namespace DB
{
namespace ErrorCodes
{
extern const int MEMORY_LIMIT_EXCEEDED;
extern const int LOGICAL_ERROR;
}
/// Allows to peek next part of data from sub-buffer without extracting it.
/// Also allows to set checkpoint at some position in stream and come back to this position later,
/// even if next() was called.
/// Sub-buffer should not be accessed directly during the lifetime of peekable buffer.
/// If position() of peekable buffer is explicitly set to some position before checkpoint
/// (e.g. by istr.position() = prev_pos), behavior is undefined.
class PeekableReadBuffer : public BufferWithOwnMemory<ReadBuffer>
{
/// The RAII guard below reads the `checkpoint` member directly.
friend class PeekableReadBufferCheckpoint;
public:
/// start_size_ is the initial size of own memory, unread_limit_ is the maximum size it may grow to.
explicit PeekableReadBuffer(ReadBuffer & sub_buf_, size_t start_size_ = DBMS_DEFAULT_BUFFER_SIZE,
size_t unread_limit_ = 16 * DBMS_DEFAULT_BUFFER_SIZE);
/// Use takeUnreadData() to extract unread data before destruct object
~PeekableReadBuffer() override;
/// Saves unread data to own memory, so it will be possible to read it later. Loads next data to sub-buffer.
/// Doesn't change checkpoint and position in stream,
/// but all pointers (such as this->buffer().end() and this->position()) may be invalidated
/// @returns false in case of EOF in sub-buffer, otherwise returns true
bool peekNext();
/// The sub-buffer's current working buffer, i.e. the data loaded by the last peekNext().
Buffer & lastPeeked() { return sub_buf.buffer(); }
/// Sets checkpoint at current position
void setCheckpoint();
/// Forget checkpoint and all data between checkpoint and position
void dropCheckpoint();
/// Sets position at checkpoint.
/// All pointers (such as this->buffer().end()) may be invalidated
void rollbackToCheckpoint();
/// If position is in own memory, returns buffer with data, which were extracted from sub-buffer,
/// but not from this buffer, so the data will not be lost after destruction of this buffer.
/// If position is in sub-buffer, returns empty buffer.
std::shared_ptr<BufferWithOwnMemory<ReadBuffer>> takeUnreadData();
/// Throws LOGICAL_ERROR if own memory still holds data that was not read from this buffer.
void assertCanBeDestructed() const;
private:
bool nextImpl() override;
/// True when own memory holds no data (peeked_size == 0).
inline bool useSubbufferOnly() const;
/// True when the working buffer is own memory rather than the sub-buffer's memory.
inline bool currentlyReadFromOwnMemory() const;
inline bool checkpointInOwnMemory() const;
/// Consistency checks of the internal state; active only when NDEBUG is not defined.
void checkStateCorrect() const;
/// Makes possible to append `bytes_to_append` bytes to data in own memory.
/// Updates all invalidated pointers and sizes.
/// @returns new offset of unread data in own memory
size_t resizeOwnMemoryIfNecessary(size_t bytes_to_append);
/// The wrapped buffer (not owned).
ReadBuffer & sub_buf;
/// Upper bound for the size of own memory.
const size_t unread_limit;
/// Number of bytes of own memory that currently hold data copied from the sub-buffer.
size_t peeked_size = 0;
/// nullptr when no checkpoint is set.
Position checkpoint = nullptr;
/// Whether `checkpoint` points into own memory (otherwise into the sub-buffer's memory).
bool checkpoint_in_own_memory = false;
};
/// Scope guard for a checkpoint: sets a checkpoint on construction and drops it on destruction,
/// optionally rolling the buffer back to the checkpoint first (`auto_rollback_`).
/// If the checkpoint has already been dropped by someone else, the destructor does nothing.
class PeekableReadBufferCheckpoint : boost::noncopyable
{
    PeekableReadBuffer & buf;
    bool auto_rollback;

public:
    explicit PeekableReadBufferCheckpoint(PeekableReadBuffer & buf_, bool auto_rollback_ = false)
        : buf(buf_), auto_rollback(auto_rollback_)
    {
        buf.setCheckpoint();
    }

    ~PeekableReadBufferCheckpoint()
    {
        if (buf.checkpoint)
        {
            if (auto_rollback)
                buf.rollbackToCheckpoint();
            buf.dropCheckpoint();
        }
    }
};
}

View File

@ -0,0 +1,131 @@
#include <gtest/gtest.h>
#include <Core/Types.h>
#include <IO/ReadHelpers.h>
#include <IO/ReadBufferFromString.h>
#include <IO/ConcatReadBuffer.h>
#include <IO/PeekableReadBuffer.h>
/// Reads exactly strlen(str) bytes from `buf` and asserts that they are equal to `str`.
void readAndAssert(DB::ReadBuffer & buf, const char * str)
{
    size_t n = strlen(str);
    /// std::string instead of `char tmp[n]`: variable-length arrays are a compiler extension, not ISO C++.
    std::string tmp(n, '\0');
    buf.readStrict(tmp.data(), n);
    ASSERT_EQ(strncmp(tmp.data(), str, n), 0);
}
/// Asserts that the data currently available in `buf` without calling next() is exactly `str`:
/// same length and same bytes. Does not consume anything.
void assertAvailable(DB::ReadBuffer & buf, const char * str)
{
    const size_t expected_size = strlen(str);

    ASSERT_EQ(buf.available(), expected_size);
    ASSERT_EQ(strncmp(buf.position(), str, expected_size), 0);
}
/// Scenario test for checkpoints, peeking and the unread-data limit of PeekableReadBuffer.
/// The input is four 10-byte strings glued together by ConcatReadBuffer, so every next()
/// on the sub-buffer switches to the following string.
TEST(PeekableReadBuffer, CheckpointsWorkCorrectly)
try
{
std::string s1 = "0123456789";
std::string s2 = "qwertyuiop";
std::string s3 = "asdfghjkl;";
std::string s4 = "zxcvbnm,./";
DB::ReadBufferFromString b1(s1);
DB::ReadBufferFromString b2(s2);
DB::ReadBufferFromString b3(s3);
DB::ReadBufferFromString b4(s4);
DB::ConcatReadBuffer concat({&b1, &b2, &b3, &b4});
/// start_size_ = 0, unread_limit_ = 16: own memory starts empty and may not grow beyond 16 bytes.
DB::PeekableReadBuffer peekable(concat, 0, 16);
ASSERT_TRUE(!peekable.eof());
assertAvailable(peekable, "0123456789");
{
/// Guard without auto-rollback: on scope exit the checkpoint is only dropped, the position is kept.
DB::PeekableReadBufferCheckpoint checkpoint{peekable};
readAndAssert(peekable, "01234");
}
/// No checkpoint is set at this point, so rollback must fail with LOGICAL_ERROR.
bool exception = false;
try
{
peekable.rollbackToCheckpoint();
}
catch (DB::Exception & e)
{
if (e.code() != DB::ErrorCodes::LOGICAL_ERROR)
throw;
exception = true;
}
ASSERT_TRUE(exception);
assertAvailable(peekable, "56789");
readAndAssert(peekable, "56");
/// Checkpoint, read across the boundary between the first two strings, then return to the checkpoint.
peekable.setCheckpoint();
readAndAssert(peekable, "789qwertyu");
peekable.rollbackToCheckpoint();
peekable.dropCheckpoint();
assertAvailable(peekable, "789");
/// Peeking appends the next portion to the available data; the sub-buffer has moved on to the third string.
peekable.peekNext();
assertAvailable(peekable, "789qwertyuiop");
ASSERT_EQ(peekable.lastPeeked().size(), 10);
ASSERT_EQ(strncmp(peekable.lastPeeked().begin(), "asdfghjkl;", 10), 0);
/// Skipping 30 bytes under a checkpoint is expected to hit the 16-byte limit (MEMORY_LIMIT_EXCEEDED).
/// The guard is created with auto-rollback, so the state checked below must be unchanged.
exception = false;
try
{
DB::PeekableReadBufferCheckpoint checkpoint{peekable, true};
peekable.ignore(30);
}
catch (DB::Exception & e)
{
if (e.code() != DB::ErrorCodes::MEMORY_LIMIT_EXCEEDED)
throw;
exception = true;
}
ASSERT_TRUE(exception);
assertAvailable(peekable, "789qwertyuiop");
ASSERT_EQ(peekable.lastPeeked().size(), 10);
ASSERT_EQ(strncmp(peekable.lastPeeked().begin(), "asdfghjkl;", 10), 0);
readAndAssert(peekable, "789qwertyu");
peekable.setCheckpoint();
readAndAssert(peekable, "iopasdfghj");
assertAvailable(peekable, "kl;");
peekable.dropCheckpoint();
/// Read up to EOF under a checkpoint; eof() is called repeatedly and must keep returning true.
peekable.setCheckpoint();
readAndAssert(peekable, "kl;zxcvbnm,./");
ASSERT_TRUE(peekable.eof());
ASSERT_TRUE(peekable.eof());
ASSERT_TRUE(peekable.eof());
/// Rollback works after EOF as well; leave the last three bytes (",./") unread.
peekable.rollbackToCheckpoint();
readAndAssert(peekable, "kl;zxcvbnm");
peekable.dropCheckpoint();
/// With unread data left, assertCanBeDestructed() is expected to throw LOGICAL_ERROR.
exception = false;
try
{
peekable.assertCanBeDestructed();
}
catch (DB::Exception & e)
{
if (e.code() != DB::ErrorCodes::LOGICAL_ERROR)
throw;
exception = true;
}
ASSERT_TRUE(exception);
/// Taking the unread data out leaves the peekable buffer at EOF and makes it safe to destruct.
auto buf_ptr = peekable.takeUnreadData();
ASSERT_TRUE(peekable.eof());
ASSERT_TRUE(peekable.eof());
ASSERT_TRUE(peekable.eof());
readAndAssert(*buf_ptr, ",./");
ASSERT_TRUE(buf_ptr->eof());
peekable.assertCanBeDestructed();
}
catch (const DB::Exception & e)
{
/// Print the details of an unexpected exception and rethrow it.
std::cerr << e.what() << ", " << e.displayText() << std::endl;
throw;
}

View File

@ -1,13 +1,18 @@
#include <Interpreters/AnalyzedJoin.h> #include <Interpreters/AnalyzedJoin.h>
#include <Interpreters/DatabaseAndTableWithAlias.h> #include <Interpreters/DatabaseAndTableWithAlias.h>
#include <Interpreters/InterpreterSelectWithUnionQuery.h> #include <Interpreters/InterpreterSelectWithUnionQuery.h>
#include <Interpreters/Join.h>
#include <Parsers/ASTExpressionList.h> #include <Parsers/ASTExpressionList.h>
#include <Parsers/ASTTablesInSelectQuery.h> #include <Parsers/ASTTablesInSelectQuery.h>
#include <Parsers/ASTSelectQuery.h> #include <Parsers/ASTSelectQuery.h>
#include <Core/Block.h>
#include <Storages/IStorage.h> #include <Storages/IStorage.h>
#include <DataTypes/DataTypeNullable.h>
namespace DB namespace DB
{ {
@ -26,7 +31,6 @@ void AnalyzedJoin::addUsingKey(const ASTPtr & ast)
void AnalyzedJoin::addOnKeys(ASTPtr & left_table_ast, ASTPtr & right_table_ast) void AnalyzedJoin::addOnKeys(ASTPtr & left_table_ast, ASTPtr & right_table_ast)
{ {
with_using = false;
key_names_left.push_back(left_table_ast->getColumnName()); key_names_left.push_back(left_table_ast->getColumnName());
key_names_right.push_back(right_table_ast->getAliasOrColumnName()); key_names_right.push_back(right_table_ast->getAliasOrColumnName());
@ -37,7 +41,7 @@ void AnalyzedJoin::addOnKeys(ASTPtr & left_table_ast, ASTPtr & right_table_ast)
/// @return how many times right key appears in ON section. /// @return how many times right key appears in ON section.
size_t AnalyzedJoin::rightKeyInclusion(const String & name) const size_t AnalyzedJoin::rightKeyInclusion(const String & name) const
{ {
if (with_using) if (hasUsing())
return 0; return 0;
size_t count = 0; size_t count = 0;
@ -101,6 +105,120 @@ std::unordered_map<String, String> AnalyzedJoin::getOriginalColumnsMap(const Nam
return out; return out;
} }
/// Builds a new expression list whose children are the left-side key ASTs.
ASTPtr AnalyzedJoin::leftKeysList() const
{
    auto left_keys = std::make_shared<ASTExpressionList>();
    left_keys->children = key_asts_left;
    return left_keys;
}
/// Builds a new expression list with the right-side key ASTs.
/// The list is left empty unless the join uses the ON syntax.
ASTPtr AnalyzedJoin::rightKeysList() const
{
    auto right_keys = std::make_shared<ASTExpressionList>();
    if (hasOn())
    {
        right_keys->children = key_asts_right;
    }
    return right_keys;
}
/// Returns the names of the right-side keys together with the names of the columns added by the join,
/// without duplicates (they are collected through a NameSet).
Names AnalyzedJoin::requiredJoinedNames() const
{
    NameSet unique_names(key_names_right.begin(), key_names_right.end());

    for (const auto & added_column : columns_added_by_join)
    {
        unique_names.insert(added_column.name);
    }

    return Names(unique_names.begin(), unique_names.end());
}
void AnalyzedJoin::appendRequiredColumns(const Block & sample, NameSet & required_columns) const
{
for (auto & column : key_names_right)
if (!sample.has(column))
required_columns.insert(column);
for (auto & column : columns_added_by_join)
if (!sample.has(column.name))
required_columns.insert(column.name);
}
/// Appends `joined_column` to the list of columns added by the join.
/// With join_use_nulls and a LEFT or FULL join the type is wrapped into Nullable
/// when the type allows that.
void AnalyzedJoin::addJoinedColumn(const NameAndTypePair & joined_column)
{
    const bool wrap_in_nullable = join_use_nulls && isLeftOrFull(table_join.kind);
    if (!wrap_in_nullable)
    {
        columns_added_by_join.push_back(joined_column);
        return;
    }

    auto type = joined_column.type->canBeInsideNullable() ? makeNullable(joined_column.type) : joined_column.type;
    columns_added_by_join.emplace_back(NameAndTypePair(joined_column.name, std::move(type)));
}
/// Turns `sample_block` (header of the left side) into the header of the join result:
/// drops column data from the existing columns, corrects their nullability,
/// and appends the columns added by the join with corrected nullability.
void AnalyzedJoin::addJoinedColumnsAndCorrectNullability(Block & sample_block) const
{
    const bool right_or_full_join = isRightOrFull(table_join.kind);
    const bool left_or_full_join = isLeftOrFull(table_join.kind);

    /// Existing columns become Nullable for RIGHT/FULL joins, added ones for LEFT/FULL joins.
    const bool existing_become_nullable = join_use_nulls && right_or_full_join;
    const bool added_become_nullable = join_use_nulls && left_or_full_join;

    for (auto & existing : sample_block)
    {
        /// Materialize column.
        /// Column is not empty if it is constant, but after Join all constants will be materialized.
        /// So, we need remove constants from header.
        if (existing.column)
            existing.column = nullptr;

        if (existing_become_nullable && existing.type->canBeInsideNullable())
            existing.type = makeNullable(existing.type);
    }

    for (const auto & added : columns_added_by_join)
    {
        bool make_nullable = added_become_nullable;

        if (!make_nullable)
        {
            /// Keys from right table are usually not stored in Join, but copied from the left one.
            /// So, if left key is nullable, let's make right key nullable too.
            /// Note: for some join types it's not needed and, probably, may be removed.
            /// Note: changing this code, take into account the implementation in Join.cpp.
            const auto key_it = std::find(key_names_right.begin(), key_names_right.end(), added.name);
            if (key_it != key_names_right.end())
            {
                const auto & left_key_name = key_names_left[key_it - key_names_right.begin()];
                make_nullable = sample_block.getByName(left_key_name).type->isNullable();
            }
        }

        auto res_type = (make_nullable && added.type->canBeInsideNullable()) ? makeNullable(added.type) : added.type;
        sample_block.insert(ColumnWithTypeAndName(nullptr, res_type, added.name));
    }
}
bool AnalyzedJoin::sameJoin(const AnalyzedJoin * x, const AnalyzedJoin * y)
{
if (!x && !y)
return true;
if (!x || !y)
return false;
return x->table_join.kind == y->table_join.kind
&& x->table_join.strictness == y->table_join.strictness
&& x->key_names_left == y->key_names_left
&& x->key_names_right == y->key_names_right
&& x->columns_added_by_join == y->columns_added_by_join;
}
/// Creates a Join object from the analyzed parameters (right-side keys, join_use_nulls, kind, strictness)
/// with the given size limits, and passes `sample_block` to it via setSampleBlock().
JoinPtr AnalyzedJoin::makeHashJoin(const Block & sample_block, const SizeLimits & size_limits_for_join) const
{
    auto hash_join = std::make_shared<Join>(
        key_names_right, join_use_nulls, size_limits_for_join, table_join.kind, table_join.strictness);

    hash_join->setSampleBlock(sample_block);
    return hash_join;
}
NamesAndTypesList getNamesAndTypeListFromTableExpression(const ASTTableExpression & table_expression, const Context & context) NamesAndTypesList getNamesAndTypeListFromTableExpression(const ASTTableExpression & table_expression, const Context & context)
{ {
NamesAndTypesList names_and_type_list; NamesAndTypesList names_and_type_list;

View File

@ -2,7 +2,8 @@
#include <Core/Names.h> #include <Core/Names.h>
#include <Core/NamesAndTypes.h> #include <Core/NamesAndTypes.h>
#include <Parsers/IAST.h> #include <Core/SettingsCommon.h>
#include <Parsers/ASTTablesInSelectQuery.h>
#include <utility> #include <utility>
#include <memory> #include <memory>
@ -13,6 +14,10 @@ namespace DB
class Context; class Context;
class ASTSelectQuery; class ASTSelectQuery;
struct DatabaseAndTableWithAlias; struct DatabaseAndTableWithAlias;
class Block;
class Join;
using JoinPtr = std::shared_ptr<Join>;
struct AnalyzedJoin struct AnalyzedJoin
{ {
@ -30,18 +35,19 @@ struct AnalyzedJoin
private: private:
friend class SyntaxAnalyzer; friend class SyntaxAnalyzer;
friend struct SyntaxAnalyzerResult;
friend class ExpressionAnalyzer;
friend class SelectQueryExpressionAnalyzer;
Names key_names_left; Names key_names_left;
Names key_names_right; /// Duplicating names are qualified. Names key_names_right; /// Duplicating names are qualified.
ASTs key_asts_left; ASTs key_asts_left;
ASTs key_asts_right; ASTs key_asts_right;
bool with_using = true; ASTTableJoin table_join;
bool join_use_nulls = false;
/// All columns which can be read from joined table. Duplicating names are qualified. /// All columns which can be read from joined table. Duplicating names are qualified.
NamesAndTypesList columns_from_joined_table; NamesAndTypesList columns_from_joined_table;
/// Columns will be added to block by JOIN. It's a subset of columns_from_joined_table with corrected Nullability
NamesAndTypesList columns_added_by_join;
/// Name -> original name. Names are the same as in columns_from_joined_table list. /// Name -> original name. Names are the same as in columns_from_joined_table list.
std::unordered_map<String, String> original_names; std::unordered_map<String, String> original_names;
/// Original name -> name. Only renamed columns. /// Original name -> name. Only renamed columns.
@ -51,8 +57,8 @@ public:
void addUsingKey(const ASTPtr & ast); void addUsingKey(const ASTPtr & ast);
void addOnKeys(ASTPtr & left_table_ast, ASTPtr & right_table_ast); void addOnKeys(ASTPtr & left_table_ast, ASTPtr & right_table_ast);
bool hasUsing() const { return with_using; } bool hasUsing() const { return table_join.using_expression_list != nullptr; }
bool hasOn() const { return !with_using; } bool hasOn() const { return !hasUsing(); }
NameSet getQualifiedColumnsSet() const; NameSet getQualifiedColumnsSet() const;
NameSet getOriginalColumnsSet() const; NameSet getOriginalColumnsSet() const;
@ -60,6 +66,22 @@ public:
void deduplicateAndQualifyColumnNames(const NameSet & left_table_columns, const String & right_table_prefix); void deduplicateAndQualifyColumnNames(const NameSet & left_table_columns, const String & right_table_prefix);
size_t rightKeyInclusion(const String & name) const; size_t rightKeyInclusion(const String & name) const;
void appendRequiredColumns(const Block & sample, NameSet & required_columns) const;
void addJoinedColumn(const NameAndTypePair & joined_column);
void addJoinedColumnsAndCorrectNullability(Block & sample_block) const;
ASTPtr leftKeysList() const;
ASTPtr rightKeysList() const; /// For ON syntax only
Names requiredJoinedNames() const;
const Names & keyNamesLeft() const { return key_names_left; }
const NamesAndTypesList & columnsFromJoinedTable() const { return columns_from_joined_table; }
const NamesAndTypesList & columnsAddedByJoin() const { return columns_added_by_join; }
JoinPtr makeHashJoin(const Block & sample_block, const SizeLimits & size_limits_for_join) const;
static bool sameJoin(const AnalyzedJoin * x, const AnalyzedJoin * y);
}; };
struct ASTTableExpression; struct ASTTableExpression;

View File

@ -12,7 +12,6 @@
#include <Functions/IFunction.h> #include <Functions/IFunction.h>
#include <set> #include <set>
#include <optional> #include <optional>
#include <DataTypes/DataTypeNullable.h>
namespace ProfileEvents namespace ProfileEvents
@ -45,7 +44,8 @@ Names ExpressionAction::getNeededColumns() const
res.insert(res.end(), array_joined_columns.begin(), array_joined_columns.end()); res.insert(res.end(), array_joined_columns.begin(), array_joined_columns.end());
res.insert(res.end(), join_key_names_left.begin(), join_key_names_left.end()); if (join_params)
res.insert(res.end(), join_params->keyNamesLeft().begin(), join_params->keyNamesLeft().end());
for (const auto & column : projection) for (const auto & column : projection)
res.push_back(column.first); res.push_back(column.first);
@ -159,20 +159,12 @@ ExpressionAction ExpressionAction::arrayJoin(const NameSet & array_joined_column
return a; return a;
} }
ExpressionAction ExpressionAction::ordinaryJoin( ExpressionAction ExpressionAction::ordinaryJoin(std::shared_ptr<AnalyzedJoin> join_params, std::shared_ptr<const Join> hash_join)
const ASTTableJoin & join_params,
std::shared_ptr<const Join> join_,
const Names & join_key_names_left,
const Names & join_key_names_right,
const NamesAndTypesList & columns_added_by_join_)
{ {
ExpressionAction a; ExpressionAction a;
a.type = JOIN; a.type = JOIN;
a.join = std::move(join_); a.join_params = join_params;
a.join_kind = join_params.kind; a.join = hash_join;
a.join_key_names_left = join_key_names_left;
a.join_key_names_right = join_key_names_right;
a.columns_added_by_join = columns_added_by_join_;
return a; return a;
} }
@ -277,51 +269,7 @@ void ExpressionAction::prepare(Block & sample_block, const Settings & settings,
case JOIN: case JOIN:
{ {
bool is_null_used_as_default = settings.join_use_nulls; join_params->addJoinedColumnsAndCorrectNullability(sample_block);
bool right_or_full_join = isRightOrFull(join_kind);
bool left_or_full_join = isLeftOrFull(join_kind);
for (auto & col : sample_block)
{
/// Materialize column.
/// Column is not empty if it is constant, but after Join all constants will be materialized.
/// So, we need remove constants from header.
if (col.column)
col.column = nullptr;
bool make_nullable = is_null_used_as_default && right_or_full_join;
if (make_nullable && col.type->canBeInsideNullable())
col.type = makeNullable(col.type);
}
for (const auto & col : columns_added_by_join)
{
auto res_type = col.type;
bool make_nullable = is_null_used_as_default && left_or_full_join;
if (!make_nullable)
{
/// Keys from right table are usually not stored in Join, but copied from the left one.
/// So, if left key is nullable, let's make right key nullable too.
/// Note: for some join types it's not needed and, probably, may be removed.
/// Note: changing this code, take into account the implementation in Join.cpp.
auto it = std::find(join_key_names_right.begin(), join_key_names_right.end(), col.name);
if (it != join_key_names_right.end())
{
auto pos = it - join_key_names_right.begin();
const auto & left_key_name = join_key_names_left[pos];
make_nullable = sample_block.getByName(left_key_name).type->isNullable();
}
}
if (make_nullable && res_type->canBeInsideNullable())
res_type = makeNullable(res_type);
sample_block.insert(ColumnWithTypeAndName(nullptr, res_type, col.name));
}
break; break;
} }
@ -527,7 +475,7 @@ void ExpressionAction::execute(Block & block, bool dry_run) const
case JOIN: case JOIN:
{ {
join->joinBlock(block, join_key_names_left, columns_added_by_join); join->joinBlock(block, *join_params);
break; break;
} }
@ -645,9 +593,10 @@ std::string ExpressionAction::toString() const
case JOIN: case JOIN:
ss << "JOIN "; ss << "JOIN ";
for (NamesAndTypesList::const_iterator it = columns_added_by_join.begin(); it != columns_added_by_join.end(); ++it) for (NamesAndTypesList::const_iterator it = join_params->columnsAddedByJoin().begin();
it != join_params->columnsAddedByJoin().end(); ++it)
{ {
if (it != columns_added_by_join.begin()) if (it != join_params->columnsAddedByJoin().begin())
ss << ", "; ss << ", ";
ss << it->name; ss << it->name;
} }
@ -1220,7 +1169,7 @@ BlockInputStreamPtr ExpressionActions::createStreamWithNonJoinedDataIfFullOrRigh
for (const auto & action : actions) for (const auto & action : actions)
if (action.join && isRightOrFull(action.join->getKind())) if (action.join && isRightOrFull(action.join->getKind()))
return action.join->createStreamWithNonJoinedRows( return action.join->createStreamWithNonJoinedRows(
source_header, action.join_key_names_left, action.columns_added_by_join, max_block_size); source_header, *action.join_params, max_block_size);
return {}; return {};
} }
@ -1267,7 +1216,7 @@ UInt128 ExpressionAction::ActionHash::operator()(const ExpressionAction & action
hash.update(col); hash.update(col);
break; break;
case JOIN: case JOIN:
for (const auto & col : action.columns_added_by_join) for (const auto & col : action.join_params->columnsAddedByJoin())
hash.update(col.name); hash.update(col.name);
break; break;
case PROJECT: case PROJECT:
@ -1326,9 +1275,7 @@ bool ExpressionAction::operator==(const ExpressionAction & other) const
&& array_joined_columns == other.array_joined_columns && array_joined_columns == other.array_joined_columns
&& array_join_is_left == other.array_join_is_left && array_join_is_left == other.array_join_is_left
&& join == other.join && join == other.join
&& join_key_names_left == other.join_key_names_left && AnalyzedJoin::sameJoin(join_params.get(), other.join_params.get())
&& join_key_names_right == other.join_key_names_right
&& columns_added_by_join == other.columns_added_by_join
&& projection == other.projection && projection == other.projection
&& is_function_compiled == other.is_function_compiled; && is_function_compiled == other.is_function_compiled;
} }

View File

@ -6,6 +6,7 @@
#include <Core/Settings.h> #include <Core/Settings.h>
#include <DataStreams/IBlockStream_fwd.h> #include <DataStreams/IBlockStream_fwd.h>
#include <Interpreters/Context.h> #include <Interpreters/Context.h>
#include <Interpreters/AnalyzedJoin.h>
#include <Common/SipHash.h> #include <Common/SipHash.h>
#include "config_core.h" #include "config_core.h"
#include <unordered_map> #include <unordered_map>
@ -104,11 +105,8 @@ public:
bool unaligned_array_join = false; bool unaligned_array_join = false;
/// For JOIN /// For JOIN
std::shared_ptr<AnalyzedJoin> join_params = nullptr;
std::shared_ptr<const Join> join; std::shared_ptr<const Join> join;
ASTTableJoin::Kind join_kind;
Names join_key_names_left;
Names join_key_names_right;
NamesAndTypesList columns_added_by_join;
/// For PROJECT. /// For PROJECT.
NamesWithAliases projection; NamesWithAliases projection;
@ -124,9 +122,7 @@ public:
static ExpressionAction project(const Names & projected_columns_); static ExpressionAction project(const Names & projected_columns_);
static ExpressionAction addAliases(const NamesWithAliases & aliased_columns_); static ExpressionAction addAliases(const NamesWithAliases & aliased_columns_);
static ExpressionAction arrayJoin(const NameSet & array_joined_columns, bool array_join_is_left, const Context & context); static ExpressionAction arrayJoin(const NameSet & array_joined_columns, bool array_join_is_left, const Context & context);
static ExpressionAction ordinaryJoin(const ASTTableJoin & join_params, std::shared_ptr<const Join> join_, static ExpressionAction ordinaryJoin(std::shared_ptr<AnalyzedJoin> join_params, std::shared_ptr<const Join> hash_join);
const Names & join_key_names_left, const Names & join_key_names_right,
const NamesAndTypesList & columns_added_by_join_);
/// Which columns are necessary to perform this action. /// Which columns are necessary to perform this action.
Names getNeededColumns() const; Names getNeededColumns() const;

View File

@ -29,7 +29,6 @@
#include <Interpreters/PredicateExpressionsOptimizer.h> #include <Interpreters/PredicateExpressionsOptimizer.h>
#include <Interpreters/ExternalDictionaries.h> #include <Interpreters/ExternalDictionaries.h>
#include <Interpreters/Set.h> #include <Interpreters/Set.h>
#include <Interpreters/Join.h>
#include <AggregateFunctions/AggregateFunctionFactory.h> #include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/parseAggregateFunctionParameters.h> #include <AggregateFunctions/parseAggregateFunctionParameters.h>
@ -134,14 +133,8 @@ void ExpressionAnalyzer::analyzeAggregation()
const ASTTablesInSelectQueryElement * join = select_query->join(); const ASTTablesInSelectQueryElement * join = select_query->join();
if (join) if (join)
{ {
const auto & table_join = join->table_join->as<ASTTableJoin &>(); getRootActions(analyzedJoin().leftKeysList(), true, temp_actions);
if (table_join.using_expression_list) addJoinAction(temp_actions);
getRootActions(table_join.using_expression_list, true, temp_actions);
if (table_join.on_expression)
for (const auto & key_ast : analyzedJoin().key_asts_left)
getRootActions(key_ast, true, temp_actions);
addJoinAction(table_join, temp_actions);
} }
} }
@ -298,7 +291,8 @@ void SelectQueryExpressionAnalyzer::makeSetsForIndex(const ASTPtr & node)
{ {
NamesAndTypesList temp_columns = sourceColumns(); NamesAndTypesList temp_columns = sourceColumns();
temp_columns.insert(temp_columns.end(), array_join_columns.begin(), array_join_columns.end()); temp_columns.insert(temp_columns.end(), array_join_columns.begin(), array_join_columns.end());
temp_columns.insert(temp_columns.end(), columnsAddedByJoin().begin(), columnsAddedByJoin().end()); temp_columns.insert(temp_columns.end(),
analyzedJoin().columnsAddedByJoin().begin(), analyzedJoin().columnsAddedByJoin().end());
ExpressionActionsPtr temp_actions = std::make_shared<ExpressionActions>(temp_columns, context); ExpressionActionsPtr temp_actions = std::make_shared<ExpressionActions>(temp_columns, context);
getRootActions(left_in_operand, true, temp_actions); getRootActions(left_in_operand, true, temp_actions);
@ -412,22 +406,10 @@ bool SelectQueryExpressionAnalyzer::appendArrayJoin(ExpressionActionsChain & cha
return true; return true;
} }
static void appendRequiredColumns(
NameSet & required_columns, const Block & sample, const Names & key_names_right, const NamesAndTypesList & columns_added_by_join)
{
for (auto & column : key_names_right)
if (!sample.has(column))
required_columns.insert(column);
for (auto & column : columns_added_by_join)
if (!sample.has(column.name))
required_columns.insert(column.name);
}
/// It's possible to set nullptr as join for only_types mode /// It's possible to set nullptr as join for only_types mode
void ExpressionAnalyzer::addJoinAction(const ASTTableJoin & join_params, ExpressionActionsPtr & actions, JoinPtr join) const void ExpressionAnalyzer::addJoinAction(ExpressionActionsPtr & actions, JoinPtr join) const
{ {
actions->add(ExpressionAction::ordinaryJoin(join_params, std::move(join), analyzedJoin().key_names_left, analyzedJoin().key_names_right, columnsAddedByJoin())); actions->add(ExpressionAction::ordinaryJoin(syntax->analyzed_join, join));
} }
bool SelectQueryExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain, bool only_types) bool SelectQueryExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain, bool only_types)
@ -438,16 +420,11 @@ bool SelectQueryExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain, b
SubqueryForSet & subquery_for_set = getSubqueryForJoin(*ast_join); SubqueryForSet & subquery_for_set = getSubqueryForJoin(*ast_join);
ASTPtr left_keys_list = std::make_shared<ASTExpressionList>();
left_keys_list->children = analyzedJoin().key_asts_left;
initChain(chain, sourceColumns()); initChain(chain, sourceColumns());
ExpressionActionsChain::Step & step = chain.steps.back(); ExpressionActionsChain::Step & step = chain.steps.back();
auto & join_params = ast_join->table_join->as<ASTTableJoin &>(); getRootActions(analyzedJoin().leftKeysList(), only_types, step.actions);
addJoinAction(step.actions, subquery_for_set.join);
getRootActions(left_keys_list, only_types, step.actions);
addJoinAction(join_params, step.actions, subquery_for_set.join);
return true; return true;
} }
@ -524,11 +501,9 @@ void SelectQueryExpressionAnalyzer::makeHashJoin(const ASTTablesInSelectQueryEle
Names action_columns = joined_block_actions->getRequiredColumns(); Names action_columns = joined_block_actions->getRequiredColumns();
NameSet required_columns(action_columns.begin(), action_columns.end()); NameSet required_columns(action_columns.begin(), action_columns.end());
auto & analyzed_join = analyzedJoin(); analyzedJoin().appendRequiredColumns(joined_block_actions->getSampleBlock(), required_columns);
appendRequiredColumns(
required_columns, joined_block_actions->getSampleBlock(), analyzed_join.key_names_right, columnsAddedByJoin());
auto original_map = analyzed_join.getOriginalColumnsMap(required_columns); auto original_map = analyzedJoin().getOriginalColumnsMap(required_columns);
Names original_columns; Names original_columns;
for (auto & pr : original_map) for (auto & pr : original_map)
original_columns.push_back(pr.second); original_columns.push_back(pr.second);
@ -542,29 +517,16 @@ void SelectQueryExpressionAnalyzer::makeHashJoin(const ASTTablesInSelectQueryEle
joined_block_actions->execute(sample_block); joined_block_actions->execute(sample_block);
/// TODO You do not need to set this up when JOIN is only needed on remote servers. /// TODO You do not need to set this up when JOIN is only needed on remote servers.
auto & join_params = join_element.table_join->as<ASTTableJoin &>(); subquery_for_set.join = analyzedJoin().makeHashJoin(sample_block, settings.size_limits_for_join);
subquery_for_set.join = std::make_shared<Join>(analyzedJoin().key_names_right, settings.join_use_nulls,
settings.size_limits_for_join, join_params.kind, join_params.strictness);
subquery_for_set.join->setSampleBlock(sample_block);
subquery_for_set.joined_block_actions = joined_block_actions; subquery_for_set.joined_block_actions = joined_block_actions;
} }
ExpressionActionsPtr SelectQueryExpressionAnalyzer::createJoinedBlockActions() const ExpressionActionsPtr SelectQueryExpressionAnalyzer::createJoinedBlockActions() const
{ {
/// Create custom expression list with join keys from right table. ASTPtr expression_list = analyzedJoin().rightKeysList();
ASTPtr expression_list = std::make_shared<ASTExpressionList>(); Names required_columns = analyzedJoin().requiredJoinedNames();
ASTs & children = expression_list->children;
if (analyzedJoin().hasOn()) auto syntax_result = SyntaxAnalyzer(context).analyze(expression_list, analyzedJoin().columnsFromJoinedTable(), required_columns);
for (const auto & join_right_key : analyzedJoin().key_asts_right)
children.emplace_back(join_right_key);
NameSet required_columns_set(analyzedJoin().key_names_right.begin(), analyzedJoin().key_names_right.end());
for (const auto & joined_column : columnsAddedByJoin())
required_columns_set.insert(joined_column.name);
Names required_columns(required_columns_set.begin(), required_columns_set.end());
auto syntax_result = SyntaxAnalyzer(context).analyze(expression_list, analyzedJoin().columns_from_joined_table, required_columns);
return ExpressionAnalyzer(expression_list, syntax_result, context).getActions(true, false); return ExpressionAnalyzer(expression_list, syntax_result, context).getActions(true, false);
} }

View File

@ -121,9 +121,8 @@ protected:
SyntaxAnalyzerResultPtr syntax; SyntaxAnalyzerResultPtr syntax;
const StoragePtr & storage() const { return syntax->storage; } /// The main table in FROM clause, if exists. const StoragePtr & storage() const { return syntax->storage; } /// The main table in FROM clause, if exists.
const AnalyzedJoin & analyzedJoin() const { return syntax->analyzed_join; } const AnalyzedJoin & analyzedJoin() const { return *syntax->analyzed_join; }
const NamesAndTypesList & sourceColumns() const { return syntax->required_source_columns; } const NamesAndTypesList & sourceColumns() const { return syntax->required_source_columns; }
const NamesAndTypesList & columnsAddedByJoin() const { return syntax->columns_added_by_join; }
const std::vector<const ASTFunction *> & aggregates() const { return syntax->aggregates; } const std::vector<const ASTFunction *> & aggregates() const { return syntax->aggregates; }
/// Find global subqueries in the GLOBAL IN/JOIN sections. Fills in external_tables. /// Find global subqueries in the GLOBAL IN/JOIN sections. Fills in external_tables.
@ -131,7 +130,7 @@ protected:
void addMultipleArrayJoinAction(ExpressionActionsPtr & actions, bool is_left) const; void addMultipleArrayJoinAction(ExpressionActionsPtr & actions, bool is_left) const;
void addJoinAction(const ASTTableJoin & join_params, ExpressionActionsPtr & actions, JoinPtr join = {}) const; void addJoinAction(ExpressionActionsPtr & actions, JoinPtr join = {}) const;
void getRootActions(const ASTPtr & ast, bool no_subqueries, ExpressionActionsPtr & actions, bool only_consts = false); void getRootActions(const ASTPtr & ast, bool no_subqueries, ExpressionActionsPtr & actions, bool only_consts = false);

View File

@ -10,6 +10,7 @@
#include <DataTypes/DataTypeNullable.h> #include <DataTypes/DataTypeNullable.h>
#include <Interpreters/Join.h> #include <Interpreters/Join.h>
#include <Interpreters/AnalyzedJoin.h>
#include <Interpreters/joinDispatch.h> #include <Interpreters/joinDispatch.h>
#include <Interpreters/NullableUtils.h> #include <Interpreters/NullableUtils.h>
@ -1048,8 +1049,11 @@ void Join::joinGet(Block & block, const String & column_name) const
} }
void Join::joinBlock(Block & block, const Names & key_names_left, const NamesAndTypesList & columns_added_by_join) const void Join::joinBlock(Block & block, const AnalyzedJoin & join_params) const
{ {
const Names & key_names_left = join_params.keyNamesLeft();
const NamesAndTypesList & columns_added_by_join = join_params.columnsAddedByJoin();
std::shared_lock lock(rwlock); std::shared_lock lock(rwlock);
checkTypesOfKeys(block, key_names_left, sample_block_with_keys); checkTypesOfKeys(block, key_names_left, sample_block_with_keys);
@ -1457,10 +1461,11 @@ private:
}; };
BlockInputStreamPtr Join::createStreamWithNonJoinedRows(const Block & left_sample_block, const Names & key_names_left, BlockInputStreamPtr Join::createStreamWithNonJoinedRows(const Block & left_sample_block, const AnalyzedJoin & join_params,
const NamesAndTypesList & columns_added_by_join, UInt64 max_block_size) const UInt64 max_block_size) const
{ {
return std::make_shared<NonJoinedBlockInputStream>(*this, left_sample_block, key_names_left, columns_added_by_join, max_block_size); return std::make_shared<NonJoinedBlockInputStream>(*this, left_sample_block,
join_params.keyNamesLeft(), join_params.columnsAddedByJoin(), max_block_size);
} }

View File

@ -26,6 +26,8 @@
namespace DB namespace DB
{ {
struct AnalyzedJoin;
namespace JoinStuff namespace JoinStuff
{ {
@ -141,7 +143,7 @@ public:
/** Join data from the map (that was previously built by calls to insertFromBlock) to the block with data from "left" table. /** Join data from the map (that was previously built by calls to insertFromBlock) to the block with data from "left" table.
* Could be called from different threads in parallel. * Could be called from different threads in parallel.
*/ */
void joinBlock(Block & block, const Names & key_names_left, const NamesAndTypesList & columns_added_by_join) const; void joinBlock(Block & block, const AnalyzedJoin & join_params) const;
/// Infer the return type for joinGet function /// Infer the return type for joinGet function
DataTypePtr joinGetReturnType(const String & column_name) const; DataTypePtr joinGetReturnType(const String & column_name) const;
@ -161,8 +163,8 @@ public:
* Use only after all calls to joinBlock were done. * Use only after all calls to joinBlock were done.
* left_sample_block is passed without account of 'use_nulls' setting (columns will be converted to Nullable inside). * left_sample_block is passed without account of 'use_nulls' setting (columns will be converted to Nullable inside).
*/ */
BlockInputStreamPtr createStreamWithNonJoinedRows(const Block & left_sample_block, const Names & key_names_left, BlockInputStreamPtr createStreamWithNonJoinedRows(const Block & left_sample_block, const AnalyzedJoin & join_params,
const NamesAndTypesList & columns_added_by_join, UInt64 max_block_size) const; UInt64 max_block_size) const;
/// Number of keys in all built JOIN maps. /// Number of keys in all built JOIN maps.
size_t getTotalRowCount() const; size_t getTotalRowCount() const;

View File

@ -489,14 +489,13 @@ void getArrayJoinedColumns(ASTPtr & query, SyntaxAnalyzerResult & result, const
} }
} }
void setJoinStrictness(ASTSelectQuery & select_query, JoinStrictness join_default_strictness, ASTTableJoin::Kind & join_kind) void setJoinStrictness(ASTSelectQuery & select_query, JoinStrictness join_default_strictness, ASTTableJoin & out_table_join)
{ {
const ASTTablesInSelectQueryElement * node = select_query.join(); const ASTTablesInSelectQueryElement * node = select_query.join();
if (!node) if (!node)
return; return;
auto & table_join = const_cast<ASTTablesInSelectQueryElement *>(node)->table_join->as<ASTTableJoin &>(); auto & table_join = const_cast<ASTTablesInSelectQueryElement *>(node)->table_join->as<ASTTableJoin &>();
join_kind = table_join.kind;
if (table_join.strictness == ASTTableJoin::Strictness::Unspecified && if (table_join.strictness == ASTTableJoin::Strictness::Unspecified &&
table_join.kind != ASTTableJoin::Kind::Cross) table_join.kind != ASTTableJoin::Kind::Cross)
@ -509,6 +508,8 @@ void setJoinStrictness(ASTSelectQuery & select_query, JoinStrictness join_defaul
throw Exception("Expected ANY or ALL in JOIN section, because setting (join_default_strictness) is empty", throw Exception("Expected ANY or ALL in JOIN section, because setting (join_default_strictness) is empty",
DB::ErrorCodes::EXPECTED_ALL_OR_ANY); DB::ErrorCodes::EXPECTED_ALL_OR_ANY);
} }
out_table_join = table_join;
} }
/// Find the columns that are obtained by JOIN. /// Find the columns that are obtained by JOIN.
@ -609,8 +610,7 @@ std::vector<const ASTFunction *> getAggregates(const ASTPtr & query)
/// Calculate which columns are required to execute the expression. /// Calculate which columns are required to execute the expression.
/// Then, delete all other columns from the list of available columns. /// Then, delete all other columns from the list of available columns.
/// After execution, columns will only contain the list of columns needed to read from the table. /// After execution, columns will only contain the list of columns needed to read from the table.
void SyntaxAnalyzerResult::collectUsedColumns(const ASTPtr & query, const NamesAndTypesList & additional_source_columns, void SyntaxAnalyzerResult::collectUsedColumns(const ASTPtr & query, const NamesAndTypesList & additional_source_columns)
bool make_joined_columns_nullable)
{ {
/// We calculate required_source_columns with source_columns modifications and swap them on exit /// We calculate required_source_columns with source_columns modifications and swap them on exit
required_source_columns = source_columns; required_source_columns = source_columns;
@ -637,8 +637,7 @@ void SyntaxAnalyzerResult::collectUsedColumns(const ASTPtr & query, const NamesA
avaliable_columns.insert(name.name); avaliable_columns.insert(name.name);
/// Add columns obtained by JOIN (if needed). /// Add columns obtained by JOIN (if needed).
columns_added_by_join.clear(); for (const auto & joined_column : analyzed_join->columnsFromJoinedTable())
for (const auto & joined_column : analyzed_join.columns_from_joined_table)
{ {
auto & name = joined_column.name; auto & name = joined_column.name;
if (avaliable_columns.count(name)) if (avaliable_columns.count(name))
@ -647,16 +646,9 @@ void SyntaxAnalyzerResult::collectUsedColumns(const ASTPtr & query, const NamesA
if (required.count(name)) if (required.count(name))
{ {
/// Optimisation: do not add columns needed only in JOIN ON section. /// Optimisation: do not add columns needed only in JOIN ON section.
if (columns_context.nameInclusion(name) > analyzed_join.rightKeyInclusion(name)) if (columns_context.nameInclusion(name) > analyzed_join->rightKeyInclusion(name))
{ analyzed_join->addJoinedColumn(joined_column);
if (make_joined_columns_nullable)
{
auto type = joined_column.type->canBeInsideNullable() ? makeNullable(joined_column.type) : joined_column.type;
columns_added_by_join.emplace_back(NameAndTypePair(joined_column.name, std::move(type)));
}
else
columns_added_by_join.push_back(joined_column);
}
required.erase(name); required.erase(name);
} }
} }
@ -766,7 +758,7 @@ void SyntaxAnalyzerResult::collectUsedColumns(const ASTPtr & query, const NamesA
if (columns_context.has_table_join) if (columns_context.has_table_join)
{ {
ss << ", joined columns:"; ss << ", joined columns:";
for (const auto & column : analyzed_join.columns_from_joined_table) for (const auto & column : analyzed_join->columnsFromJoinedTable())
ss << " '" << column.name << "'"; ss << " '" << column.name << "'";
} }
@ -798,15 +790,17 @@ SyntaxAnalyzerResultPtr SyntaxAnalyzer::analyze(
storage = context.tryGetTable(db_and_table->database, db_and_table->table); storage = context.tryGetTable(db_and_table->database, db_and_table->table);
} }
const auto & settings = context.getSettingsRef();
SyntaxAnalyzerResult result; SyntaxAnalyzerResult result;
result.storage = storage; result.storage = storage;
result.source_columns = source_columns_; result.source_columns = source_columns_;
result.analyzed_join = std::make_shared<AnalyzedJoin>(); /// TODO: move to select_query logic
result.analyzed_join->join_use_nulls = settings.join_use_nulls;
collectSourceColumns(select_query, result.storage, result.source_columns); collectSourceColumns(select_query, result.storage, result.source_columns);
NameSet source_columns_set = removeDuplicateColumns(result.source_columns); NameSet source_columns_set = removeDuplicateColumns(result.source_columns);
const auto & settings = context.getSettingsRef();
Names source_columns_list; Names source_columns_list;
source_columns_list.reserve(result.source_columns.size()); source_columns_list.reserve(result.source_columns.size());
for (const auto & type_name : result.source_columns) for (const auto & type_name : result.source_columns)
@ -831,13 +825,13 @@ SyntaxAnalyzerResultPtr SyntaxAnalyzer::analyze(
const auto & joined_expression = node->table_expression->as<ASTTableExpression &>(); const auto & joined_expression = node->table_expression->as<ASTTableExpression &>();
DatabaseAndTableWithAlias table(joined_expression, context.getCurrentDatabase()); DatabaseAndTableWithAlias table(joined_expression, context.getCurrentDatabase());
result.analyzed_join.columns_from_joined_table = getNamesAndTypeListFromTableExpression(joined_expression, context); result.analyzed_join->columns_from_joined_table = getNamesAndTypeListFromTableExpression(joined_expression, context);
result.analyzed_join.deduplicateAndQualifyColumnNames(source_columns_set, table.getQualifiedNamePrefix()); result.analyzed_join->deduplicateAndQualifyColumnNames(source_columns_set, table.getQualifiedNamePrefix());
} }
translateQualifiedNames(query, *select_query, context, translateQualifiedNames(query, *select_query, context,
(storage ? storage->getColumns().getOrdinary().getNames() : source_columns_list), source_columns_set, (storage ? storage->getColumns().getOrdinary().getNames() : source_columns_list), source_columns_set,
result.analyzed_join.getQualifiedColumnsSet()); result.analyzed_join->getQualifiedColumnsSet());
/// Rewrite IN and/or JOIN for distributed tables according to distributed_product_mode setting. /// Rewrite IN and/or JOIN for distributed tables according to distributed_product_mode setting.
InJoinSubqueriesPreprocessor(context).visit(query); InJoinSubqueriesPreprocessor(context).visit(query);
@ -872,7 +866,6 @@ SyntaxAnalyzerResultPtr SyntaxAnalyzer::analyze(
/// Optimize if with constant condition after constants were substituted instead of scalar subqueries. /// Optimize if with constant condition after constants were substituted instead of scalar subqueries.
OptimizeIfWithConstantConditionVisitor(result.aliases).visit(query); OptimizeIfWithConstantConditionVisitor(result.aliases).visit(query);
bool make_joined_columns_nullable = false;
if (select_query) if (select_query)
{ {
/// GROUP BY injective function elimination. /// GROUP BY injective function elimination.
@ -893,15 +886,12 @@ SyntaxAnalyzerResultPtr SyntaxAnalyzer::analyze(
/// Push the predicate expression down to the subqueries. /// Push the predicate expression down to the subqueries.
result.rewrite_subqueries = PredicateExpressionsOptimizer(select_query, settings, context).optimize(); result.rewrite_subqueries = PredicateExpressionsOptimizer(select_query, settings, context).optimize();
ASTTableJoin::Kind join_kind = ASTTableJoin::Kind::Comma; setJoinStrictness(*select_query, settings.join_default_strictness, result.analyzed_join->table_join);
setJoinStrictness(*select_query, settings.join_default_strictness, join_kind); collectJoinedColumns(*result.analyzed_join, *select_query, source_columns_set, result.aliases);
make_joined_columns_nullable = settings.join_use_nulls && isLeftOrFull(join_kind);
collectJoinedColumns(result.analyzed_join, *select_query, source_columns_set, result.aliases);
} }
result.aggregates = getAggregates(query); result.aggregates = getAggregates(query);
result.collectUsedColumns(query, additional_source_columns, make_joined_columns_nullable); result.collectUsedColumns(query, additional_source_columns);
return std::make_shared<const SyntaxAnalyzerResult>(result); return std::make_shared<const SyntaxAnalyzerResult>(result);
} }

View File

@ -15,13 +15,11 @@ class ASTFunction;
struct SyntaxAnalyzerResult struct SyntaxAnalyzerResult
{ {
StoragePtr storage; StoragePtr storage;
AnalyzedJoin analyzed_join; std::shared_ptr<AnalyzedJoin> analyzed_join;
NamesAndTypesList source_columns; NamesAndTypesList source_columns;
/// Set of columns that are enough to read from the table to evaluate the expression. It does not include joined columns. /// Set of columns that are enough to read from the table to evaluate the expression. It does not include joined columns.
NamesAndTypesList required_source_columns; NamesAndTypesList required_source_columns;
/// Columns will be added to block by JOIN. It's a subset of analyzed_join.columns_from_joined_table with corrected Nullability
NamesAndTypesList columns_added_by_join;
Aliases aliases; Aliases aliases;
std::vector<const ASTFunction *> aggregates; std::vector<const ASTFunction *> aggregates;
@ -42,7 +40,7 @@ struct SyntaxAnalyzerResult
/// Predicate optimizer overrides the sub queries /// Predicate optimizer overrides the sub queries
bool rewrite_subqueries = false; bool rewrite_subqueries = false;
void collectUsedColumns(const ASTPtr & query, const NamesAndTypesList & additional_source_columns, bool make_joined_columns_nullable); void collectUsedColumns(const ASTPtr & query, const NamesAndTypesList & additional_source_columns);
Names requiredSourceColumns() const { return required_source_columns.getNames(); } Names requiredSourceColumns() const { return required_source_columns.getNames(); }
}; };

View File

@ -287,12 +287,6 @@ bool PipelineExecutor::prepareProcessor(UInt64 pid, Stack & children, Stack & pa
switch (node.last_processor_status) switch (node.last_processor_status)
{ {
case IProcessor::Status::NeedData: case IProcessor::Status::NeedData:
{
add_neighbours_to_prepare_queue();
try_release_ownership();
break;
}
case IProcessor::Status::PortFull: case IProcessor::Status::PortFull:
{ {
add_neighbours_to_prepare_queue(); add_neighbours_to_prepare_queue();

View File

@ -174,7 +174,7 @@ Chunk IRowInputFormat::generate()
{ {
if (params.allow_errors_num > 0 || params.allow_errors_ratio > 0) if (params.allow_errors_num > 0 || params.allow_errors_ratio > 0)
{ {
Logger * log = &Logger::get("BlockInputStreamFromRowInputStream"); Logger * log = &Logger::get("IRowInputFormat");
LOG_TRACE(log, "Skipped " << num_errors << " rows with errors while reading the input stream"); LOG_TRACE(log, "Skipped " << num_errors << " rows with errors while reading the input stream");
} }

View File

@ -13,6 +13,8 @@
#include <Columns/ColumnString.h> #include <Columns/ColumnString.h>
#include <Columns/ColumnNullable.h> #include <Columns/ColumnNullable.h>
#include <Interpreters/castColumn.h> #include <Interpreters/castColumn.h>
#include <algorithm>
namespace DB namespace DB
{ {
@ -27,34 +29,28 @@ namespace DB
extern const int CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN; extern const int CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN;
extern const int THERE_IS_NO_COLUMN; extern const int THERE_IS_NO_COLUMN;
} }
const std::unordered_map<arrow::Type::type, std::shared_ptr<IDataType>> arrow_type_to_internal_type = {
//{arrow::Type::DECIMAL, std::make_shared<DataTypeDecimal>()},
{arrow::Type::UINT8, std::make_shared<DataTypeUInt8>()},
{arrow::Type::INT8, std::make_shared<DataTypeInt8>()},
{arrow::Type::UINT16, std::make_shared<DataTypeUInt16>()},
{arrow::Type::INT16, std::make_shared<DataTypeInt16>()},
{arrow::Type::UINT32, std::make_shared<DataTypeUInt32>()},
{arrow::Type::INT32, std::make_shared<DataTypeInt32>()},
{arrow::Type::UINT64, std::make_shared<DataTypeUInt64>()},
{arrow::Type::INT64, std::make_shared<DataTypeInt64>()},
{arrow::Type::HALF_FLOAT, std::make_shared<DataTypeFloat32>()},
{arrow::Type::FLOAT, std::make_shared<DataTypeFloat32>()},
{arrow::Type::DOUBLE, std::make_shared<DataTypeFloat64>()},
{arrow::Type::BOOL, std::make_shared<DataTypeUInt8>()}, static const std::initializer_list<std::pair<arrow::Type::type, const char *>> arrow_type_to_internal_type =
//{arrow::Type::DATE32, std::make_shared<DataTypeDate>()}, {
{arrow::Type::DATE32, std::make_shared<DataTypeDate>()}, {arrow::Type::UINT8, "UInt8"},
//{arrow::Type::DATE32, std::make_shared<DataTypeDateTime>()}, {arrow::Type::INT8, "Int8"},
{arrow::Type::DATE64, std::make_shared<DataTypeDateTime>()}, {arrow::Type::UINT16, "UInt16"},
{arrow::Type::TIMESTAMP, std::make_shared<DataTypeDateTime>()}, {arrow::Type::INT16, "Int16"},
//{arrow::Type::TIME32, std::make_shared<DataTypeDateTime>()}, {arrow::Type::UINT32, "UInt32"},
{arrow::Type::INT32, "Int32"},
{arrow::Type::UINT64, "UInt64"},
{arrow::Type::INT64, "Int64"},
{arrow::Type::HALF_FLOAT, "Float32"},
{arrow::Type::FLOAT, "Float32"},
{arrow::Type::DOUBLE, "Float64"},
{arrow::Type::BOOL, "UInt8"},
{arrow::Type::DATE32, "Date"},
{arrow::Type::DATE64, "DateTime"},
{arrow::Type::TIMESTAMP, "DateTime"},
{arrow::Type::STRING, std::make_shared<DataTypeString>()}, {arrow::Type::STRING, "String"},
{arrow::Type::BINARY, std::make_shared<DataTypeString>()}, {arrow::Type::BINARY, "String"},
//{arrow::Type::FIXED_SIZE_BINARY, std::make_shared<DataTypeString>()},
//{arrow::Type::UUID, std::make_shared<DataTypeString>()},
// TODO: add other types that are convertable to internal ones: // TODO: add other types that are convertable to internal ones:
// 0. ENUM? // 0. ENUM?
@ -253,7 +249,7 @@ namespace DB
void ArrowColumnToCHColumn::arrowTableToCHChunk(Chunk &res, std::shared_ptr<arrow::Table> &table, void ArrowColumnToCHColumn::arrowTableToCHChunk(Chunk &res, std::shared_ptr<arrow::Table> &table,
arrow::Status &read_status, const Block &header, arrow::Status &read_status, const Block &header,
int &row_group_current, const Context &context, std::string format_name) int &row_group_current, const Context &context, std::string format_name)
{ {
Columns columns_list; Columns columns_list;
UInt64 num_rows = 0; UInt64 num_rows = 0;
@ -308,15 +304,16 @@ namespace DB
const auto decimal_type = static_cast<arrow::DecimalType *>(arrow_column->type().get()); const auto decimal_type = static_cast<arrow::DecimalType *>(arrow_column->type().get());
internal_nested_type = std::make_shared<DataTypeDecimal<Decimal128>>(decimal_type->precision(), internal_nested_type = std::make_shared<DataTypeDecimal<Decimal128>>(decimal_type->precision(),
decimal_type->scale()); decimal_type->scale());
} else if (arrow_type_to_internal_type.find(arrow_type) != arrow_type_to_internal_type.end()) }
else if (auto internal_type_it = std::find_if(arrow_type_to_internal_type.begin(), arrow_type_to_internal_type.end(),
[=](auto && elem) { return elem.first == arrow_type; });
internal_type_it != arrow_type_to_internal_type.end())
{ {
internal_nested_type = arrow_type_to_internal_type.at(arrow_type); internal_nested_type = DataTypeFactory::instance().get(internal_type_it->second);
} }
else else
{ {
throw Exception throw Exception{"The type \"" + arrow_column->type()->name() + "\" of an input column \"" + arrow_column->name()
{
"The type \"" + arrow_column->type()->name() + "\" of an input column \"" + arrow_column->name()
+ "\" is not supported for conversion from a " + format_name + " data format", + "\" is not supported for conversion from a " + format_name + " data format",
ErrorCodes::CANNOT_CONVERT_TYPE}; ErrorCodes::CANNOT_CONVERT_TYPE};
} }

View File

@ -5,6 +5,7 @@
#include <Processors/Formats/Impl/CSVRowInputFormat.h> #include <Processors/Formats/Impl/CSVRowInputFormat.h>
#include <Formats/FormatFactory.h> #include <Formats/FormatFactory.h>
#include <DataTypes/DataTypeNullable.h> #include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeNothing.h>
namespace DB namespace DB
@ -17,9 +18,9 @@ namespace ErrorCodes
} }
CSVRowInputFormat::CSVRowInputFormat( CSVRowInputFormat::CSVRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_,
ReadBuffer & in_, Block header_, Params params_, bool with_names_, const FormatSettings & format_settings_) bool with_names_, const FormatSettings & format_settings_)
: IRowInputFormat(std::move(header_), in_, std::move(params_)) : RowInputFormatWithDiagnosticInfo(header_, in_, params_)
, with_names(with_names_) , with_names(with_names_)
, format_settings(format_settings_) , format_settings(format_settings_)
{ {
@ -79,72 +80,72 @@ void CSVRowInputFormat::addInputColumn(const String & column_name)
column_indexes_for_input_fields.emplace_back(column_index); column_indexes_for_input_fields.emplace_back(column_index);
} }
static void skipEndOfLine(ReadBuffer & istr) static void skipEndOfLine(ReadBuffer & in)
{ {
/// \n (Unix) or \r\n (DOS/Windows) or \n\r (Mac OS Classic) /// \n (Unix) or \r\n (DOS/Windows) or \n\r (Mac OS Classic)
if (*istr.position() == '\n') if (*in.position() == '\n')
{ {
++istr.position(); ++in.position();
if (!istr.eof() && *istr.position() == '\r') if (!in.eof() && *in.position() == '\r')
++istr.position(); ++in.position();
} }
else if (*istr.position() == '\r') else if (*in.position() == '\r')
{ {
++istr.position(); ++in.position();
if (!istr.eof() && *istr.position() == '\n') if (!in.eof() && *in.position() == '\n')
++istr.position(); ++in.position();
else else
throw Exception("Cannot parse CSV format: found \\r (CR) not followed by \\n (LF)." throw Exception("Cannot parse CSV format: found \\r (CR) not followed by \\n (LF)."
" Line must end by \\n (LF) or \\r\\n (CR LF) or \\n\\r.", ErrorCodes::INCORRECT_DATA); " Line must end by \\n (LF) or \\r\\n (CR LF) or \\n\\r.", ErrorCodes::INCORRECT_DATA);
} }
else if (!istr.eof()) else if (!in.eof())
throw Exception("Expected end of line", ErrorCodes::INCORRECT_DATA); throw Exception("Expected end of line", ErrorCodes::INCORRECT_DATA);
} }
static void skipDelimiter(ReadBuffer & istr, const char delimiter, bool is_last_column) static void skipDelimiter(ReadBuffer & in, const char delimiter, bool is_last_column)
{ {
if (is_last_column) if (is_last_column)
{ {
if (istr.eof()) if (in.eof())
return; return;
/// we support the extra delimiter at the end of the line /// we support the extra delimiter at the end of the line
if (*istr.position() == delimiter) if (*in.position() == delimiter)
{ {
++istr.position(); ++in.position();
if (istr.eof()) if (in.eof())
return; return;
} }
skipEndOfLine(istr); skipEndOfLine(in);
} }
else else
assertChar(delimiter, istr); assertChar(delimiter, in);
} }
/// Skip `whitespace` symbols allowed in CSV. /// Skip `whitespace` symbols allowed in CSV.
static inline void skipWhitespacesAndTabs(ReadBuffer & buf) static inline void skipWhitespacesAndTabs(ReadBuffer & in)
{ {
while (!buf.eof() while (!in.eof()
&& (*buf.position() == ' ' && (*in.position() == ' '
|| *buf.position() == '\t')) || *in.position() == '\t'))
++buf.position(); ++in.position();
} }
static void skipRow(ReadBuffer & istr, const FormatSettings::CSV & settings, size_t num_columns) static void skipRow(ReadBuffer & in, const FormatSettings::CSV & settings, size_t num_columns)
{ {
String tmp; String tmp;
for (size_t i = 0; i < num_columns; ++i) for (size_t i = 0; i < num_columns; ++i)
{ {
skipWhitespacesAndTabs(istr); skipWhitespacesAndTabs(in);
readCSVString(tmp, istr, settings); readCSVString(tmp, in, settings);
skipWhitespacesAndTabs(istr); skipWhitespacesAndTabs(in);
skipDelimiter(istr, settings.delimiter, i + 1 == num_columns); skipDelimiter(in, settings.delimiter, i + 1 == num_columns);
} }
} }
@ -156,7 +157,6 @@ void CSVRowInputFormat::readPrefix()
skipBOMIfExists(in); skipBOMIfExists(in);
size_t num_columns = data_types.size(); size_t num_columns = data_types.size();
String tmp;
auto & header = getPort().getHeader(); auto & header = getPort().getHeader();
if (with_names) if (with_names)
@ -224,8 +224,7 @@ bool CSVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext
for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column) for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column)
{ {
const auto & table_column = column_indexes_for_input_fields[file_column]; const auto & table_column = column_indexes_for_input_fields[file_column];
const bool is_last_file_column = const bool is_last_file_column = file_column + 1 == column_indexes_for_input_fields.size();
file_column + 1 == column_indexes_for_input_fields.size();
if (table_column) if (table_column)
{ {
@ -267,71 +266,7 @@ bool CSVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext
return true; return true;
} }
bool CSVRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out)
String CSVRowInputFormat::getDiagnosticInfo()
{
if (in.eof()) /// Buffer has gone, cannot extract information about what has been parsed.
return {};
WriteBufferFromOwnString out;
auto & header = getPort().getHeader();
MutableColumns columns = header.cloneEmptyColumns();
/// It is possible to display detailed diagnostics only if the last and next to last rows are still in the read buffer.
size_t bytes_read_at_start_of_buffer = in.count() - in.offset();
if (bytes_read_at_start_of_buffer != bytes_read_at_start_of_buffer_on_prev_row)
{
out << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n";
return out.str();
}
size_t max_length_of_column_name = 0;
for (size_t i = 0; i < header.columns(); ++i)
if (header.safeGetByPosition(i).name.size() > max_length_of_column_name)
max_length_of_column_name = header.safeGetByPosition(i).name.size();
size_t max_length_of_data_type_name = 0;
for (size_t i = 0; i < header.columns(); ++i)
if (header.safeGetByPosition(i).type->getName().size() > max_length_of_data_type_name)
max_length_of_data_type_name = header.safeGetByPosition(i).type->getName().size();
/// Roll back the cursor to the beginning of the previous or current row and parse all over again. But now we derive detailed information.
if (pos_of_prev_row)
{
in.position() = pos_of_prev_row;
out << "\nRow " << (row_num - 1) << ":\n";
if (!parseRowAndPrintDiagnosticInfo(columns, out, max_length_of_column_name, max_length_of_data_type_name))
return out.str();
}
else
{
if (!pos_of_current_row)
{
out << "Could not print diagnostic info because parsing of data hasn't started.\n";
return out.str();
}
in.position() = pos_of_current_row;
}
out << "\nRow " << row_num << ":\n";
parseRowAndPrintDiagnosticInfo(columns, out, max_length_of_column_name, max_length_of_data_type_name);
out << "\n";
return out.str();
}
/** gcc-7 generates wrong code with optimization level greater than 1.
* See tests: dbms/src/IO/tests/write_int.cpp
* and dbms/tests/queries/0_stateless/00898_parsing_bad_diagnostic_message.sh
* This is compiler bug. The bug does not present in gcc-8 and clang-8.
* Nevertheless, we don't need high optimization of this function.
*/
bool OPTIMIZE(1) CSVRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
{ {
const char delimiter = format_settings.csv.delimiter; const char delimiter = format_settings.csv.delimiter;
@ -345,100 +280,19 @@ bool OPTIMIZE(1) CSVRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumn
if (column_indexes_for_input_fields[file_column].has_value()) if (column_indexes_for_input_fields[file_column].has_value())
{ {
const auto & table_column = *column_indexes_for_input_fields[file_column];
const auto & current_column_type = data_types[table_column];
const bool is_last_file_column =
file_column + 1 == column_indexes_for_input_fields.size();
const bool at_delimiter = !in.eof() && *in.position() == delimiter;
const bool at_last_column_line_end = is_last_file_column
&& (in.eof() || *in.position() == '\n' || *in.position() == '\r');
auto & header = getPort().getHeader(); auto & header = getPort().getHeader();
out << "Column " << file_column << ", " << std::string((file_column < 10 ? 2 : file_column < 100 ? 1 : 0), ' ') size_t col_idx = column_indexes_for_input_fields[file_column].value();
<< "name: " << header.safeGetByPosition(table_column).name << ", " << std::string(max_length_of_column_name - header.safeGetByPosition(table_column).name.size(), ' ') if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx], *columns[col_idx],
<< "type: " << current_column_type->getName() << ", " << std::string(max_length_of_data_type_name - current_column_type->getName().size(), ' '); out, file_column))
return false;
if (format_settings.csv.empty_as_default
&& (at_delimiter || at_last_column_line_end))
{
columns[table_column]->insertDefault();
}
else
{
BufferBase::Position prev_position = in.position();
BufferBase::Position curr_position = in.position();
std::exception_ptr exception;
try
{
skipWhitespacesAndTabs(in);
prev_position = in.position();
readField(*columns[table_column], current_column_type, is_last_file_column, table_column);
curr_position = in.position();
skipWhitespacesAndTabs(in);
}
catch (...)
{
exception = std::current_exception();
}
if (curr_position < prev_position)
throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);
if (isNativeNumber(current_column_type) || isDateOrDateTime(current_column_type))
{
/// An empty string instead of a value.
if (curr_position == prev_position)
{
out << "ERROR: text ";
verbosePrintString(prev_position, std::min(prev_position + 10, in.buffer().end()), out);
out << " is not like " << current_column_type->getName() << "\n";
return false;
}
}
out << "parsed text: ";
verbosePrintString(prev_position, curr_position, out);
if (exception)
{
if (current_column_type->getName() == "DateTime")
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
else if (current_column_type->getName() == "Date")
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
else
out << "ERROR\n";
return false;
}
out << "\n";
if (current_column_type->haveMaximumSizeOfValue()
&& *curr_position != '\n' && *curr_position != '\r'
&& *curr_position != delimiter)
{
out << "ERROR: garbage after " << current_column_type->getName() << ": ";
verbosePrintString(curr_position, std::min(curr_position + 10, in.buffer().end()), out);
out << "\n";
if (current_column_type->getName() == "DateTime")
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
else if (current_column_type->getName() == "Date")
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
return false;
}
}
} }
else else
{ {
static const String skipped_column_str = "<SKIPPED COLUMN>"; static const String skipped_column_str = "<SKIPPED COLUMN>";
out << "Column " << file_column << ", " << std::string((file_column < 10 ? 2 : file_column < 100 ? 1 : 0), ' ') static const DataTypePtr skipped_column_type = std::make_shared<DataTypeNothing>();
<< "name: " << skipped_column_str << ", " << std::string(max_length_of_column_name - skipped_column_str.length(), ' ') static const MutableColumnPtr skipped_column = skipped_column_type->createColumn();
<< "type: " << skipped_column_str << ", " << std::string(max_length_of_data_type_name - skipped_column_str.length(), ' '); if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, file_column))
return false;
String tmp;
readCSVString(tmp, in, format_settings.csv);
} }
/// Delimiters /// Delimiters
@ -502,15 +356,26 @@ void CSVRowInputFormat::syncAfterError()
skipToNextLineOrEOF(in); skipToNextLineOrEOF(in);
} }
void CSVRowInputFormat::updateDiagnosticInfo() void CSVRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos)
{ {
++row_num; skipWhitespacesAndTabs(in);
prev_pos = in.position();
bytes_read_at_start_of_buffer_on_prev_row = bytes_read_at_start_of_buffer_on_current_row; if (column_indexes_for_input_fields[file_column])
bytes_read_at_start_of_buffer_on_current_row = in.count() - in.offset(); {
const bool is_last_file_column = file_column + 1 == column_indexes_for_input_fields.size();
if (!readField(column, type, is_last_file_column, *column_indexes_for_input_fields[file_column]))
column.insertDefault();
}
else
{
String tmp;
readCSVString(tmp, in, format_settings.csv);
}
pos_of_prev_row = pos_of_current_row; curr_pos = in.position();
pos_of_current_row = in.position(); skipWhitespacesAndTabs(in);
} }
bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column, size_t column_idx) bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column, size_t column_idx)
@ -563,7 +428,7 @@ void registerInputFormatProcessorCSV(FormatFactory & factory)
IRowInputFormat::Params params, IRowInputFormat::Params params,
const FormatSettings & settings) const FormatSettings & settings)
{ {
return std::make_shared<CSVRowInputFormat>(buf, sample, std::move(params), with_names, settings); return std::make_shared<CSVRowInputFormat>(sample, buf, params, with_names, settings);
}); });
} }
} }

View File

@ -1,40 +1,38 @@
#pragma once #pragma once
#include <optional>
#include <unordered_map>
#include <Core/Block.h> #include <Core/Block.h>
#include <Processors/Formats/IRowInputFormat.h> #include <Processors/Formats/RowInputFormatWithDiagnosticInfo.h>
#include <Formats/FormatSettings.h> #include <Formats/FormatSettings.h>
namespace DB namespace DB
{ {
class ReadBuffer;
/** A stream for inputting data in csv format. /** A stream for inputting data in csv format.
* Does not conform with https://tools.ietf.org/html/rfc4180 because it skips spaces and tabs between values. * Does not conform with https://tools.ietf.org/html/rfc4180 because it skips spaces and tabs between values.
*/ */
class CSVRowInputFormat : public IRowInputFormat class CSVRowInputFormat : public RowInputFormatWithDiagnosticInfo
{ {
public: public:
/** with_names - in the first line the header with column names /** with_names - in the first line the header with column names
* with_types - on the next line header with type names
*/ */
CSVRowInputFormat(ReadBuffer & in_, Block header_, Params params_, bool with_names_, const FormatSettings & format_settings_); CSVRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_,
bool with_names_, const FormatSettings & format_settings_);
String getName() const override { return "CSVRowInputFormat"; } String getName() const override { return "CSVRowInputFormat"; }
bool readRow(MutableColumns & columns, RowReadExtension &) override; bool readRow(MutableColumns & columns, RowReadExtension & ext) override;
void readPrefix() override; void readPrefix() override;
bool allowSyncAfterError() const override { return true; } bool allowSyncAfterError() const override { return true; }
void syncAfterError() override; void syncAfterError() override;
std::string getDiagnosticInfo() override;
private: private:
bool with_names; bool with_names;
DataTypes data_types;
const FormatSettings format_settings; const FormatSettings format_settings;
DataTypes data_types;
using IndexesMap = std::unordered_map<String, size_t>; using IndexesMap = std::unordered_map<String, size_t>;
IndexesMap column_indexes_by_names; IndexesMap column_indexes_by_names;
@ -43,7 +41,7 @@ private:
using OptionalIndexes = std::vector<std::optional<size_t>>; using OptionalIndexes = std::vector<std::optional<size_t>>;
OptionalIndexes column_indexes_for_input_fields; OptionalIndexes column_indexes_for_input_fields;
/// Tracks which colums we have read in a single read() call. /// Tracks which columns we have read in a single read() call.
/// For columns that are never read, it is initialized to false when we /// For columns that are never read, it is initialized to false when we
/// read the file header, and never changed afterwards. /// read the file header, and never changed afterwards.
/// For other columns, it is updated on each read() call. /// For other columns, it is updated on each read() call.
@ -55,26 +53,19 @@ private:
void addInputColumn(const String & column_name); void addInputColumn(const String & column_name);
/// For convenient diagnostics in case of an error. bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override;
size_t row_num = 0; void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) override;
/// How many bytes were read, not counting those that are still in the buffer. bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override
size_t bytes_read_at_start_of_buffer_on_current_row = 0; {
size_t bytes_read_at_start_of_buffer_on_prev_row = 0; return *pos != '\n' && *pos != '\r' && *pos != format_settings.csv.delimiter;
}
char * pos_of_current_row = nullptr;
char * pos_of_prev_row = nullptr;
/// For setting input_format_null_as_default /// For setting input_format_null_as_default
DataTypes nullable_types; DataTypes nullable_types;
MutableColumns nullable_columns; MutableColumns nullable_columns;
OptionalIndexes column_idx_to_nullable_column_idx; OptionalIndexes column_idx_to_nullable_column_idx;
void updateDiagnosticInfo();
bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name);
bool readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column, size_t column_idx); bool readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column, size_t column_idx);
}; };

View File

@ -5,6 +5,7 @@
#include <Processors/Formats/Impl/TabSeparatedRowInputFormat.h> #include <Processors/Formats/Impl/TabSeparatedRowInputFormat.h>
#include <Formats/verbosePrintString.h> #include <Formats/verbosePrintString.h>
#include <Formats/FormatFactory.h> #include <Formats/FormatFactory.h>
#include <DataTypes/DataTypeNothing.h>
namespace DB namespace DB
{ {
@ -16,23 +17,23 @@ namespace ErrorCodes
} }
static void skipTSVRow(ReadBuffer & istr, const size_t num_columns) static void skipTSVRow(ReadBuffer & in, const size_t num_columns)
{ {
NullSink null_sink; NullSink null_sink;
for (size_t i = 0; i < num_columns; ++i) for (size_t i = 0; i < num_columns; ++i)
{ {
readEscapedStringInto(null_sink, istr); readEscapedStringInto(null_sink, in);
assertChar(i == num_columns - 1 ? '\n' : '\t', istr); assertChar(i == num_columns - 1 ? '\n' : '\t', in);
} }
} }
/** Check for a common error case - usage of Windows line feed. /** Check for a common error case - usage of Windows line feed.
*/ */
static void checkForCarriageReturn(ReadBuffer & istr) static void checkForCarriageReturn(ReadBuffer & in)
{ {
if (istr.position()[0] == '\r' || (istr.position() != istr.buffer().begin() && istr.position()[-1] == '\r')) if (in.position()[0] == '\r' || (in.position() != in.buffer().begin() && in.position()[-1] == '\r'))
throw Exception("\nYou have carriage return (\\r, 0x0D, ASCII 13) at end of first row." throw Exception("\nYou have carriage return (\\r, 0x0D, ASCII 13) at end of first row."
"\nIt's like your input data has DOS/Windows style line separators, that are illegal in TabSeparated format." "\nIt's like your input data has DOS/Windows style line separators, that are illegal in TabSeparated format."
" You must transform your file to Unix format." " You must transform your file to Unix format."
@ -41,9 +42,9 @@ static void checkForCarriageReturn(ReadBuffer & istr)
} }
TabSeparatedRowInputFormat::TabSeparatedRowInputFormat( TabSeparatedRowInputFormat::TabSeparatedRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_,
ReadBuffer & in_, Block header_, bool with_names_, bool with_types_, Params params_, const FormatSettings & format_settings_) bool with_names_, bool with_types_, const FormatSettings & format_settings_)
: IRowInputFormat(std::move(header_), in_, std::move(params_)), with_names(with_names_), with_types(with_types_), format_settings(format_settings_) : RowInputFormatWithDiagnosticInfo(header_, in_, params_), with_names(with_names_), with_types(with_types_), format_settings(format_settings_)
{ {
auto & sample = getPort().getHeader(); auto & sample = getPort().getHeader();
size_t num_columns = sample.columns(); size_t num_columns = sample.columns();
@ -173,9 +174,9 @@ bool TabSeparatedRowInputFormat::readRow(MutableColumns & columns, RowReadExtens
updateDiagnosticInfo(); updateDiagnosticInfo();
for (size_t input_position = 0; input_position < column_indexes_for_input_fields.size(); ++input_position) for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column)
{ {
const auto & column_index = column_indexes_for_input_fields[input_position]; const auto & column_index = column_indexes_for_input_fields[file_column];
if (column_index) if (column_index)
{ {
data_types[*column_index]->deserializeAsTextEscaped(*columns[*column_index], in, format_settings); data_types[*column_index]->deserializeAsTextEscaped(*columns[*column_index], in, format_settings);
@ -187,7 +188,7 @@ bool TabSeparatedRowInputFormat::readRow(MutableColumns & columns, RowReadExtens
} }
/// skip separators /// skip separators
if (input_position + 1 < column_indexes_for_input_fields.size()) if (file_column + 1 < column_indexes_for_input_fields.size())
{ {
assertChar('\t', in); assertChar('\t', in);
} }
@ -205,160 +206,35 @@ bool TabSeparatedRowInputFormat::readRow(MutableColumns & columns, RowReadExtens
return true; return true;
} }
bool TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out)
String TabSeparatedRowInputFormat::getDiagnosticInfo()
{ {
if (in.eof()) /// Buffer has gone, cannot extract information about what has been parsed. for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column)
return {};
auto & header = getPort().getHeader();
WriteBufferFromOwnString out;
MutableColumns columns = header.cloneEmptyColumns();
/// It is possible to display detailed diagnostics only if the last and next to last lines are still in the read buffer.
size_t bytes_read_at_start_of_buffer = in.count() - in.offset();
if (bytes_read_at_start_of_buffer != bytes_read_at_start_of_buffer_on_prev_row)
{ {
out << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n"; if (file_column == 0 && in.eof())
return out.str();
}
size_t max_length_of_column_name = 0;
for (size_t i = 0; i < header.columns(); ++i)
if (header.safeGetByPosition(i).name.size() > max_length_of_column_name)
max_length_of_column_name = header.safeGetByPosition(i).name.size();
size_t max_length_of_data_type_name = 0;
for (size_t i = 0; i < header.columns(); ++i)
if (header.safeGetByPosition(i).type->getName().size() > max_length_of_data_type_name)
max_length_of_data_type_name = header.safeGetByPosition(i).type->getName().size();
/// Roll back the cursor to the beginning of the previous or current line and pars all over again. But now we derive detailed information.
if (pos_of_prev_row)
{
in.position() = pos_of_prev_row;
out << "\nRow " << (row_num - 1) << ":\n";
if (!parseRowAndPrintDiagnosticInfo(columns, out, max_length_of_column_name, max_length_of_data_type_name))
return out.str();
}
else
{
if (!pos_of_current_row)
{
out << "Could not print diagnostic info because parsing of data hasn't started.\n";
return out.str();
}
in.position() = pos_of_current_row;
}
out << "\nRow " << row_num << ":\n";
parseRowAndPrintDiagnosticInfo(columns, out, max_length_of_column_name, max_length_of_data_type_name);
out << "\n";
return out.str();
}
bool TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
{
for (size_t input_position = 0; input_position < column_indexes_for_input_fields.size(); ++input_position)
{
if (input_position == 0 && in.eof())
{ {
out << "<End of stream>\n"; out << "<End of stream>\n";
return false; return false;
} }
if (column_indexes_for_input_fields[input_position].has_value()) if (column_indexes_for_input_fields[file_column].has_value())
{ {
const auto & column_index = *column_indexes_for_input_fields[input_position]; auto & header = getPort().getHeader();
const auto & current_column_type = data_types[column_index]; size_t col_idx = column_indexes_for_input_fields[file_column].value();
if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx], *columns[col_idx],
const auto & header = getPort().getHeader(); out, file_column))
out << "Column " << input_position << ", " << std::string((input_position < 10 ? 2 : input_position < 100 ? 1 : 0), ' ')
<< "name: " << header.safeGetByPosition(column_index).name << ", " << std::string(max_length_of_column_name - header.safeGetByPosition(column_index).name.size(), ' ')
<< "type: " << current_column_type->getName() << ", " << std::string(max_length_of_data_type_name - current_column_type->getName().size(), ' ');
auto prev_position = in.position();
std::exception_ptr exception;
try
{
current_column_type->deserializeAsTextEscaped(*columns[column_index], in, format_settings);
}
catch (...)
{
exception = std::current_exception();
}
auto curr_position = in.position();
if (curr_position < prev_position)
throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);
if (isNativeNumber(current_column_type) || isDateOrDateTime(current_column_type))
{
/// An empty string instead of a value.
if (curr_position == prev_position)
{
out << "ERROR: text ";
verbosePrintString(prev_position, std::min(prev_position + 10, in.buffer().end()), out);
out << " is not like " << current_column_type->getName() << "\n";
return false;
}
}
out << "parsed text: ";
verbosePrintString(prev_position, curr_position, out);
if (exception)
{
if (current_column_type->getName() == "DateTime")
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
else if (current_column_type->getName() == "Date")
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
else
out << "ERROR\n";
return false; return false;
}
out << "\n";
if (current_column_type->haveMaximumSizeOfValue())
{
if (*curr_position != '\n' && *curr_position != '\t')
{
out << "ERROR: garbage after " << current_column_type->getName() << ": ";
verbosePrintString(curr_position, std::min(curr_position + 10, in.buffer().end()), out);
out << "\n";
if (current_column_type->getName() == "DateTime")
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
else if (current_column_type->getName() == "Date")
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
return false;
}
}
} }
else else
{ {
static const String skipped_column_str = "<SKIPPED COLUMN>"; static const String skipped_column_str = "<SKIPPED COLUMN>";
out << "Column " << input_position << ", " << std::string((input_position < 10 ? 2 : input_position < 100 ? 1 : 0), ' ') static const DataTypePtr skipped_column_type = std::make_shared<DataTypeNothing>();
<< "name: " << skipped_column_str << ", " << std::string(max_length_of_column_name - skipped_column_str.length(), ' ') static const MutableColumnPtr skipped_column = skipped_column_type->createColumn();
<< "type: " << skipped_column_str << ", " << std::string(max_length_of_data_type_name - skipped_column_str.length(), ' '); if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, file_column))
return false;
NullSink null_sink;
readEscapedStringInto(null_sink, in);
} }
/// Delimiters /// Delimiters
if (input_position + 1 == column_indexes_for_input_fields.size()) if (file_column + 1 == column_indexes_for_input_fields.size())
{ {
if (!in.eof()) if (!in.eof())
{ {
@ -401,7 +277,8 @@ bool TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns &
{ {
out << "ERROR: Line feed found where tab is expected." out << "ERROR: Line feed found where tab is expected."
" It's like your file has less columns than expected.\n" " It's like your file has less columns than expected.\n"
"And if your file have right number of columns, maybe it have unescaped backslash in value before tab, which cause tab has escaped.\n"; "And if your file have right number of columns, "
"maybe it have unescaped backslash in value before tab, which cause tab has escaped.\n";
} }
else if (*in.position() == '\r') else if (*in.position() == '\r')
{ {
@ -421,6 +298,19 @@ bool TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns &
return true; return true;
} }
void TabSeparatedRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos)
{
prev_pos = in.position();
if (column_indexes_for_input_fields[file_column])
type->deserializeAsTextEscaped(column, in, format_settings);
else
{
NullSink null_sink;
readEscapedStringInto(null_sink, in);
}
curr_pos = in.position();
}
void TabSeparatedRowInputFormat::syncAfterError() void TabSeparatedRowInputFormat::syncAfterError()
{ {
@ -428,18 +318,6 @@ void TabSeparatedRowInputFormat::syncAfterError()
} }
/// Advances the per-row bookkeeping used for diagnostics:
/// the state remembered for the "current" row becomes the state of the "previous" row,
/// and the present state of the input buffer is remembered as the new "current" row.
void TabSeparatedRowInputFormat::updateDiagnosticInfo()
{
++row_num;
/// Shift "current" into "previous" before overwriting "current" (the order of these two statements matters).
bytes_read_at_start_of_buffer_on_prev_row = bytes_read_at_start_of_buffer_on_current_row;
/// Bytes read so far, not counting those still in the buffer.
bytes_read_at_start_of_buffer_on_current_row = in.count() - in.offset();
/// Same shift for the positions of the row starts inside the buffer.
pos_of_prev_row = pos_of_current_row;
pos_of_current_row = in.position();
}
void registerInputFormatProcessorTabSeparated(FormatFactory & factory) void registerInputFormatProcessorTabSeparated(FormatFactory & factory)
{ {
for (auto name : {"TabSeparated", "TSV"}) for (auto name : {"TabSeparated", "TSV"})
@ -451,7 +329,7 @@ void registerInputFormatProcessorTabSeparated(FormatFactory & factory)
IRowInputFormat::Params params, IRowInputFormat::Params params,
const FormatSettings & settings) const FormatSettings & settings)
{ {
return std::make_shared<TabSeparatedRowInputFormat>(buf, sample, false, false, std::move(params), settings); return std::make_shared<TabSeparatedRowInputFormat>(sample, buf, params, false, false, settings);
}); });
} }
@ -464,7 +342,7 @@ void registerInputFormatProcessorTabSeparated(FormatFactory & factory)
IRowInputFormat::Params params, IRowInputFormat::Params params,
const FormatSettings & settings) const FormatSettings & settings)
{ {
return std::make_shared<TabSeparatedRowInputFormat>(buf, sample, true, false, std::move(params), settings); return std::make_shared<TabSeparatedRowInputFormat>(sample, buf, params, true, false, settings);
}); });
} }
@ -477,7 +355,7 @@ void registerInputFormatProcessorTabSeparated(FormatFactory & factory)
IRowInputFormat::Params params, IRowInputFormat::Params params,
const FormatSettings & settings) const FormatSettings & settings)
{ {
return std::make_shared<TabSeparatedRowInputFormat>(buf, sample, true, true, std::move(params), settings); return std::make_shared<TabSeparatedRowInputFormat>(sample, buf, params, true, true, settings);
}); });
} }
} }

View File

@ -2,25 +2,22 @@
#include <Core/Block.h> #include <Core/Block.h>
#include <Formats/FormatSettings.h> #include <Formats/FormatSettings.h>
#include <Processors/Formats/IRowInputFormat.h> #include <Processors/Formats/RowInputFormatWithDiagnosticInfo.h>
namespace DB namespace DB
{ {
class ReadBuffer;
/** A stream to input data in tsv format. /** A stream to input data in tsv format.
*/ */
class TabSeparatedRowInputFormat : public IRowInputFormat class TabSeparatedRowInputFormat : public RowInputFormatWithDiagnosticInfo
{ {
public: public:
/** with_names - the first line is the header with the names of the columns /** with_names - the first line is the header with the names of the columns
* with_types - on the next line header with type names * with_types - on the next line header with type names
*/ */
TabSeparatedRowInputFormat( TabSeparatedRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_,
ReadBuffer & in_, Block header_, bool with_names_, bool with_types_, Params params_, const FormatSettings & format_settings_); bool with_names_, bool with_types_, const FormatSettings & format_settings_);
String getName() const override { return "TabSeparatedRowInputFormat"; } String getName() const override { return "TabSeparatedRowInputFormat"; }
@ -29,8 +26,6 @@ public:
bool allowSyncAfterError() const override { return true; } bool allowSyncAfterError() const override { return true; }
void syncAfterError() override; void syncAfterError() override;
std::string getDiagnosticInfo() override;
private: private:
bool with_names; bool with_names;
bool with_types; bool with_types;
@ -50,21 +45,10 @@ private:
void setupAllColumnsByTableSchema(); void setupAllColumnsByTableSchema();
void fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension& ext); void fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension& ext);
/// For convenient diagnostics in case of an error. bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override;
void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
size_t row_num = 0; ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) override;
bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\t'; }
/// How many bytes were read, not counting those still in the buffer.
size_t bytes_read_at_start_of_buffer_on_current_row = 0;
size_t bytes_read_at_start_of_buffer_on_prev_row = 0;
char * pos_of_current_row = nullptr;
char * pos_of_prev_row = nullptr;
void updateDiagnosticInfo();
bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name);
}; };
} }

View File

@ -0,0 +1,243 @@
#include <Processors/Formats/Impl/TemplateBlockOutputFormat.h>
#include <Formats/FormatFactory.h>
#include <IO/WriteHelpers.h>
#include <DataTypes/DataTypesNumber.h>
namespace DB
{
namespace ErrorCodes
{
extern const int SYNTAX_ERROR;
}
/// Parses and validates the two format strings taken from settings:
///  - template_settings.format describes the whole output ("${data}" is used if it is empty);
///  - template_settings.row_format describes a single row.
/// Any problem is reported through throwInvalidFormat of the corresponding parsed format string.
TemplateBlockOutputFormat::TemplateBlockOutputFormat(const Block & header_, WriteBuffer & out_, const FormatSettings & settings_)
    : IOutputFormat(header_, out_), settings(settings_)
{
    auto & sample = getPort(PortKind::Main).getHeader();

    /// Remember the data type of every column of the header.
    const size_t column_count = sample.columns();
    types.resize(column_count);
    for (size_t pos = 0; pos < column_count; ++pos)
        types[pos] = sample.safeGetByPosition(pos).type;

    /// Parse format string for whole output
    static const String default_format("${data}");
    const bool use_default_format = settings.template_settings.format.empty();
    const String & format_str = use_default_format ? default_format : settings.template_settings.format;
    format = ParsedTemplateFormatString(format_str, [&](const String & partName)
    {
        return static_cast<size_t>(stringToOutputPart(partName));
    });

    /// Validate format string for whole output.
    /// The initial value of data_part_idx can never be a real index, so it means "${data} was not found".
    const size_t part_count = format.format_idx_to_column_idx.size();
    size_t data_part_idx = part_count + 1;
    for (size_t part_idx = 0; part_idx < part_count; ++part_idx)
    {
        if (!format.format_idx_to_column_idx[part_idx])
            format.throwInvalidFormat("Output part name cannot be empty, it's a bug.", part_idx);

        const auto part = static_cast<OutputPart>(*format.format_idx_to_column_idx[part_idx]);
        if (part == OutputPart::Data)
            data_part_idx = part_idx;

        /// Parts that are printed row by row with row_format must not have their own serialization type,
        /// every other part is a single value and must have one.
        const bool is_printed_as_rows = part == OutputPart::Data || part == OutputPart::Totals
            || part == OutputPart::ExtremesMin || part == OutputPart::ExtremesMax;
        const bool has_serialization_type = format.formats[part_idx] != ColumnFormat::None;

        if (is_printed_as_rows)
        {
            if (has_serialization_type)
                format.throwInvalidFormat("Serialization type for data, totals, min and max must be empty or None", part_idx);
        }
        else if (!has_serialization_type)
        {
            format.throwInvalidFormat("Serialization type for output part rows, rows_before_limit, time, "
                                      "rows_read or bytes_read is not specified", part_idx);
        }
    }

    if (data_part_idx != 0)
        format.throwInvalidFormat("${data} must be the first output part", 0);

    /// Parse format string for rows
    row_format = ParsedTemplateFormatString(settings.template_settings.row_format, [&](const String & colName)
    {
        return sample.getPositionByName(colName);
    });

    /// Validate format string for rows
    if (row_format.delimiters.size() == 1)
        row_format.throwInvalidFormat("No columns specified", 0);

    for (size_t field = 0; field < row_format.columnsCount(); ++field)
    {
        if (!row_format.format_idx_to_column_idx[field])
            row_format.throwInvalidFormat("Cannot skip format field for output, it's a bug.", field);
        if (row_format.formats[field] == ColumnFormat::None)
            row_format.throwInvalidFormat("Serialization type for file column is not specified", field);
    }
}
/// Converts the name of an output part, as written in the format string, to OutputPart.
/// Throws an exception with code SYNTAX_ERROR if the name is not known.
TemplateBlockOutputFormat::OutputPart TemplateBlockOutputFormat::stringToOutputPart(const String & part)
{
    struct NamedPart
    {
        const char * name;
        OutputPart value;
    };

    static constexpr NamedPart known_parts[] =
    {
        {"data", OutputPart::Data},
        {"totals", OutputPart::Totals},
        {"min", OutputPart::ExtremesMin},
        {"max", OutputPart::ExtremesMax},
        {"rows", OutputPart::Rows},
        {"rows_before_limit", OutputPart::RowsBeforeLimit},
        {"time", OutputPart::TimeElapsed},
        {"rows_read", OutputPart::RowsRead},
        {"bytes_read", OutputPart::BytesRead},
    };

    for (const auto & candidate : known_parts)
        if (part == candidate.name)
            return candidate.value;

    throw Exception("Unknown output part " + part, ErrorCodes::SYNTAX_ERROR);
}
void TemplateBlockOutputFormat::writeRow(const Chunk & chunk, size_t row_num)
{
size_t columns = row_format.format_idx_to_column_idx.size();
for (size_t j = 0; j < columns; ++j)
{
writeString(row_format.delimiters[j], out);
size_t col_idx = *row_format.format_idx_to_column_idx[j];
serializeField(*chunk.getColumns()[col_idx], *types[col_idx], row_num, row_format.formats[j]);
}
writeString(row_format.delimiters[columns], out);
}
/// Writes a single value (row row_num of the column) to the output using the requested text serialization.
/// col_format must be one of Escaped, Quoted, Csv, Json, Xml or Raw; any other value is treated as unreachable.
void TemplateBlockOutputFormat::serializeField(const IColumn & column, const IDataType & type, size_t row_num, ColumnFormat col_format)
{
    switch (col_format)
    {
        case ColumnFormat::Escaped:
            type.serializeAsTextEscaped(column, row_num, out, settings);
            return;
        case ColumnFormat::Quoted:
            type.serializeAsTextQuoted(column, row_num, out, settings);
            return;
        case ColumnFormat::Csv:
            type.serializeAsTextCSV(column, row_num, out, settings);
            return;
        case ColumnFormat::Json:
            type.serializeAsTextJSON(column, row_num, out, settings);
            return;
        case ColumnFormat::Xml:
            type.serializeAsTextXML(column, row_num, out, settings);
            return;
        case ColumnFormat::Raw:
            type.serializeAsText(column, row_num, out, settings);
            return;
        default:
            __builtin_unreachable();
    }
}
/// Writes a standalone value that is not part of any chunk.
/// U is the type of the value, V is the data type class used to serialize it:
/// the value is put into a temporary one-row column of type V and serialized from there.
template <typename U, typename V> void TemplateBlockOutputFormat::writeValue(U value, ColumnFormat col_format)
{
    auto value_type = std::make_unique<V>();
    auto single_value_column = value_type->createColumn();
    single_value_column->insert(value);
    serializeField(*single_value_column, *value_type, 0, col_format);
}
/// Writes all rows of the chunk. Rows are separated by row_between_delimiter;
/// the delimiter is not written before the very first row of the output.
void TemplateBlockOutputFormat::consume(Chunk chunk)
{
    doWritePrefix();

    const size_t rows_in_chunk = chunk.getNumRows();
    for (size_t row = 0; row < rows_in_chunk; ++row, ++row_count)
    {
        /// row_count counts rows across all chunks, so the check also works for the first row of later chunks.
        if (row_count != 0)
            writeString(settings.template_settings.row_between_delimiter, out);

        writeRow(chunk, row);
    }
}
/// Writes the first delimiter of the whole-output format string (the text before ${data}).
/// It is written only once, no matter how many times the function is called.
void TemplateBlockOutputFormat::doWritePrefix()
{
    if (!need_write_prefix)
        return;

    writeString(format.delimiters.front(), out);
    need_write_prefix = false;
}
/// Writes everything that follows the data: for every output part of the format string
/// its value (totals, extremes, counters, elapsed time) and the delimiter after it.
/// The prefix is written first if no data has been written yet.
/// Does nothing if the output is already finalized.
/// Reports an invalid format (throwInvalidFormat) if the format string asks for totals, extremes
/// or rows_before_limit, but they were not provided for this query.
void TemplateBlockOutputFormat::finalize()
{
    if (finalized)
        return;

    doWritePrefix();

    size_t parts = format.format_idx_to_column_idx.size();

    for (size_t i = 0; i < parts; ++i)
    {
        switch (static_cast<OutputPart>(*format.format_idx_to_column_idx[i]))
        {
            case OutputPart::Totals:
                if (!totals)
                    format.throwInvalidFormat("Cannot print totals for this request", i);
                writeRow(totals, 0);
                break;
            case OutputPart::ExtremesMin:
                if (!extremes)
                    format.throwInvalidFormat("Cannot print extremes for this request", i);
                /// Row 0 of the extremes chunk is written for "min".
                writeRow(extremes, 0);
                break;
            case OutputPart::ExtremesMax:
                if (!extremes)
                    format.throwInvalidFormat("Cannot print extremes for this request", i);
                /// Row 1 of the extremes chunk is written for "max".
                writeRow(extremes, 1);
                break;
            case OutputPart::Rows:
                writeValue<size_t, DataTypeUInt64>(row_count, format.formats[i]);
                break;
            case OutputPart::RowsBeforeLimit:
                if (!rows_before_limit_set)
                    format.throwInvalidFormat("Cannot print rows_before_limit for this request", i);
                writeValue<size_t, DataTypeUInt64>(rows_before_limit, format.formats[i]);
                break;
            case OutputPart::TimeElapsed:
                writeValue<double, DataTypeFloat64>(watch.elapsedSeconds(), format.formats[i]);
                break;
            case OutputPart::RowsRead:
                writeValue<size_t, DataTypeUInt64>(progress.read_rows.load(), format.formats[i]);
                break;
            case OutputPart::BytesRead:
                writeValue<size_t, DataTypeUInt64>(progress.read_bytes.load(), format.formats[i]);
                break;
            default:
                /// OutputPart::Data: nothing is written for it here, only the delimiter that follows it.
                break;
        }

        /// Delimiter number i + 1 is the text that follows output part number i.
        writeString(format.delimiters[i + 1], out);
    }

    finalized = true;
}
/// Registers the output format named "Template" in the factory.
void registerOutputFormatProcessorTemplate(FormatFactory & factory)
{
    auto create_template_output = [](
        WriteBuffer & buf,
        const Block & sample,
        const Context &,
        FormatFactory::WriteCallback,
        const FormatSettings & settings)
    {
        return std::make_shared<TemplateBlockOutputFormat>(sample, buf, settings);
    };

    factory.registerOutputFormatProcessor("Template", create_template_output);
}
}

View File

@ -0,0 +1,68 @@
#pragma once
#include <Common/Stopwatch.h>
#include <Core/Block.h>
#include <Formats/FormatSettings.h>
#include <Processors/Formats/IOutputFormat.h>
#include <Formats/ParsedTemplateFormatString.h>
namespace DB
{
/** Output format that writes the result according to two format strings taken from FormatSettings:
  * one for the whole output and one for a single row.
  * Rows are written as they arrive; totals, extremes and statistics are remembered and written in finalize().
  */
class TemplateBlockOutputFormat : public IOutputFormat
{
using ColumnFormat = ParsedTemplateFormatString::ColumnFormat;
public:
TemplateBlockOutputFormat(const Block & header_, WriteBuffer & out_, const FormatSettings & settings_);
String getName() const override { return "TemplateBlockOutputFormat"; }
/// Writes the text that precedes the data; repeated calls write nothing.
void doWritePrefix() override;
/// Remembers the value and the fact that it was provided.
void setRowsBeforeLimit(size_t rows_before_limit_) override { rows_before_limit = rows_before_limit_; rows_before_limit_set = true; }
/// Accumulates progress.
void onProgress(const Progress & progress_) override { progress.incrementPiecewiseAtomically(progress_); }
protected:
void consume(Chunk chunk) override;
/// Totals and extremes are only stored here.
void consumeTotals(Chunk chunk) override { totals = std::move(chunk); }
void consumeExtremes(Chunk chunk) override { extremes = std::move(chunk); }
void finalize() override;
/// Parts of the output that can be referenced from the format string for the whole output.
enum class OutputPart : size_t
{
Data,
Totals,
ExtremesMin,
ExtremesMax,
Rows,
RowsBeforeLimit,
TimeElapsed,
RowsRead,
BytesRead
};
/// Converts the name used in the format string to OutputPart, throws if the name is unknown.
OutputPart stringToOutputPart(const String & part);
/// Writes one row of the chunk according to row_format.
void writeRow(const Chunk & chunk, size_t row_num);
/// Writes one value of the column using the specified text serialization.
void serializeField(const IColumn & column, const IDataType & type, size_t row_num, ColumnFormat format);
/// Writes a standalone value of type U by serializing it as data type V.
template <typename U, typename V> void writeValue(U value, ColumnFormat col_format);
protected:
const FormatSettings settings;
/// Data types of the columns of the header, by position.
DataTypes types;
/// Parsed format string for the whole output.
ParsedTemplateFormatString format;
/// Parsed format string for a single row.
ParsedTemplateFormatString row_format;
size_t rows_before_limit = 0;
bool rows_before_limit_set = false;
Chunk totals;
Chunk extremes;
Progress progress;
Stopwatch watch;
/// Number of rows written so far.
size_t row_count = 0;
/// True until the prefix is written.
bool need_write_prefix = true;
};
}

View File

@ -0,0 +1,520 @@
#include <Processors/Formats/Impl/TemplateRowInputFormat.h>
#include <Formats/FormatFactory.h>
#include <Formats/verbosePrintString.h>
#include <IO/Operators.h>
#include <DataTypes/DataTypeNothing.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ATTEMPT_TO_READ_AFTER_EOF;
extern const int CANNOT_READ_ALL_DATA;
extern const int CANNOT_PARSE_ESCAPE_SEQUENCE;
extern const int CANNOT_PARSE_QUOTED_STRING;
extern const int SYNTAX_ERROR;
}
/// Parses and validates the two format strings taken from settings:
///  - template_settings.format describes the whole input ("${data}" is used if it is empty);
///  - template_settings.row_format describes a single row.
/// Any problem is reported through throwInvalidFormat of the corresponding parsed format string.
/// Note that the base class is given a reference to the member buf, which is initialized from in_ after the base class.
TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_,
const FormatSettings & settings_, bool ignore_spaces_)
: RowInputFormatWithDiagnosticInfo(header_, buf, params_), buf(in_), data_types(header_.getDataTypes()),
settings(settings_), ignore_spaces(ignore_spaces_)
{
/// Parse format string for whole input
static const String default_format("${data}");
const String & format_str = settings.template_settings.format.empty() ? default_format : settings.template_settings.format;
/// The only named part allowed in the input is "data" (mapped to index 0); an empty name means a field to skip.
format = ParsedTemplateFormatString(format_str, [&](const String & partName) -> std::optional<size_t>
{
if (partName == "data")
return 0;
else if (partName.empty()) /// For skipping some values in prefix and suffix
#if !__clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif
/// Suppress false-positive warning (bug in GCC 9: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86465)
return {};
#if !__clang__
#pragma GCC diagnostic pop
#endif
throw Exception("Unknown input part " + partName, ErrorCodes::SYNTAX_ERROR);
});
/// Validate format string for whole input
/// NOTE(review): nothing below reports a format string that has no ${data} at all;
/// in that case format_data_idx keeps its initial value - confirm that this is intended.
bool has_data = false;
for (size_t i = 0; i < format.columnsCount(); ++i)
{
if (format.format_idx_to_column_idx[i])
{
if (has_data)
format.throwInvalidFormat("${data} can occur only once", i);
if (format.formats[i] != ColumnFormat::None)
format.throwInvalidFormat("${data} must have empty or None deserialization type", i);
has_data = true;
/// Remember where ${data} is: parts before it form the prefix, parts after it form the suffix.
format_data_idx = i;
}
else
{
if (format.formats[i] == ColumnFormat::Xml || format.formats[i] == ColumnFormat::Raw)
format.throwInvalidFormat("XML and Raw deserialization is not supported", i);
}
}
/// Parse format string for rows
/// An empty column name means a field to skip, any other name is looked up in the header.
row_format = ParsedTemplateFormatString(settings.template_settings.row_format, [&](const String & colName) -> std::optional<size_t>
{
if (colName.empty())
#if !__clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif
return {};
#if !__clang__
#pragma GCC diagnostic pop
#endif
return header_.getPositionByName(colName);
});
/// Validate format string for rows
/// column_in_format[k] becomes true when column k of the header is met in the format string; used to detect duplicates.
std::vector<UInt8> column_in_format(header_.columns(), false);
for (size_t i = 0; i < row_format.columnsCount(); ++i)
{
if (row_format.formats[i] == ColumnFormat::Xml || row_format.formats[i] == ColumnFormat::Raw)
row_format.throwInvalidFormat("XML and Raw deserialization is not supported", i);
if (row_format.format_idx_to_column_idx[i])
{
if (row_format.formats[i] == ColumnFormat::None)
row_format.throwInvalidFormat("Column is not skipped, but deserialization type is None", i);
size_t col_idx = *row_format.format_idx_to_column_idx[i];
if (column_in_format[col_idx])
row_format.throwInvalidFormat("Duplicate column", i);
column_in_format[col_idx] = true;
}
}
}
void TemplateRowInputFormat::readPrefix()
{
size_t last_successfully_parsed_idx = 0;
try
{
tryReadPrefixOrSuffix<void>(last_successfully_parsed_idx, format_data_idx);
}
catch (Exception & e)
{
format.throwInvalidFormat(e.message() + " While parsing prefix", last_successfully_parsed_idx);
}
}
/// Asserts delimiters and skips fields in prefix or suffix.
/// tryReadPrefixOrSuffix<bool>(...) is used in checkForSuffix() to avoid throwing an exception after read of each row
/// (most likely false will be returned on first call of checkString(...))
///
/// Parts of the format string for the whole input from input_part_beg (inclusive) to input_part_end (exclusive) are processed.
/// input_part_beg is incremented after each skipped field, so on failure the caller can see how far parsing went.
/// With ReturnType = void a mismatch is reported by an exception and nothing is returned.
/// With ReturnType = bool a delimiter mismatch or one of the listed parsing errors makes the function return false,
/// and true is returned when everything matched; other exceptions are rethrown.
template <typename ReturnType>
ReturnType TemplateRowInputFormat::tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end)
{
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
skipSpaces();
/// The delimiter before the first field of the range.
if constexpr (throw_exception)
assertString(format.delimiters[input_part_beg], buf);
else
{
if (likely(!checkString(format.delimiters[input_part_beg], buf)))
return ReturnType(false);
}
/// Then repeatedly: a field to skip followed by the delimiter after it.
while (input_part_beg < input_part_end)
{
skipSpaces();
if constexpr (throw_exception)
skipField(format.formats[input_part_beg]);
else
{
try
{
skipField(format.formats[input_part_beg]);
}
catch (const Exception & e)
{
if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF &&
e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE &&
e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING)
throw;
/// If it's parsing error, then suffix is not found
return ReturnType(false);
}
}
++input_part_beg;
skipSpaces();
if constexpr (throw_exception)
assertString(format.delimiters[input_part_beg], buf);
else
{
if (likely(!checkString(format.delimiters[input_part_beg], buf)))
return ReturnType(false);
}
}
if constexpr (!throw_exception)
return ReturnType(true);
}
/// Reads one row according to row_format.
/// Returns false if there are no more rows (the suffix is found), true if a row has been read.
/// Columns that are not mentioned in the format string are filled with default values;
/// extra.read_columns tells which columns were actually read from the input.
bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & extra)
{
    /// This function can be called again after it returned false
    if (unlikely(end_of_stream))
        return false;

    skipSpaces();

    if (unlikely(checkForSuffix()))
    {
        end_of_stream = true;
        return false;
    }

    updateDiagnosticInfo();

    /// The delimiter between rows is expected before every row except the first one.
    if (likely(row_num != 1))
        assertString(settings.template_settings.row_between_delimiter, buf);

    extra.read_columns.assign(columns.size(), false);

    for (size_t field = 0; field < row_format.columnsCount(); ++field)
    {
        skipSpaces();
        assertString(row_format.delimiters[field], buf);
        skipSpaces();

        if (!row_format.format_idx_to_column_idx[field])
        {
            /// The field is not mapped to any column: parse it and drop the value.
            skipField(row_format.formats[field]);
            continue;
        }

        const size_t column_pos = *row_format.format_idx_to_column_idx[field];
        deserializeField(*data_types[column_pos], *columns[column_pos], row_format.formats[field]);
        extra.read_columns[column_pos] = true;
    }

    skipSpaces();
    assertString(row_format.delimiters.back(), buf);

    for (size_t column_pos = 0; column_pos < columns.size(); ++column_pos)
    {
        if (extra.read_columns[column_pos])
            continue;
        data_types[column_pos]->insertDefaultInto(*columns[column_pos]);
    }

    return true;
}
/// Reads one value from the input into the column using the requested text deserialization.
/// col_format must be one of Escaped, Quoted, Csv or Json; any other value is treated as unreachable.
/// An attempt to read after EOF is reported by throwUnexpectedEof(), other exceptions are rethrown as is.
void TemplateRowInputFormat::deserializeField(const IDataType & type, IColumn & column, ColumnFormat col_format)
{
    try
    {
        switch (col_format)
        {
            case ColumnFormat::Escaped:
                type.deserializeAsTextEscaped(column, buf, settings);
                return;
            case ColumnFormat::Quoted:
                type.deserializeAsTextQuoted(column, buf, settings);
                return;
            case ColumnFormat::Csv:
                type.deserializeAsTextCSV(column, buf, settings);
                return;
            case ColumnFormat::Json:
                type.deserializeAsTextJSON(column, buf, settings);
                return;
            default:
                __builtin_unreachable();
        }
    }
    catch (Exception & e)
    {
        const bool hit_eof = e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF;
        if (hit_eof)
            throwUnexpectedEof();
        throw;
    }
}
/// Parses one value in the requested format and drops it.
/// col_format must be one of None, Escaped, Quoted, Csv or Json; any other value is treated as unreachable.
/// An attempt to read after EOF is reported by throwUnexpectedEof(), other exceptions are rethrown as is.
void TemplateRowInputFormat::skipField(TemplateRowInputFormat::ColumnFormat col_format)
{
    /// Receives the values that are read only to be dropped.
    String discarded;
    /// Name of the field passed to skipJSONField; the length is taken without the terminating zero.
    static constexpr char skipped_field_name[] = "<SKIPPED COLUMN>";

    try
    {
        switch (col_format)
        {
            case ColumnFormat::None:
                /// Empty field, just skip spaces
                return;
            case ColumnFormat::Escaped:
                readEscapedString(discarded, buf);
                return;
            case ColumnFormat::Quoted:
                readQuotedString(discarded, buf);
                return;
            case ColumnFormat::Csv:
                readCSVString(discarded, buf, settings.csv);
                return;
            case ColumnFormat::Json:
                skipJSONField(buf, StringRef(skipped_field_name, sizeof(skipped_field_name) - 1));
                return;
            default:
                __builtin_unreachable();
        }
    }
    catch (Exception & e)
    {
        const bool hit_eof = e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF;
        if (hit_eof)
            throwUnexpectedEof();
        throw;
    }
}
/// Returns true if all rows have been read i.e. there are only suffix and spaces (if ignore_spaces == true) before EOF.
/// Otherwise returns false
/// When false is returned, the buffer is rolled back to the position it had on entry.
bool TemplateRowInputFormat::checkForSuffix()
{
/// Remember the position to be able to return to it if this is not the suffix.
PeekableReadBufferCheckpoint checkpoint{buf};
bool suffix_found = false;
/// The suffix consists of the parts of the format string that follow ${data}.
size_t last_successfully_parsed_idx = format_data_idx + 1;
try
{
suffix_found = tryReadPrefixOrSuffix<bool>(last_successfully_parsed_idx, format.columnsCount());
}
catch (const Exception & e)
{
/// The listed errors mean "this is not the suffix"; anything else is rethrown.
if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF &&
e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE &&
e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING)
throw;
}
/// A matching suffix counts only if nothing but (optionally) spaces follows it.
if (unlikely(suffix_found))
{
skipSpaces();
if (buf.eof())
return true;
}
buf.rollbackToCheckpoint();
return false;
}
/// Parses a row once more, writing a description of what is being parsed and of the first problem found to out.
/// First explains why the input at the current position was not accepted as the suffix,
/// then parses the row field by field.
/// Returns true if the whole row has been parsed without problems, false otherwise.
bool TemplateRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out)
{
out << "Suffix does not match: ";
size_t last_successfully_parsed_idx = format_data_idx + 1;
/// Remembered to check later that the position was restored after the attempt to read the suffix.
const ReadBuffer::Position row_begin_pos = buf.position();
bool caught = false;
try
{
/// NOTE(review): the second argument presumably makes the checkpoint roll the buffer back
/// when it goes out of scope - confirm against PeekableReadBufferCheckpoint.
PeekableReadBufferCheckpoint checkpoint{buf, true};
tryReadPrefixOrSuffix<void>(last_successfully_parsed_idx, format.columnsCount());
}
catch (Exception & e)
{
out << e.message() << " Near column " << last_successfully_parsed_idx;
caught = true;
}
/// No exception means that the suffix itself matched, so the problem is the data that follows it.
if (!caught)
{
out << " There is some data after suffix (EOF expected, got ";
verbosePrintString(buf.position(), std::min(buf.buffer().end(), buf.position() + 16), out);
out << "). ";
}
out << " Format string (from format_schema): \n" << format.dump() << "\n";
if (row_begin_pos != buf.position())
{
/// Pointers to buffer memory were invalidated during checking for suffix
out << "\nCannot print more diagnostic info.";
return false;
}
out << "\nUsing format string (from format_schema_rows): " << row_format.dump() << "\n";
out << "\nTrying to parse next row, because suffix does not match:\n";
/// The delimiter between rows is expected before every row except the first one.
try
{
if (likely(row_num != 1))
assertString(settings.template_settings.row_between_delimiter, buf);
}
catch (const DB::Exception &)
{
writeErrorStringForWrongDelimiter(out, "delimiter between rows", settings.template_settings.row_between_delimiter);
return false;
}
/// For every field: the delimiter before it, then the field itself.
for (size_t i = 0; i < row_format.columnsCount(); ++i)
{
skipSpaces();
try
{
assertString(row_format.delimiters[i], buf);
}
catch (const DB::Exception &)
{
writeErrorStringForWrongDelimiter(out, "delimiter before field " + std::to_string(i), row_format.delimiters[i]);
return false;
}
skipSpaces();
if (row_format.format_idx_to_column_idx[i])
{
auto & header = getPort().getHeader();
size_t col_idx = *row_format.format_idx_to_column_idx[i];
if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx],
*columns[col_idx], out, i))
{
out << "Maybe it's not possible to deserialize field " + std::to_string(i) +
" as " + ParsedTemplateFormatString::formatToString(row_format.formats[i]);
return false;
}
}
else
{
/// A field to skip is reported under a placeholder name, with placeholder type and column.
static const String skipped_column_str = "<SKIPPED COLUMN>";
static const DataTypePtr skipped_column_type = std::make_shared<DataTypeNothing>();
static const MutableColumnPtr skipped_column = skipped_column_type->createColumn();
if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, i))
return false;
}
}
skipSpaces();
/// The delimiter that closes the row.
try
{
assertString(row_format.delimiters.back(), buf);
}
catch (const DB::Exception &)
{
writeErrorStringForWrongDelimiter(out, "delimiter after last field", row_format.delimiters.back());
return false;
}
return true;
}
/// Writes to out a message that the expected delimiter (delim, described by description) was not found,
/// together with what is in the input at the current position instead of it.
void TemplateRowInputFormat::writeErrorStringForWrongDelimiter(WriteBuffer & out, const String & description, const String & delim)
{
    out << "ERROR: There is no " << description << ": expected ";
    verbosePrintString(delim.data(), delim.data() + delim.size(), out);

    out << ", got ";
    if (!buf.eof())
    {
        /// Show as many bytes as the delimiter has plus 10 more, but not beyond the end of the buffer.
        const auto shown_end = std::min(buf.position() + delim.size() + 10, buf.buffer().end());
        verbosePrintString(buf.position(), shown_end, out);
    }
    else
        out << "<End of stream>";

    out << '\n';
}
/// Reads field number file_column of the row format and reports the buffer positions before and after the read.
/// If the field is not mapped to a column, the value is parsed and dropped.
/// prev_pos is always set; curr_pos is set only if reading did not throw.
void TemplateRowInputFormat::tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
    ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos)
{
    prev_pos = buf.position();

    const auto field_format = row_format.formats[file_column];
    if (!row_format.format_idx_to_column_idx[file_column])
        skipField(field_format);
    else
        deserializeField(*type, column, field_format);

    curr_pos = buf.position();
}
/// Always returns false: this format does not report garbage after a field on its own, both arguments are ignored.
bool TemplateRowInputFormat::isGarbageAfterField(size_t, ReadBuffer::Position)
{
/// Garbage will be considered as wrong delimiter
return false;
}
/// Returns true if at least one of the two delimiters that syncAfterError() searches for is not empty:
/// the delimiter after the last field of a row or the delimiter between rows.
bool TemplateRowInputFormat::allowSyncAfterError() const
{
    const bool has_delimiter_after_row = !row_format.delimiters.back().empty();
    const bool has_delimiter_between_rows = !settings.template_settings.row_between_delimiter.empty();
    return has_delimiter_after_row || has_delimiter_between_rows;
}
/// Tries to move the input to the beginning of the next row after a parsing error,
/// using the delimiter after the last field of a row and the delimiter between rows as landmarks.
/// Sets end_of_stream if EOF is reached while searching.
void TemplateRowInputFormat::syncAfterError()
{
bool at_beginning_of_row_or_eof = false;
while (!at_beginning_of_row_or_eof)
{
/// Look for the delimiter that ends a row.
skipToNextDelimiterOrEof(row_format.delimiters.back());
if (buf.eof())
{
end_of_stream = true;
return;
}
buf.ignore(row_format.delimiters.back().size());
skipSpaces();
/// If only the suffix is left, there is nothing more to synchronize with.
if (checkForSuffix())
return;
bool last_delimiter_in_row_found = !row_format.delimiters.back().empty();
/// The delimiter between rows right here means that the next row starts at this position.
if (last_delimiter_in_row_found && checkString(settings.template_settings.row_between_delimiter, buf))
at_beginning_of_row_or_eof = true;
else
skipToNextDelimiterOrEof(settings.template_settings.row_between_delimiter);
if (buf.eof())
at_beginning_of_row_or_eof = end_of_stream = true;
}
/// It can happen that buf.position() is not at the beginning of a row
/// if some other delimiters are similar to row_format.delimiters.back() and row_between_delimiter.
/// It will cause another parsing error.
}
/// Searches for delimiter in input stream, stopping at EOF if it is not found.
/// Does nothing if delimiter is empty.
/// NOTE(review): the original comment said that the position is set to the beginning of the found delimiter,
/// but after a successful checkString() the function returns without rolling back to the checkpoint.
/// If checkString() consumes the text it has matched, the position is left right after the delimiter - confirm
/// which of the two the callers expect.
void TemplateRowInputFormat::skipToNextDelimiterOrEof(const String & delimiter)
{
if (delimiter.empty())
return;
while (!buf.eof())
{
/// Fast search for the first character of the delimiter in the available part of the buffer.
void * pos = memchr(buf.position(), delimiter[0], buf.available());
if (!pos)
{
/// Not here: drop everything that is available and let eof() check for more data.
buf.position() += buf.available();
continue;
}
buf.position() = static_cast<ReadBuffer::Position>(pos);
/// The first character matched, now compare the whole delimiter.
PeekableReadBufferCheckpoint checkpoint{buf};
if (checkString(delimiter, buf))
return;
/// False match: return to the candidate position and continue the search from the next character.
buf.rollbackToCheckpoint();
++buf.position();
}
}
void TemplateRowInputFormat::throwUnexpectedEof()
{
throw Exception("Unexpected EOF while parsing row " + std::to_string(row_num) + ". "
"Maybe last row has wrong format or input doesn't contain specified suffix before EOF.",
ErrorCodes::CANNOT_READ_ALL_DATA);
}
/// Registers two input formats backed by TemplateRowInputFormat:
/// "Template" (ignore_spaces = false) and "TemplateIgnoreSpaces" (ignore_spaces = true).
void registerInputFormatProcessorTemplate(FormatFactory & factory)
{
    auto register_variant = [&factory](const char * format_name, bool ignore_spaces)
    {
        factory.registerInputFormatProcessor(format_name, [ignore_spaces](
            ReadBuffer & buf,
            const Block & sample,
            const Context &,
            IRowInputFormat::Params params,
            const FormatSettings & settings)
        {
            return std::make_shared<TemplateRowInputFormat>(sample, buf, params, settings, ignore_spaces);
        });
    };

    /// Same registration order as before: the variant without space skipping goes first.
    register_variant("Template", false);
    register_variant("TemplateIgnoreSpaces", true);
}
}

View File

@ -0,0 +1,61 @@
#pragma once
#include <Core/Block.h>
#include <Processors/Formats/RowInputFormatWithDiagnosticInfo.h>
#include <Formats/FormatSettings.h>
#include <Formats/ParsedTemplateFormatString.h>
#include <IO/ReadHelpers.h>
#include <IO/PeekableReadBuffer.h>
namespace DB
{
/// Row input format that is registered under the names "Template" and "TemplateIgnoreSpaces".
/// Builds on RowInputFormatWithDiagnosticInfo and overrides its diagnostic hooks.
class TemplateRowInputFormat : public RowInputFormatWithDiagnosticInfo
{
using ColumnFormat = ParsedTemplateFormatString::ColumnFormat;
public:
TemplateRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_,
const FormatSettings & settings_, bool ignore_spaces_);
String getName() const override { return "TemplateRowInputFormat"; }
bool readRow(MutableColumns & columns, RowReadExtension & extra) override;
void readPrefix() override;
/// Error recovery: see the definitions for the exact conditions.
bool allowSyncAfterError() const override;
void syncAfterError() override;
private:
void deserializeField(const IDataType & type, IColumn & column, ColumnFormat col_format);
void skipField(ColumnFormat col_format);
/// Skips whitespace in `buf`, but only when the format was created with ignore_spaces = true.
inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(buf); }
template <typename ReturnType = void>
ReturnType tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end);
bool checkForSuffix();
[[noreturn]] void throwUnexpectedEof();
/// Hooks required by RowInputFormatWithDiagnosticInfo.
bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override;
/// NOTE(review): "Filed" looks like a typo of "Field". The name is fixed by the pure virtual in
/// RowInputFormatWithDiagnosticInfo, so a rename has to touch the base class and every override together.
void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column, ReadBuffer::Position & prev_pos,
ReadBuffer::Position & curr_pos) override;
bool isGarbageAfterField(size_t after_col_idx, ReadBuffer::Position pos) override;
void writeErrorStringForWrongDelimiter(WriteBuffer & out, const String & description, const String & delim);
void skipToNextDelimiterOrEof(const String & delimiter);
private:
/// All reads of this format go through this buffer; it supports checkpoints and rollback.
PeekableReadBuffer buf;
DataTypes data_types;
FormatSettings settings;
ParsedTemplateFormatString format;
/// Per-row format: its `delimiters`, `formats` and `format_idx_to_column_idx` are used while reading rows.
ParsedTemplateFormatString row_format;
const bool ignore_spaces;
/// NOTE(review): no in-class initializer; presumably assigned in the constructor - confirm.
size_t format_data_idx;
/// Set to true by syncAfterError() when EOF is reached.
bool end_of_stream = false;
};
}

View File

@ -0,0 +1,167 @@
#include <Processors/Formats/RowInputFormatWithDiagnosticInfo.h>
#include <Formats/verbosePrintString.h>
#include <IO/Operators.h>
#include <IO/WriteBufferFromString.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
/// Forwards all arguments to IRowInputFormat; does nothing else.
DB::RowInputFormatWithDiagnosticInfo::RowInputFormatWithDiagnosticInfo(const Block & header_, ReadBuffer & in_, const Params & params_)
: IRowInputFormat(header_, in_, params_)
{
}
/// Advances the per-row bookkeeping: increments `row_num` and moves the "current row" buffer
/// coordinates into the "previous row" slots before recording the new current ones.
void DB::RowInputFormatWithDiagnosticInfo::updateDiagnosticInfo()
{
    const size_t offset_in_buffer = in.offset();

    ++row_num;

    /// What used to be the current row becomes the previous one.
    bytes_read_at_start_of_buffer_on_prev_row = bytes_read_at_start_of_buffer_on_current_row;
    offset_of_prev_row = offset_of_current_row;

    /// Remember where the new current row starts.
    bytes_read_at_start_of_buffer_on_current_row = in.count() - offset_in_buffer;
    offset_of_current_row = offset_in_buffer;
}
/// Builds a textual report about the rows around a parse error by parsing them once more
/// with parseRowAndPrintDiagnosticInfo().
/// Returns an empty string when the buffer is at EOF and a short explanation
/// when the needed rows are not available in the buffer.
String DB::RowInputFormatWithDiagnosticInfo::getDiagnosticInfo()
{
    /// Buffer has gone, cannot extract information about what has been parsed.
    if (in.eof())
        return {};

    WriteBufferFromOwnString out;

    auto & header = getPort().getHeader();
    MutableColumns columns = header.cloneEmptyColumns();

    /// It is possible to display detailed diagnostics only if the last and next to last rows are still in the read buffer.
    const size_t bytes_read_at_start_of_buffer = in.count() - in.offset();
    if (bytes_read_at_start_of_buffer != bytes_read_at_start_of_buffer_on_prev_row)
    {
        out << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n";
        return out.str();
    }

    /// Widths used to align column names and type names in the report.
    max_length_of_column_name = 0;
    max_length_of_data_type_name = 0;
    const size_t columns_count = header.columns();
    for (size_t i = 0; i < columns_count; ++i)
    {
        const auto & header_column = header.safeGetByPosition(i);

        const size_t name_length = header_column.name.size();
        if (name_length > max_length_of_column_name)
            max_length_of_column_name = name_length;

        const size_t type_name_length = header_column.type->getName().size();
        if (type_name_length > max_length_of_data_type_name)
            max_length_of_data_type_name = type_name_length;
    }

    /// Roll back the cursor to the beginning of the previous or current row and parse all over again. But now we derive detailed information.
    const bool prev_row_is_in_buffer = offset_of_prev_row <= in.buffer().size();
    if (prev_row_is_in_buffer)
    {
        in.position() = in.buffer().begin() + offset_of_prev_row;

        out << "\nRow " << (row_num - 1) << ":\n";
        if (!parseRowAndPrintDiagnosticInfo(columns, out))
            return out.str();
    }
    else if (in.buffer().size() < offset_of_current_row)
    {
        out << "Could not print diagnostic info because parsing of data hasn't started.\n";
        return out.str();
    }
    else
    {
        in.position() = in.buffer().begin() + offset_of_current_row;
    }

    out << "\nRow " << row_num << ":\n";
    parseRowAndPrintDiagnosticInfo(columns, out);
    out << "\n";

    return out.str();
}
/// Writes one line of the diagnostic report for a single column and tries to deserialize its value.
/// The line starts with the column number, name and type; then either the parsed text or an error is printed.
/// Returns false as soon as a problem with the field is reported and true otherwise.
/// Throws LOGICAL_ERROR if the position reported after parsing is before the position reported before it.
bool RowInputFormatWithDiagnosticInfo::deserializeFieldAndPrintDiagnosticInfo(const String & col_name,
const DataTypePtr & type,
IColumn & column,
WriteBuffer & out,
size_t file_column)
{
/// Pad the column number with 2, 1 or 0 spaces for numbers below 10, below 100 and the rest respectively.
out << "Column " << file_column << ", " << std::string((file_column < 10 ? 2 : file_column < 100 ? 1 : 0), ' ')
<< "name: " << alignedName(col_name, max_length_of_column_name)
<< "type: " << alignedName(type->getName(), max_length_of_data_type_name);
auto prev_position = in.position();
auto curr_position = in.position();
/// The exception is only remembered here; it is reported below and never rethrown.
std::exception_ptr exception;
try
{
tryDeserializeFiled(type, column, file_column, prev_position, curr_position);
}
catch (...)
{
exception = std::current_exception();
}
if (curr_position < prev_position)
throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);
if (isNativeNumber(type) || isDateOrDateTime(type))
{
/// An empty string instead of a value.
if (curr_position == prev_position)
{
out << "ERROR: text ";
/// Show at most 10 bytes of the input, but not beyond the end of the buffer.
verbosePrintString(prev_position, std::min(prev_position + 10, in.buffer().end()), out);
out << " is not like " << type->getName() << "\n";
return false;
}
}
out << "parsed text: ";
verbosePrintString(prev_position, curr_position, out);
if (exception)
{
/// Add a hint about the expected format for DateTime and Date, plain "ERROR" for other types.
if (type->getName() == "DateTime")
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
else if (type->getName() == "Date")
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
else
out << "ERROR\n";
return false;
}
out << "\n";
/// The garbage check is done only for types that report haveMaximumSizeOfValue().
if (type->haveMaximumSizeOfValue())
{
if (isGarbageAfterField(file_column, curr_position))
{
out << "ERROR: garbage after " << type->getName() << ": ";
verbosePrintString(curr_position, std::min(curr_position + 10, in.buffer().end()), out);
out << "\n";
if (type->getName() == "DateTime")
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
else if (type->getName() == "Date")
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
return false;
}
}
return true;
}
/// Returns `name` followed by ", " and by as many spaces as `name` is shorter than `max_length`
/// (no spaces at all when the name is not shorter).
String RowInputFormatWithDiagnosticInfo::alignedName(const String & name, size_t max_length) const
{
    String padded = name;
    padded += ", ";
    if (max_length > name.size())
        padded.append(max_length - name.size(), ' ');
    return padded;
}
}

View File

@ -0,0 +1,46 @@
#pragma once
#include <Core/Block.h>
#include <Processors/Formats/IRowInputFormat.h>
#include <IO/ReadBuffer.h>
#include <limits>
namespace DB
{
/// Base class for row input formats that can describe a parse error in detail.
/// It remembers where the current and the previous rows start inside the read buffer
/// and, in getDiagnosticInfo(), parses these rows again through the virtual hooks declared below.
class RowInputFormatWithDiagnosticInfo : public IRowInputFormat
{
public:
RowInputFormatWithDiagnosticInfo(const Block & header_, ReadBuffer & in_, const Params & params_);
String getDiagnosticInfo() override;
protected:
/// Increments `row_num` and shifts the "current row" position info into the "previous row" one.
void updateDiagnosticInfo();
/// Prints the report line for one column; returns false if a problem was reported.
bool deserializeFieldAndPrintDiagnosticInfo(const String & col_name, const DataTypePtr & type, IColumn & column,
WriteBuffer & out, size_t file_column);
/// Returns `name` followed by ", " and padded with spaces up to `max_length`.
String alignedName(const String & name, size_t max_length) const;
/// Hooks implemented by concrete formats.
virtual bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) = 0;
/// NOTE(review): "Filed" looks like a typo of "Field"; renaming requires changing every override too.
virtual void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column,
ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) = 0;
virtual bool isGarbageAfterField(size_t after_input_pos_idx, ReadBuffer::Position pos) = 0;
/// For convenient diagnostics in case of an error.
size_t row_num = 0;
private:
/// How many bytes were read, not counting those still in the buffer.
size_t bytes_read_at_start_of_buffer_on_current_row = 0;
size_t bytes_read_at_start_of_buffer_on_prev_row = 0;
/// Offsets of the row starts inside the buffer; the maximum value means "not recorded yet"
/// (getDiagnosticInfo() compares them with the buffer size before using them).
size_t offset_of_current_row = std::numeric_limits<size_t>::max();
size_t offset_of_prev_row = std::numeric_limits<size_t>::max();
/// For alignment of diagnostic info.
size_t max_length_of_column_name = 0;
size_t max_length_of_data_type_name = 0;
};
}

View File

@ -21,34 +21,26 @@ MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream(
header(header_), sync(sync_), skip_offsets(skip_offsets_), header(header_), sync(sync_), skip_offsets(skip_offsets_),
already_written_offset_columns(already_written_offset_columns_) already_written_offset_columns(already_written_offset_columns_)
{ {
serialization_states.reserve(header.columns());
WrittenOffsetColumns tmp_offset_columns;
IDataType::SerializeBinaryBulkSettings settings;
for (const auto & column_name : header.getNames())
{
const auto & col = header.getByName(column_name);
const auto columns = storage.getColumns();
addStreams(part_path, col.name, *col.type, columns.getCodecOrDefault(col.name, codec), 0, skip_offsets);
serialization_states.emplace_back(nullptr);
settings.getter = createStreamGetter(col.name, tmp_offset_columns, false);
col.type->serializeBinaryBulkStatePrefix(settings, serialization_states.back());
}
initSkipIndices();
} }
void MergedColumnOnlyOutputStream::write(const Block & block) void MergedColumnOnlyOutputStream::write(const Block & block)
{ {
if (!initialized)
{
column_streams.clear();
serialization_states.clear();
serialization_states.reserve(header.columns());
WrittenOffsetColumns tmp_offset_columns;
IDataType::SerializeBinaryBulkSettings settings;
for (const auto & column_name : header.getNames())
{
const auto & col = block.getByName(column_name);
const auto columns = storage.getColumns();
addStreams(part_path, col.name, *col.type, columns.getCodecOrDefault(col.name, codec), 0, skip_offsets);
serialization_states.emplace_back(nullptr);
settings.getter = createStreamGetter(col.name, tmp_offset_columns, false);
col.type->serializeBinaryBulkStatePrefix(settings, serialization_states.back());
}
initSkipIndices();
initialized = true;
}
std::set<String> skip_indexes_column_names_set; std::set<String> skip_indexes_column_names_set;
for (const auto & index : skip_indices) for (const auto & index : skip_indices)
std::copy(index->columns.cbegin(), index->columns.cend(), std::copy(index->columns.cbegin(), index->columns.cend(),
@ -68,7 +60,6 @@ void MergedColumnOnlyOutputStream::write(const Block & block)
if (!rows) if (!rows)
return; return;
size_t new_index_offset = 0; size_t new_index_offset = 0;
size_t new_current_mark = 0; size_t new_current_mark = 0;
WrittenOffsetColumns offset_columns = already_written_offset_columns; WrittenOffsetColumns offset_columns = already_written_offset_columns;
@ -106,7 +97,8 @@ MergeTreeData::DataPart::Checksums MergedColumnOnlyOutputStream::writeSuffixAndG
serialize_settings.getter = createStreamGetter(column.name, already_written_offset_columns, skip_offsets); serialize_settings.getter = createStreamGetter(column.name, already_written_offset_columns, skip_offsets);
column.type->serializeBinaryBulkStateSuffix(serialize_settings, serialization_states[i]); column.type->serializeBinaryBulkStateSuffix(serialize_settings, serialization_states[i]);
if (with_final_mark) /// We wrote at least one row
if (with_final_mark && (index_offset != 0 || current_mark != 0))
writeFinalMark(column.name, column.type, offset_columns, skip_offsets, serialize_settings.path); writeFinalMark(column.name, column.type, offset_columns, skip_offsets, serialize_settings.path);
} }
@ -125,7 +117,6 @@ MergeTreeData::DataPart::Checksums MergedColumnOnlyOutputStream::writeSuffixAndG
column_streams.clear(); column_streams.clear();
serialization_states.clear(); serialization_states.clear();
initialized = false;
return checksums; return checksums;
} }

View File

@ -28,7 +28,6 @@ public:
private: private:
Block header; Block header;
bool initialized = false;
bool sync; bool sync;
bool skip_offsets; bool skip_offsets;

View File

@ -34,7 +34,8 @@ set the following environment variables:
### Running with runner script ### Running with runner script
The only requirement is fresh docker configured docker. The only requirement is fresh configured docker and
docker pull yandex/clickhouse-integration-tests-runner
Notes: Notes:
* If you want to run integration tests without `sudo` you have to add your user to docker group `sudo usermod -aG docker $USER`. [More information](https://docs.docker.com/install/linux/linux-postinstall/) about docker configuration. * If you want to run integration tests without `sudo` you have to add your user to docker group `sudo usermod -aG docker $USER`. [More information](https://docs.docker.com/install/linux/linux-postinstall/) about docker configuration.

View File

@ -723,7 +723,8 @@ class ClickHouseInstance:
os.mkdir(config_d_dir) os.mkdir(config_d_dir)
os.mkdir(users_d_dir) os.mkdir(users_d_dir)
shutil.copy(p.join(HELPERS_DIR, 'common_instance_config.xml'), config_d_dir) # The file is named with 0_ prefix to be processed before other configuration overloads.
shutil.copy(p.join(HELPERS_DIR, '0_common_instance_config.xml'), config_d_dir)
# Generate and write macros file # Generate and write macros file
macros = self.macros.copy() macros = self.macros.copy()

View File

@ -1,4 +0,0 @@
*
!.gitignore
!source.tsv
!dictionary_preset*

View File

@ -1,411 +0,0 @@
import pytest
import os
import time
from helpers.cluster import ClickHouseCluster
from helpers.test_tools import TSV, assert_eq_with_retry
from generate_dictionaries import generate_structure, generate_dictionaries, DictionaryTestTable
# Directory that contains this test file; used to build paths to the configs.
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
# The three globals below are placeholders that setup_module() fills in.
cluster = None
instance = None
test_table = None
def get_status(dictionary_name):
    """Return the `status` value from system.dictionaries for the given dictionary name (trailing newlines removed)."""
    sql = "SELECT status FROM system.dictionaries WHERE name='{}'".format(dictionary_name)
    return instance.query(sql).rstrip("\n")
def get_last_exception(dictionary_name):
    """Return the `last_exception` value from system.dictionaries with trailing newlines removed and \\' replaced by '."""
    sql = "SELECT last_exception FROM system.dictionaries WHERE name='{}'".format(dictionary_name)
    raw = instance.query(sql).rstrip("\n")
    return raw.replace("\\'", "'")
def get_loading_start_time(dictionary_name):
    """Return `loading_start_time` of the dictionary as time.struct_time, or None for the zero timestamp."""
    sql = "SELECT loading_start_time FROM system.dictionaries WHERE name='{}'".format(dictionary_name)
    value = instance.query(sql).rstrip("\n")
    # The all-zero timestamp is reported as None instead of being parsed.
    return None if value == "0000-00-00 00:00:00" else time.strptime(value, "%Y-%m-%d %H:%M:%S")
def get_loading_duration(dictionary_name):
    """Return the `loading_duration` value from system.dictionaries converted to float."""
    sql = "SELECT loading_duration FROM system.dictionaries WHERE name='{}'".format(dictionary_name)
    return float(instance.query(sql))
def replace_in_file_in_container(file_name, what, replace_with):
    """Run `sed -i` inside the container to replace every match of `what` with `replace_with` in `file_name`.

    Both strings are inserted into the sed expression as is, without any escaping.
    """
    sed_command = 'sed -i "s/{}/{}/g" {}'.format(what, replace_with, file_name)
    instance.exec_in_container(sed_command)
def setup_module(module):
    """Generate the dictionary configs and fill the module globals `cluster`, `instance` and `test_table`."""
    global cluster, instance, test_table

    dictionaries_dir = os.path.join(SCRIPT_DIR, 'configs/dictionaries')
    dictionary_files = generate_dictionaries(dictionaries_dir, generate_structure())

    cluster = ClickHouseCluster(__file__, base_configs_dir=os.path.join(SCRIPT_DIR, 'configs'))
    instance = cluster.add_instance('instance', main_configs=dictionary_files)
    test_table = DictionaryTestTable(os.path.join(SCRIPT_DIR, 'configs/dictionaries/source.tsv'))
# Module-scoped fixture: starts the cluster, prepares the source data and yields the cluster to the tests.
# cluster.shutdown() is placed in `finally`.
@pytest.fixture(scope="module")
def started_cluster():
try:
cluster.start()
instance.query("CREATE DATABASE IF NOT EXISTS dict ENGINE=Dictionary")
test_table.create_clickhouse_source(instance)
# Print the names of the dictionaries known to the server.
for line in TSV(instance.query('select name from system.dictionaries')).lines:
print line,
# Create table `test.small_dict_source`
instance.query('''
drop table if exists test.small_dict_source;
create table test.small_dict_source (id UInt64, a String, b Int32, c Float64) engine=Log;
insert into test.small_dict_source values (0, 'water', 10, 1), (1, 'air', 40, 0.01), (2, 'earth', 100, 1.7);
''')
yield cluster
finally:
cluster.shutdown()
# Parametrized fixture: each test using it runs once per tuple below and receives that tuple.
# `ids` must list one readable name per entry of `params`, in the same order.
@pytest.fixture(params=[
# name, keys, use_parent
('clickhouse_hashed', ('id',), True),
('clickhouse_flat', ('id',), True),
('clickhouse_complex_integers_key_hashed', ('key0', 'key1'), False),
('clickhouse_complex_mixed_key_hashed', ('key0_str', 'key1'), False),
('clickhouse_range_hashed', ('id', 'StartDate', 'EndDate'), False),
],
ids=['clickhouse_hashed', 'clickhouse_flat',
'clickhouse_complex_integers_key_hashed',
'clickhouse_complex_mixed_key_hashed',
'clickhouse_range_hashed']
)
def dictionary_structure(started_cluster, request):
return request.param
# Creates a table with the Dictionary engine over the parametrized dictionary, selects everything from it
# and requires that comparing the result with `test_table` yields an empty diff.
def test_select_all(dictionary_structure):
name, keys, use_parent = dictionary_structure
query = instance.query
structure = test_table.get_structure_for_keys(keys, use_parent)
query('''
DROP TABLE IF EXISTS test.{0}
'''.format(name))
create_query = "CREATE TABLE test.{0} ({1}) engine = Dictionary({0})".format(name, structure)
TSV(query(create_query))
result = TSV(query('select * from test.{0}'.format(name)))
diff = test_table.compare_by_keys(keys, result.lines, use_parent, add_not_found_rows=True)
# The diff is printed before the assertion so that it is visible when the test fails.
print test_table.process_diff(diff)
assert not diff
# Parametrized fixture with the cache dictionaries; each test using it runs once per tuple below.
# `ids` must list one readable name per entry of `params`, in the same order.
@pytest.fixture(params=[
# name, keys, use_parent
('clickhouse_cache', ('id',), True),
('clickhouse_complex_integers_key_cache', ('key0', 'key1'), False),
('clickhouse_complex_mixed_key_cache', ('key0_str', 'key1'), False)
],
ids=['clickhouse_cache', 'clickhouse_complex_integers_key_cache', 'clickhouse_complex_mixed_key_cache']
)
def cached_dictionary_structure(started_cluster, request):
return request.param
# Same idea as test_select_all, but for cache dictionaries: the table content is compared with `test_table`
# while keys 0..3 are requested through dictGetUInt8.
# NOTE(review): indentation is lost in this view, so the exact extent of the `for i` loop cannot be seen here.
def test_select_all_from_cached(cached_dictionary_structure):
name, keys, use_parent = cached_dictionary_structure
query = instance.query
structure = test_table.get_structure_for_keys(keys, use_parent)
query('''
DROP TABLE IF EXISTS test.{0}
'''.format(name))
create_query = "CREATE TABLE test.{0} ({1}) engine = Dictionary({0})".format(name, structure)
TSV(query(create_query))
for i in range(4):
result = TSV(query('select * from test.{0}'.format(name)))
diff = test_table.compare_by_keys(keys, result.lines, use_parent, add_not_found_rows=False)
print test_table.process_diff(diff)
assert not diff
# Build the key expression for key number `i`: string key parts are quoted, the others are used as numbers.
key = []
for key_name in keys:
if key_name.endswith('str'):
key.append("'" + str(i) + "'")
else:
key.append(str(i))
# A single-part key is passed as toUInt64(i), a composite key as a tuple.
if len(key) == 1:
key = 'toUInt64(' + str(i) + ')'
else:
key = str('(' + ','.join(key) + ')')
query("select dictGetUInt8('{0}', 'UInt8_', {1})".format(name, key))
result = TSV(query('select * from test.{0}'.format(name)))
diff = test_table.compare_by_keys(keys, result.lines, use_parent, add_not_found_rows=True)
print test_table.process_diff(diff)
assert not diff
def test_null_value(started_cluster):
    """Request several attributes of `clickhouse_cache` for key 12121212 and compare them with the expected values."""
    query = instance.query

    expectations = [
        ("select dictGetUInt8('clickhouse_cache', 'UInt8_', toUInt64(12121212))", "1"),
        ("select dictGetString('clickhouse_cache', 'String_', toUInt64(12121212))", "implicit-default"),
        ("select dictGetDate('clickhouse_cache', 'Date_', toUInt64(12121212))", "2015-11-25"),
        # Check, that empty null_value interprets as default value
        ("select dictGetUInt64('clickhouse_cache', 'UInt64_', toUInt64(12121212))", "0"),
        ("select dictGetDateTime('clickhouse_cache', 'DateTime_', toUInt64(12121212))", "0000-00-00 00:00:00"),
    ]

    # The queries are executed one by one, in the order they are listed above.
    for sql, expected in expectations:
        assert TSV(query(sql)) == TSV(expected)
# Checks that the dependent dictionaries dep_x, dep_y and dep_z are loaded together on first use
# and that they pick up changes of the source table test.small_dict_source.
def test_dictionary_dependency(started_cluster):
query = instance.query
# dictionaries_lazy_load == false, so these dictionary are not loaded.
assert get_status('dep_x') == 'NOT_LOADED'
assert get_status('dep_y') == 'NOT_LOADED'
assert get_status('dep_z') == 'NOT_LOADED'
# Dictionary 'dep_x' depends on 'dep_z', which depends on 'dep_y'.
# So they all should be loaded at once.
assert query("SELECT dictGetString('dep_x', 'a', toUInt64(1))") == "air\n"
assert get_status('dep_x') == 'LOADED'
assert get_status('dep_y') == 'LOADED'
assert get_status('dep_z') == 'LOADED'
# Other dictionaries should work too.
assert query("SELECT dictGetString('dep_y', 'a', toUInt64(1))") == "air\n"
assert query("SELECT dictGetString('dep_z', 'a', toUInt64(1))") == "air\n"
# Key 3 is not in the source table yet, so each dictionary answers with its own XX/YY/ZZ value.
assert query("SELECT dictGetString('dep_x', 'a', toUInt64(3))") == "XX\n"
assert query("SELECT dictGetString('dep_y', 'a', toUInt64(3))") == "YY\n"
assert query("SELECT dictGetString('dep_z', 'a', toUInt64(3))") == "ZZ\n"
# Update the source table.
query("insert into test.small_dict_source values (3, 'fire', 30, 8)")
# Wait for dictionaries to be reloaded.
assert_eq_with_retry(instance, "SELECT dictHas('dep_y', toUInt64(3))", "1", sleep_time = 2, retry_count = 10)
assert query("SELECT dictGetString('dep_x', 'a', toUInt64(3))") == "XX\n"
assert query("SELECT dictGetString('dep_y', 'a', toUInt64(3))") == "fire\n"
assert query("SELECT dictGetString('dep_z', 'a', toUInt64(3))") == "ZZ\n"
# dep_x and dep_z are updated only when there `intDiv(count(), 4)` is changed.
query("insert into test.small_dict_source values (4, 'ether', 404, 0.001)")
assert_eq_with_retry(instance, "SELECT dictHas('dep_x', toUInt64(4))", "1", sleep_time = 2, retry_count = 10)
assert query("SELECT dictGetString('dep_x', 'a', toUInt64(3))") == "fire\n"
assert query("SELECT dictGetString('dep_y', 'a', toUInt64(3))") == "fire\n"
assert query("SELECT dictGetString('dep_z', 'a', toUInt64(3))") == "fire\n"
assert query("SELECT dictGetString('dep_x', 'a', toUInt64(4))") == "ether\n"
assert query("SELECT dictGetString('dep_y', 'a', toUInt64(4))") == "ether\n"
assert query("SELECT dictGetString('dep_z', 'a', toUInt64(4))") == "ether\n"
# Checks that reloading the slow dictionary 'longload' (by SYSTEM RELOAD queries and by a config change)
# restarts its loading: each restart must give a later loading_start_time and a smaller loading_duration.
# The test depends on real time (query timeout and sleeps).
def test_reload_while_loading(started_cluster):
query = instance.query
# dictionaries_lazy_load == false, so this dictionary is not loaded.
assert get_status('longload') == "NOT_LOADED"
assert get_loading_duration('longload') == 0
# It's not possible to get a value from the dictionary within 1.0 second, so the following query fails by timeout.
assert query("SELECT dictGetInt32('longload', 'a', toUInt64(5))", timeout = 1, ignore_error = True) == ""
# The dictionary is now loading.
assert get_status('longload') == "LOADING"
start_time, duration = get_loading_start_time('longload'), get_loading_duration('longload')
assert duration > 0
time.sleep(0.5) # Still loading.
assert get_status('longload') == "LOADING"
prev_start_time, prev_duration = start_time, duration
start_time, duration = get_loading_start_time('longload'), get_loading_duration('longload')
assert start_time == prev_start_time
assert duration >= prev_duration
# SYSTEM RELOAD DICTIONARY should restart loading.
query("SYSTEM RELOAD DICTIONARY 'longload'")
assert get_status('longload') == "LOADING"
prev_start_time, prev_duration = start_time, duration
start_time, duration = get_loading_start_time('longload'), get_loading_duration('longload')
assert start_time > prev_start_time
assert duration < prev_duration
time.sleep(0.5) # Still loading.
assert get_status('longload') == "LOADING"
prev_start_time, prev_duration = start_time, duration
start_time, duration = get_loading_start_time('longload'), get_loading_duration('longload')
assert start_time == prev_start_time
assert duration >= prev_duration
# SYSTEM RELOAD DICTIONARIES should restart loading again.
query("SYSTEM RELOAD DICTIONARIES")
assert get_status('longload') == "LOADING"
prev_start_time, prev_duration = start_time, duration
start_time, duration = get_loading_start_time('longload'), get_loading_duration('longload')
assert start_time > prev_start_time
assert duration < prev_duration
# Changing the configuration file should restart loading one more time.
replace_in_file_in_container('/etc/clickhouse-server/config.d/dictionary_preset_longload.xml', 'sleep 100', 'sleep 0')
time.sleep(5) # Configuration files are reloaded once in 5 seconds.
# This time loading should finish quickly.
assert get_status('longload') == "LOADED"
assert query("SELECT dictGetInt32('longload', 'a', toUInt64(5))") == "6\n"
# Checks that already loaded dictionaries 'cmd' and 'file' pick up changed data
# after SYSTEM RELOAD DICTIONARY, after SYSTEM RELOAD DICTIONARIES and after waiting for the periodic reload.
# The data is changed in place inside the container with sed.
def test_reload_after_loading(started_cluster):
query = instance.query
assert query("SELECT dictGetInt32('cmd', 'a', toUInt64(7))") == "8\n"
assert query("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "10\n"
# Change the dictionaries' data.
replace_in_file_in_container('/etc/clickhouse-server/config.d/dictionary_preset_cmd.xml', '8', '81')
replace_in_file_in_container('/etc/clickhouse-server/config.d/dictionary_preset_file.txt', '10', '101')
# SYSTEM RELOAD 'name' reloads only the specified dictionary.
query("SYSTEM RELOAD DICTIONARY 'cmd'")
assert query("SELECT dictGetInt32('cmd', 'a', toUInt64(7))") == "81\n"
assert query("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "10\n"
query("SYSTEM RELOAD DICTIONARY 'file'")
assert query("SELECT dictGetInt32('cmd', 'a', toUInt64(7))") == "81\n"
assert query("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "101\n"
# SYSTEM RELOAD DICTIONARIES reloads all loaded dictionaries.
replace_in_file_in_container('/etc/clickhouse-server/config.d/dictionary_preset_cmd.xml', '81', '82')
replace_in_file_in_container('/etc/clickhouse-server/config.d/dictionary_preset_file.txt', '101', '102')
query("SYSTEM RELOAD DICTIONARIES")
assert query("SELECT dictGetInt32('cmd', 'a', toUInt64(7))") == "82\n"
assert query("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "102\n"
# Configuration files are reloaded and lifetimes are checked automatically once in 5 seconds.
replace_in_file_in_container('/etc/clickhouse-server/config.d/dictionary_preset_cmd.xml', '82', '83')
replace_in_file_in_container('/etc/clickhouse-server/config.d/dictionary_preset_file.txt', '102', '103')
time.sleep(5)
assert query("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "103\n"
assert query("SELECT dictGetInt32('cmd', 'a', toUInt64(7))") == "83\n"
def test_reload_after_fail_by_system_reload(started_cluster):
    """Check how the dictionary `no_file` reacts to SYSTEM RELOAD DICTIONARY while its file source is missing,
    after the file appears and after the file is removed again.

    The two value checks after a successful load used to be bare comparisons whose result was thrown away;
    they are real assertions now.
    """
    query = instance.query

    # dictionaries_lazy_load == false, so this dictionary is not loaded.
    assert get_status("no_file") == "NOT_LOADED"

    # We expect an error because the file source doesn't exist.
    expected_error = "No such file"
    assert expected_error in instance.query_and_get_error("SELECT dictGetInt32('no_file', 'a', toUInt64(9))")
    assert get_status("no_file") == "FAILED"

    # SYSTEM RELOAD should not change anything now, the status is still FAILED.
    query("SYSTEM RELOAD DICTIONARY 'no_file'")
    assert expected_error in instance.query_and_get_error("SELECT dictGetInt32('no_file', 'a', toUInt64(9))")
    assert get_status("no_file") == "FAILED"

    # Creating the file source makes the dictionary able to load.
    instance.copy_file_to_container(os.path.join(SCRIPT_DIR, "configs/dictionaries/dictionary_preset_file.txt"), "/etc/clickhouse-server/config.d/dictionary_preset_no_file.txt")
    query("SYSTEM RELOAD DICTIONARY 'no_file'")
    assert query("SELECT dictGetInt32('no_file', 'a', toUInt64(9))") == "10\n"
    assert get_status("no_file") == "LOADED"

    # Removing the file source should not spoil the loaded dictionary.
    instance.exec_in_container("rm /etc/clickhouse-server/config.d/dictionary_preset_no_file.txt")
    query("SYSTEM RELOAD DICTIONARY 'no_file'")
    assert query("SELECT dictGetInt32('no_file', 'a', toUInt64(9))") == "10\n"
    assert get_status("no_file") == "LOADED"
def test_reload_after_fail_by_timer(started_cluster):
    """Check how the dictionary `no_file_2` reacts to the periodic reload while its file source is missing,
    after the file appears and after the file is removed again.

    The two value checks after a successful load used to be bare comparisons whose result was thrown away;
    they are real assertions now. The test depends on real time: it sleeps 6 seconds between the steps.
    """
    query = instance.query

    # dictionaries_lazy_load == false, so this dictionary is not loaded.
    assert get_status("no_file_2") == "NOT_LOADED"

    # We expect an error because the file source doesn't exist.
    expected_error = "No such file"
    assert expected_error in instance.query_and_get_error("SELECT dictGetInt32('no_file_2', 'a', toUInt64(9))")
    assert get_status("no_file_2") == "FAILED"

    # Passed time should not change anything now, the status is still FAILED.
    time.sleep(6)
    assert expected_error in instance.query_and_get_error("SELECT dictGetInt32('no_file_2', 'a', toUInt64(9))")
    assert get_status("no_file_2") == "FAILED"

    # Creating the file source makes the dictionary able to load.
    instance.copy_file_to_container(os.path.join(SCRIPT_DIR, "configs/dictionaries/dictionary_preset_file.txt"), "/etc/clickhouse-server/config.d/dictionary_preset_no_file_2.txt")
    time.sleep(6)
    assert query("SELECT dictGetInt32('no_file_2', 'a', toUInt64(9))") == "10\n"
    assert get_status("no_file_2") == "LOADED"

    # Removing the file source should not spoil the loaded dictionary.
    instance.exec_in_container("rm /etc/clickhouse-server/config.d/dictionary_preset_no_file_2.txt")
    time.sleep(6)
    assert query("SELECT dictGetInt32('no_file_2', 'a', toUInt64(9))") == "10\n"
    assert get_status("no_file_2") == "LOADED"
# Checks the cache dictionary 'cache_xypairs' while its source table test.xypairs is missing, created,
# dropped and created again with other values.
# NOTE(review): every line of the form `query(...) == "..."` without `assert` only runs the query;
# the comparison result is thrown away, so these lines check nothing. The expected strings there also have
# no trailing "\n" (unlike the asserted queries in the other tests of this file), so simply adding `assert`
# would probably fail - decide on the intended check before changing them.
def test_reload_after_fail_in_cache_dictionary(started_cluster):
query = instance.query
query_and_get_error = instance.query_and_get_error
# Can't get a value from the cache dictionary because the source (table `test.xypairs`) doesn't respond.
expected_error = "Table test.xypairs doesn't exist"
assert expected_error in query_and_get_error("SELECT dictGetUInt64('cache_xypairs', 'y', toUInt64(1))")
assert get_status("cache_xypairs") == "LOADED"
assert expected_error in get_last_exception("cache_xypairs")
# Create table `test.xypairs`.
query('''
drop table if exists test.xypairs;
create table test.xypairs (x UInt64, y UInt64) engine=Log;
insert into test.xypairs values (1, 56), (3, 78);
''')
# Cache dictionary now works.
assert_eq_with_retry(instance, "SELECT dictGet('cache_xypairs', 'y', toUInt64(1))", "56", ignore_error=True)
# Not asserted (see the note at the top).
query("SELECT dictGet('cache_xypairs', 'y', toUInt64(2))") == "0"
assert get_last_exception("cache_xypairs") == ""
# Drop table `test.xypairs`.
query('drop table if exists test.xypairs')
# Values are cached so we can get them.
# Not asserted (see the note at the top).
query("SELECT dictGet('cache_xypairs', 'y', toUInt64(1))") == "56"
query("SELECT dictGet('cache_xypairs', 'y', toUInt64(2))") == "0"
assert get_last_exception("cache_xypairs") == ""
# But we can't get a value from the source table which isn't cached.
assert expected_error in query_and_get_error("SELECT dictGetUInt64('cache_xypairs', 'y', toUInt64(3))")
assert expected_error in get_last_exception("cache_xypairs")
# Passed time should not spoil the cache.
time.sleep(5);
# Not asserted (see the note at the top).
query("SELECT dictGet('cache_xypairs', 'y', toUInt64(1))") == "56"
query("SELECT dictGet('cache_xypairs', 'y', toUInt64(2))") == "0"
assert expected_error in query_and_get_error("SELECT dictGetUInt64('cache_xypairs', 'y', toUInt64(3))")
assert expected_error in get_last_exception("cache_xypairs")
# Create table `test.xypairs` again with changed values.
query('''
drop table if exists test.xypairs;
create table test.xypairs (x UInt64, y UInt64) engine=Log;
insert into test.xypairs values (1, 57), (3, 79);
''')
# The cache dictionary returns new values now.
assert_eq_with_retry(instance, "SELECT dictGet('cache_xypairs', 'y', toUInt64(1))", "57")
# Not asserted (see the note at the top).
query("SELECT dictGet('cache_xypairs', 'y', toUInt64(2))") == "0"
query("SELECT dictGet('cache_xypairs', 'y', toUInt64(3))") == "79"
assert get_last_exception("cache_xypairs") == ""

View File

@ -0,0 +1,30 @@
<?xml version="1.0"?>
<yandex>
<logger>
<level>trace</level>
<log>/var/log/clickhouse-server/clickhouse-server.log</log>
<errorlog>/var/log/clickhouse-server/clickhouse-server.err.log</errorlog>
<size>1000M</size>
<count>10</count>
</logger>
<tcp_port>9000</tcp_port>
<listen_host>127.0.0.1</listen_host>
<openSSL>
<client>
<cacheSessions>true</cacheSessions>
<verificationMode>none</verificationMode>
<invalidCertificateHandler>
<name>AcceptCertificateHandler</name>
</invalidCertificateHandler>
</client>
</openSSL>
<max_concurrent_queries>500</max_concurrent_queries>
<mark_cache_size>5368709120</mark_cache_size>
<path>./clickhouse/</path>
<users_config>users.xml</users_config>
<dictionaries_config>/etc/clickhouse-server/config.d/*.xml</dictionaries_config>
</yandex>

View File

@ -8,7 +8,7 @@
<user>default</user> <user>default</user>
<password></password> <password></password>
<db>test</db> <db>test</db>
<table>small_dict_source</table> <table>elements</table>
</clickhouse> </clickhouse>
</source> </source>
<lifetime>5</lifetime> <lifetime>5</lifetime>

View File

@ -0,0 +1,76 @@
import pytest
import os
from helpers.cluster import ClickHouseCluster
from helpers.test_tools import assert_eq_with_retry
# Directory that contains this test file; the `configs` directory is resolved relative to it.
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
# Dictionary definitions passed to the instance as main configs.
DICTIONARY_FILES = ['configs/dictionaries/dep_x.xml', 'configs/dictionaries/dep_y.xml', 'configs/dictionaries/dep_z.xml']

# Single-instance cluster shared by every test in this module.
cluster = ClickHouseCluster(__file__, base_configs_dir=os.path.join(SCRIPT_DIR, 'configs'))
instance = cluster.add_instance('instance', main_configs=DICTIONARY_FILES)
@pytest.fixture(scope="module")
def started_cluster():
    """Module-scoped fixture: start the cluster, create the source data and yield the cluster.

    The cluster is shut down in `finally`, i.e. both on normal teardown and when
    startup or the preparation query raises.
    """
    try:
        cluster.start()
        # Create the `dict` database (Dictionary engine) and the `test.elements`
        # table, then fill the table with three rows (ids 0, 1 and 2).
        instance.query('''
CREATE DATABASE IF NOT EXISTS dict ENGINE=Dictionary;
CREATE DATABASE IF NOT EXISTS test;
DROP TABLE IF EXISTS test.elements;
CREATE TABLE test.elements (id UInt64, a String, b Int32, c Float64) ENGINE=Log;
INSERT INTO test.elements VALUES (0, 'water', 10, 1), (1, 'air', 40, 0.01), (2, 'earth', 100, 1.7);
''')
        yield cluster
    finally:
        cluster.shutdown()
def get_status(dictionary_name):
    """Return the `status` value from system.dictionaries for `dictionary_name`, without trailing newlines."""
    status_query = "SELECT status FROM system.dictionaries WHERE name='" + dictionary_name + "'"
    raw_status = instance.query(status_query)
    return raw_status.rstrip("\n")
def test_get_data(started_cluster):
    """Check that dependent dictionaries are loaded together and follow updates of `test.elements`."""
    query = instance.query

    # None of the dictionaries has been requested yet, so they are expected to be NOT_LOADED.
    # NOTE(review): the original comment attributed this to `dictionaries_lazy_load == false`;
    # deferred loading normally corresponds to lazy loading being enabled - confirm against the server config.
    assert get_status('dep_x') == 'NOT_LOADED'
    assert get_status('dep_y') == 'NOT_LOADED'
    assert get_status('dep_z') == 'NOT_LOADED'

    # Dictionary 'dep_x' depends on 'dep_z', which depends on 'dep_y'.
    # So they all should be loaded at once.
    assert query("SELECT dictGetString('dep_x', 'a', toUInt64(1))") == "air\n"
    assert get_status('dep_x') == 'LOADED'
    assert get_status('dep_y') == 'LOADED'
    assert get_status('dep_z') == 'LOADED'

    # Other dictionaries should work too.
    assert query("SELECT dictGetString('dep_y', 'a', toUInt64(1))") == "air\n"
    assert query("SELECT dictGetString('dep_z', 'a', toUInt64(1))") == "air\n"

    # Key 3 is not in the source table yet, so each dictionary answers with its own fallback value.
    assert query("SELECT dictGetString('dep_x', 'a', toUInt64(3))") == "XX\n"
    assert query("SELECT dictGetString('dep_y', 'a', toUInt64(3))") == "YY\n"
    assert query("SELECT dictGetString('dep_z', 'a', toUInt64(3))") == "ZZ\n"

    # Update the source table.
    query("INSERT INTO test.elements VALUES (3, 'fire', 30, 8)")

    # Wait for dictionaries to be reloaded.
    assert_eq_with_retry(instance, "SELECT dictHas('dep_y', toUInt64(3))", "1", sleep_time = 2, retry_count = 10)
    # Only 'dep_y' is expected to see the new row at this point.
    assert query("SELECT dictGetString('dep_x', 'a', toUInt64(3))") == "XX\n"
    assert query("SELECT dictGetString('dep_y', 'a', toUInt64(3))") == "fire\n"
    assert query("SELECT dictGetString('dep_z', 'a', toUInt64(3))") == "ZZ\n"

    # 'dep_x' and 'dep_z' are updated only when the value of `intDiv(count(), 4)` changes.
    query("INSERT INTO test.elements VALUES (4, 'ether', 404, 0.001)")
    assert_eq_with_retry(instance, "SELECT dictHas('dep_x', toUInt64(4))", "1", sleep_time = 2, retry_count = 10)
    # Now all three dictionaries are expected to return the data for keys 3 and 4.
    assert query("SELECT dictGetString('dep_x', 'a', toUInt64(3))") == "fire\n"
    assert query("SELECT dictGetString('dep_y', 'a', toUInt64(3))") == "fire\n"
    assert query("SELECT dictGetString('dep_z', 'a', toUInt64(3))") == "fire\n"
    assert query("SELECT dictGetString('dep_x', 'a', toUInt64(4))") == "ether\n"
    assert query("SELECT dictGetString('dep_y', 'a', toUInt64(4))") == "ether\n"
    assert query("SELECT dictGetString('dep_z', 'a', toUInt64(4))") == "ether\n"

View File

@ -0,0 +1,113 @@
<yandex>
<dictionary>
<name>cache</name>
<source>
<clickhouse>
<host>localhost</host>
<port>9000</port>
<user>default</user>
<password></password>
<db>test</db>
<table>source</table>
</clickhouse>
</source>
<lifetime>0</lifetime>
<layout>
<cache><size_in_cells>128</size_in_cells></cache>
</layout>
<structure>
<id>
<name>id</name>
</id>
<attribute>
<name>UInt8_</name>
<type>UInt8</type>
<null_value>1</null_value>
</attribute>
<attribute>
<name>UInt16_</name>
<type>UInt16</type>
<null_value>1</null_value>
</attribute>
<attribute>
<name>UInt32_</name>
<type>UInt32</type>
<null_value>1</null_value>
</attribute>
<attribute>
<name>UInt64_</name>
<type>UInt64</type>
<null_value></null_value>
</attribute>
<attribute>
<name>Int8_</name>
<type>Int8</type>
<null_value>-1</null_value>
</attribute>
<attribute>
<name>Int16_</name>
<type>Int16</type>
<null_value>-1</null_value>
</attribute>
<attribute>
<name>Int32_</name>
<type>Int32</type>
<null_value>-1</null_value>
</attribute>
<attribute>
<name>Int64_</name>
<type>Int64</type>
<null_value>-1</null_value>
</attribute>
<attribute>
<name>Float32_</name>
<type>Float32</type>
<null_value>2.71828</null_value>
</attribute>
<attribute>
<name>Float64_</name>
<type>Float64</type>
<null_value>2.71828</null_value>
</attribute>
<attribute>
<name>String_</name>
<type>String</type>
<null_value>implicit-default</null_value>
</attribute>
<attribute>
<name>Date_</name>
<type>Date</type>
<null_value>2015-11-25</null_value>
</attribute>
<attribute>
<name>DateTime_</name>
<type>DateTime</type>
<null_value></null_value>
</attribute>
<attribute>
<name>Parent</name>
<type>UInt64</type>
<hierarchical>true</hierarchical>
<null_value>0</null_value>
</attribute>
</structure>
</dictionary>
</yandex>

View File

@ -0,0 +1,23 @@
<?xml version="1.0"?>
<yandex>
<profiles>
<default>
</default>
</profiles>
<users>
<default>
<password></password>
<networks incl="networks" replace="replace">
<ip>::/0</ip>
</networks>
<profile>default</profile>
<quota>default</quota>
</default>
</users>
<quotas>
<default>
</default>
</quotas>
</yandex>

View File

@ -0,0 +1,45 @@
import pytest
import os
from helpers.cluster import ClickHouseCluster
from helpers.test_tools import TSV, assert_eq_with_retry
# Directory that contains this test file; the `configs` directory is resolved relative to it.
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
# The only dictionary used by this module: the cache dictionary named `cache`.
DICTIONARY_FILES = ['configs/dictionaries/cache.xml']

# Single-instance cluster shared by every test in this module.
cluster = ClickHouseCluster(__file__, base_configs_dir=os.path.join(SCRIPT_DIR, 'configs'))
instance = cluster.add_instance('instance', main_configs=DICTIONARY_FILES)
@pytest.fixture(scope="module")
def started_cluster():
    """Module-scoped fixture: start the cluster, create an empty `test.source` table and yield the cluster.

    The cluster is shut down in `finally`, i.e. both on normal teardown and when
    startup or the preparation query raises.
    """
    try:
        cluster.start()
        # The table is created but no rows are inserted into it.
        instance.query('''
CREATE DATABASE IF NOT EXISTS test;
DROP TABLE IF EXISTS test.source;
CREATE TABLE test.source (id UInt64, key0 UInt8, key0_str String, key1 UInt8,
StartDate Date, EndDate Date,
UInt8_ UInt8, UInt16_ UInt16, UInt32_ UInt32, UInt64_ UInt64,
Int8_ Int8, Int16_ Int16, Int32_ Int32, Int64_ Int64,
Float32_ Float32, Float64_ Float64,
String_ String,
Date_ Date, DateTime_ DateTime, Parent UInt64) ENGINE=Log;
''')
        yield cluster
    finally:
        cluster.shutdown()
def test_null_value(started_cluster):
    """Request a key from the `cache` dictionary and compare each attribute with its expected fallback value."""
    run = instance.query

    expectations = [
        ("select dictGetUInt8('cache', 'UInt8_', toUInt64(12121212))", "1\n"),
        ("select dictGetString('cache', 'String_', toUInt64(12121212))", "implicit-default\n"),
        ("select dictGetDate('cache', 'Date_', toUInt64(12121212))", "2015-11-25\n"),
        # An empty <null_value> must be interpreted as the type's default value.
        ("select dictGetUInt64('cache', 'UInt64_', toUInt64(12121212))", "0\n"),
        ("select dictGetDateTime('cache', 'DateTime_', toUInt64(12121212))", "0000-00-00 00:00:00\n"),
    ]

    for statement, expected in expectations:
        assert run(statement) == expected

View File

@ -0,0 +1,30 @@
<?xml version="1.0"?>
<yandex>
<logger>
<level>trace</level>
<log>/var/log/clickhouse-server/clickhouse-server.log</log>
<errorlog>/var/log/clickhouse-server/clickhouse-server.err.log</errorlog>
<size>1000M</size>
<count>10</count>
</logger>
<tcp_port>9000</tcp_port>
<listen_host>127.0.0.1</listen_host>
<openSSL>
<client>
<cacheSessions>true</cacheSessions>
<verificationMode>none</verificationMode>
<invalidCertificateHandler>
<name>AcceptCertificateHandler</name>
</invalidCertificateHandler>
</client>
</openSSL>
<max_concurrent_queries>500</max_concurrent_queries>
<mark_cache_size>5368709120</mark_cache_size>
<path>./clickhouse/</path>
<users_config>users.xml</users_config>
<dictionaries_config>/etc/clickhouse-server/config.d/*.xml</dictionaries_config>
</yandex>

View File

@ -0,0 +1,3 @@
*
!.gitignore
!source.tsv

View File

@ -0,0 +1,23 @@
<?xml version="1.0"?>
<yandex>
<profiles>
<default>
</default>
</profiles>
<users>
<default>
<password></password>
<networks incl="networks" replace="replace">
<ip>::/0</ip>
</networks>
<profile>default</profile>
<quota>default</quota>
</default>
</users>
<quotas>
<default>
</default>
</quotas>
</yandex>

View File

@ -12,13 +12,6 @@ types = [
'Date', 'DateTime' 'Date', 'DateTime'
] ]
explicit_defaults = [
'42', '42', '42', '42',
'-42', '-42', '-42', '-42',
'1.5', '1.6',
"'explicit-default'",
"'2015-01-01'", "'2015-01-01 00:00:00'"
]
implicit_defaults = [ implicit_defaults = [
'1', '1', '1', '', '1', '1', '1', '',
@ -182,9 +175,6 @@ def generate_dictionaries(path, structure):
file_names = [] file_names = []
# Add ready dictionaries.
file_names.extend(glob.glob(os.path.join(path, "*dictionary_preset*")))
# Generate dictionaries. # Generate dictionaries.
for (name, key_idx, has_parent), (source, layout) in zip(structure, sources_and_layouts): for (name, key_idx, has_parent), (source, layout) in zip(structure, sources_and_layouts):
filename = os.path.join(path, 'dictionary_%s.xml' % name) filename = os.path.join(path, 'dictionary_%s.xml' % name)

View File

@ -0,0 +1,122 @@
import pytest
import os
from helpers.cluster import ClickHouseCluster
from helpers.test_tools import TSV, assert_eq_with_retry
from generate_dictionaries import generate_structure, generate_dictionaries, DictionaryTestTable
# Directory that contains this test file; config paths are resolved relative to it.
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))

# Placeholders assigned by setup_module() before any test runs.
cluster = None
instance = None
test_table = None
def setup_module(module):
    """Generate the dictionary configs and assign the module globals `cluster`, `instance` and `test_table`."""
    global cluster
    global instance
    global test_table

    configs_dir = os.path.join(SCRIPT_DIR, 'configs')
    dictionaries_dir = os.path.join(SCRIPT_DIR, 'configs/dictionaries')
    source_tsv = os.path.join(SCRIPT_DIR, 'configs/dictionaries/source.tsv')

    # Generated file names become the main configs of the instance.
    dictionary_files = generate_dictionaries(dictionaries_dir, generate_structure())

    cluster = ClickHouseCluster(__file__, base_configs_dir=configs_dir)
    instance = cluster.add_instance('instance', main_configs=dictionary_files)
    test_table = DictionaryTestTable(source_tsv)
@pytest.fixture(scope="module")
def started_cluster():
    """Module-scoped fixture: start the cluster, create the ClickHouse source table and yield the cluster.

    The cluster is shut down in `finally`, i.e. both on normal teardown and when
    startup or the preparation steps raise.
    """
    try:
        cluster.start()
        test_table.create_clickhouse_source(instance)
        # Print the names of the dictionaries known to the server (Python 2 print statement).
        for line in TSV(instance.query('select name from system.dictionaries')).lines:
            print line,
        yield cluster
    finally:
        cluster.shutdown()
# Parametrized fixture: each test using it runs once per tuple below.
# `ids` repeats the dictionary names so that they are used as the test ids.
@pytest.fixture(params=[
    # name, keys, use_parent
    ('clickhouse_hashed', ('id',), True),
    ('clickhouse_flat', ('id',), True),
    ('clickhouse_complex_integers_key_hashed', ('key0', 'key1'), False),
    ('clickhouse_complex_mixed_key_hashed', ('key0_str', 'key1'), False),
    ('clickhouse_range_hashed', ('id', 'StartDate', 'EndDate'), False),
],
    ids=['clickhouse_hashed', 'clickhouse_flat',
         'clickhouse_complex_integers_key_hashed',
         'clickhouse_complex_mixed_key_hashed',
         'clickhouse_range_hashed']
)
def dictionary_structure(started_cluster, request):
    """Return the current `(name, keys, use_parent)` tuple; depends on `started_cluster` so the cluster is running."""
    return request.param
def test_select_all(dictionary_structure):
name, keys, use_parent = dictionary_structure
query = instance.query
structure = test_table.get_structure_for_keys(keys, use_parent)
query('''
DROP TABLE IF EXISTS test.{0}
'''.format(name))
create_query = "CREATE TABLE test.{0} ({1}) engine = Dictionary({0})".format(name, structure)
TSV(query(create_query))
result = TSV(query('select * from test.{0}'.format(name)))
diff = test_table.compare_by_keys(keys, result.lines, use_parent, add_not_found_rows=True)
print test_table.process_diff(diff)
assert not diff
# Parametrized fixture for the cache-layout dictionaries: each test using it runs once per tuple below.
# `ids` repeats the dictionary names so that they are used as the test ids.
@pytest.fixture(params=[
    # name, keys, use_parent
    ('clickhouse_cache', ('id',), True),
    ('clickhouse_complex_integers_key_cache', ('key0', 'key1'), False),
    ('clickhouse_complex_mixed_key_cache', ('key0_str', 'key1'), False)
],
    ids=['clickhouse_cache', 'clickhouse_complex_integers_key_cache', 'clickhouse_complex_mixed_key_cache']
)
def cached_dictionary_structure(started_cluster, request):
    """Return the current `(name, keys, use_parent)` tuple; depends on `started_cluster` so the cluster is running."""
    return request.param
def test_select_all_from_cached(cached_dictionary_structure):
name, keys, use_parent = cached_dictionary_structure
query = instance.query
structure = test_table.get_structure_for_keys(keys, use_parent)
query('''
DROP TABLE IF EXISTS test.{0}
'''.format(name))
create_query = "CREATE TABLE test.{0} ({1}) engine = Dictionary({0})".format(name, structure)
TSV(query(create_query))
for i in range(4):
result = TSV(query('select * from test.{0}'.format(name)))
diff = test_table.compare_by_keys(keys, result.lines, use_parent, add_not_found_rows=False)
print test_table.process_diff(diff)
assert not diff
key = []
for key_name in keys:
if key_name.endswith('str'):
key.append("'" + str(i) + "'")
else:
key.append(str(i))
if len(key) == 1:
key = 'toUInt64(' + str(i) + ')'
else:
key = str('(' + ','.join(key) + ')')
query("select dictGetUInt8('{0}', 'UInt8_', {1})".format(name, key))
result = TSV(query('select * from test.{0}'.format(name)))
diff = test_table.compare_by_keys(keys, result.lines, use_parent, add_not_found_rows=True)
print test_table.process_diff(diff)
assert not diff

View File

@ -0,0 +1,30 @@
<?xml version="1.0"?>
<yandex>
<logger>
<level>trace</level>
<log>/var/log/clickhouse-server/clickhouse-server.log</log>
<errorlog>/var/log/clickhouse-server/clickhouse-server.err.log</errorlog>
<size>1000M</size>
<count>10</count>
</logger>
<tcp_port>9000</tcp_port>
<listen_host>127.0.0.1</listen_host>
<openSSL>
<client>
<cacheSessions>true</cacheSessions>
<verificationMode>none</verificationMode>
<invalidCertificateHandler>
<name>AcceptCertificateHandler</name>
</invalidCertificateHandler>
</client>
</openSSL>
<max_concurrent_queries>500</max_concurrent_queries>
<mark_cache_size>5368709120</mark_cache_size>
<path>./clickhouse/</path>
<users_config>users.xml</users_config>
<dictionaries_config>/etc/clickhouse-server/config.d/*.xml</dictionaries_config>
</yandex>

View File

@ -1,7 +1,7 @@
<?xml version="1.0"?> <?xml version="1.0"?>
<yandex> <yandex>
<dictionary> <dictionary>
<name>cmd</name> <name>executable</name>
<source> <source>
<executable> <executable>
<command>echo '7\t8';</command> <command>echo '7\t8';</command>

View File

@ -4,7 +4,7 @@
<name>file</name> <name>file</name>
<source> <source>
<file> <file>
<path>/etc/clickhouse-server/config.d/dictionary_preset_file.txt</path> <path>/etc/clickhouse-server/config.d/file.txt</path>
<format>TabSeparated</format> <format>TabSeparated</format>
</file> </file>
</source> </source>
@ -21,7 +21,7 @@
<name>no_file</name> <name>no_file</name>
<source> <source>
<file> <file>
<path>/etc/clickhouse-server/config.d/dictionary_preset_no_file.txt</path> <path>/etc/clickhouse-server/config.d/no_file.txt</path>
<format>TabSeparated</format> <format>TabSeparated</format>
</file> </file>
</source> </source>
@ -38,7 +38,7 @@
<name>no_file_2</name> <name>no_file_2</name>
<source> <source>
<file> <file>
<path>/etc/clickhouse-server/config.d/dictionary_preset_no_file_2.txt</path> <path>/etc/clickhouse-server/config.d/no_file_2.txt</path>
<format>TabSeparated</format> <format>TabSeparated</format>
</file> </file>
</source> </source>

View File

@ -1,7 +1,7 @@
<?xml version="1.0"?> <?xml version="1.0"?>
<yandex> <yandex>
<dictionary> <dictionary>
<name>longload</name> <name>slow</name>
<source> <source>
<executable> <executable>
<command>sleep 100 &amp;&amp; echo '5\t6';</command> <command>sleep 100 &amp;&amp; echo '5\t6';</command>

View File

@ -0,0 +1,23 @@
<?xml version="1.0"?>
<yandex>
<profiles>
<default>
</default>
</profiles>
<users>
<default>
<password></password>
<networks incl="networks" replace="replace">
<ip>::/0</ip>
</networks>
<profile>default</profile>
<quota>default</quota>
</default>
</users>
<quotas>
<default>
</default>
</quotas>
</yandex>

View File

@ -0,0 +1,246 @@
import pytest
import os
import time
from helpers.cluster import ClickHouseCluster
from helpers.test_tools import assert_eq_with_retry
# Directory that contains this test file; the `configs` directory is resolved relative to it.
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
# Files passed to the instance as main configs. Besides the dictionary definitions this
# includes `file.txt`, the data file read by the `file` dictionary.
DICTIONARY_FILES = ['configs/dictionaries/cache_xypairs.xml', 'configs/dictionaries/executable.xml', 'configs/dictionaries/file.xml', 'configs/dictionaries/file.txt', 'configs/dictionaries/slow.xml']

# Single-instance cluster shared by every test in this module.
cluster = ClickHouseCluster(__file__, base_configs_dir=os.path.join(SCRIPT_DIR, 'configs'))
instance = cluster.add_instance('instance', main_configs=DICTIONARY_FILES)
@pytest.fixture(scope="module")
def started_cluster():
    """Module-scoped fixture: start the cluster, make sure database `test` exists and yield the cluster.

    The cluster is shut down in `finally`, i.e. both on normal teardown and when
    startup or the preparation query raises.
    """
    try:
        cluster.start()
        instance.query("CREATE DATABASE IF NOT EXISTS test")
        yield cluster
    finally:
        cluster.shutdown()
def get_status(dictionary_name):
    """Return the `status` value from system.dictionaries for `dictionary_name`, without trailing newlines."""
    status_query = "SELECT status FROM system.dictionaries WHERE name='" + dictionary_name + "'"
    raw_status = instance.query(status_query)
    return raw_status.rstrip("\n")
def get_last_exception(dictionary_name):
    """Return the `last_exception` value from system.dictionaries for `dictionary_name`.

    Trailing newlines are removed and every `\\'` sequence is replaced with a plain `'`.
    """
    exception_query = "SELECT last_exception FROM system.dictionaries WHERE name='" + dictionary_name + "'"
    message = instance.query(exception_query).rstrip("\n")
    return message.replace("\\'", "'")
def get_loading_start_time(dictionary_name):
    """Return `loading_start_time` of the dictionary as a `time.struct_time`.

    Returns None when the server reports the all-zero timestamp "0000-00-00 00:00:00".
    """
    start_time_query = "SELECT loading_start_time FROM system.dictionaries WHERE name='" + dictionary_name + "'"
    raw_value = instance.query(start_time_query).rstrip("\n")
    return None if raw_value == "0000-00-00 00:00:00" else time.strptime(raw_value, "%Y-%m-%d %H:%M:%S")
def get_loading_duration(dictionary_name):
    """Return the `loading_duration` value from system.dictionaries for `dictionary_name` as a float."""
    duration_query = "SELECT loading_duration FROM system.dictionaries WHERE name='" + dictionary_name + "'"
    raw_duration = instance.query(duration_query)
    return float(raw_duration)
def replace_in_file_in_container(file_name, what, replace_with):
    """Run `sed -i` inside the instance's container, replacing every `what` with `replace_with` in `file_name`.

    `what` and `replace_with` are inserted into the sed expression as-is, without escaping.
    """
    sed_expression = 's/' + what + '/' + replace_with + '/g'
    command = 'sed -i "' + sed_expression + '" ' + file_name
    instance.exec_in_container(command)
def test_reload_while_loading(started_cluster):
    """Check that reloading the `slow` dictionary while it is still loading restarts the loading.

    Each restart is detected through `loading_start_time` (must increase) and
    `loading_duration` (must drop) as reported by system.dictionaries.
    """
    query = instance.query

    # The dictionary has not been requested yet, so it is expected to be NOT_LOADED.
    # NOTE(review): the original comment attributed this to `dictionaries_lazy_load == false`;
    # deferred loading normally corresponds to lazy loading being enabled - confirm against the server config.
    assert get_status('slow') == "NOT_LOADED"
    assert get_loading_duration('slow') == 0

    # It's not possible to get a value from the dictionary within 1.0 second, so the following query fails by timeout.
    assert query("SELECT dictGetInt32('slow', 'a', toUInt64(5))", timeout = 1, ignore_error = True) == ""

    # The dictionary is now loading.
    assert get_status('slow') == "LOADING"
    start_time, duration = get_loading_start_time('slow'), get_loading_duration('slow')
    assert duration > 0

    time.sleep(0.5) # Still loading.
    assert get_status('slow') == "LOADING"
    # Same loading attempt: the start time is unchanged and the duration does not decrease.
    prev_start_time, prev_duration = start_time, duration
    start_time, duration = get_loading_start_time('slow'), get_loading_duration('slow')
    assert start_time == prev_start_time
    assert duration >= prev_duration

    # SYSTEM RELOAD DICTIONARY should restart loading.
    query("SYSTEM RELOAD DICTIONARY 'slow'")
    assert get_status('slow') == "LOADING"
    # New loading attempt: a later start time and a smaller duration.
    prev_start_time, prev_duration = start_time, duration
    start_time, duration = get_loading_start_time('slow'), get_loading_duration('slow')
    assert start_time > prev_start_time
    assert duration < prev_duration

    time.sleep(0.5) # Still loading.
    assert get_status('slow') == "LOADING"
    prev_start_time, prev_duration = start_time, duration
    start_time, duration = get_loading_start_time('slow'), get_loading_duration('slow')
    assert start_time == prev_start_time
    assert duration >= prev_duration

    # SYSTEM RELOAD DICTIONARIES should restart loading again.
    query("SYSTEM RELOAD DICTIONARIES")
    assert get_status('slow') == "LOADING"
    prev_start_time, prev_duration = start_time, duration
    start_time, duration = get_loading_start_time('slow'), get_loading_duration('slow')
    assert start_time > prev_start_time
    assert duration < prev_duration

    # Changing the configuration file should restart loading one more time.
    replace_in_file_in_container('/etc/clickhouse-server/config.d/slow.xml', 'sleep 100', 'sleep 0')
    time.sleep(5) # Configuration files are reloaded once in 5 seconds.

    # This time loading should finish quickly.
    assert get_status('slow') == "LOADED"
    assert query("SELECT dictGetInt32('slow', 'a', toUInt64(5))") == "6\n"
def test_reload_after_loading(started_cluster):
    """Check that already loaded dictionaries pick up changed data on explicit and on periodic reloads."""
    query = instance.query

    assert query("SELECT dictGetInt32('executable', 'a', toUInt64(7))") == "8\n"
    assert query("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "10\n"

    # Change the dictionaries' data.
    # Note: the replacement is a global `sed` substitution, so every occurrence of the
    # pattern in the file is rewritten, not only the value being tested.
    replace_in_file_in_container('/etc/clickhouse-server/config.d/executable.xml', '8', '81')
    replace_in_file_in_container('/etc/clickhouse-server/config.d/file.txt', '10', '101')

    # SYSTEM RELOAD 'name' reloads only the specified dictionary.
    query("SYSTEM RELOAD DICTIONARY 'executable'")
    assert query("SELECT dictGetInt32('executable', 'a', toUInt64(7))") == "81\n"
    assert query("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "10\n"

    query("SYSTEM RELOAD DICTIONARY 'file'")
    assert query("SELECT dictGetInt32('executable', 'a', toUInt64(7))") == "81\n"
    assert query("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "101\n"

    # SYSTEM RELOAD DICTIONARIES reloads all loaded dictionaries.
    replace_in_file_in_container('/etc/clickhouse-server/config.d/executable.xml', '81', '82')
    replace_in_file_in_container('/etc/clickhouse-server/config.d/file.txt', '101', '102')
    query("SYSTEM RELOAD DICTIONARIES")
    assert query("SELECT dictGetInt32('executable', 'a', toUInt64(7))") == "82\n"
    assert query("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "102\n"

    # Configuration files are reloaded and lifetimes are checked automatically once in 5 seconds.
    replace_in_file_in_container('/etc/clickhouse-server/config.d/executable.xml', '82', '83')
    replace_in_file_in_container('/etc/clickhouse-server/config.d/file.txt', '102', '103')
    time.sleep(5)
    assert query("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "103\n"
    assert query("SELECT dictGetInt32('executable', 'a', toUInt64(7))") == "83\n"
def test_reload_after_fail_by_system_reload(started_cluster):
    """Check how SYSTEM RELOAD DICTIONARY treats a dictionary whose file source appears and disappears.

    The dictionary must stay FAILED while the file is missing, become LOADED once the
    file exists, and keep serving the loaded data after the file is removed again.
    """
    query = instance.query

    # The dictionary has not been requested yet, so it is expected to be NOT_LOADED.
    assert get_status("no_file") == "NOT_LOADED"

    # We expect an error because the file source doesn't exist.
    expected_error = "No such file"
    assert expected_error in instance.query_and_get_error("SELECT dictGetInt32('no_file', 'a', toUInt64(9))")
    assert get_status("no_file") == "FAILED"

    # SYSTEM RELOAD should not change anything now, the status is still FAILED.
    query("SYSTEM RELOAD DICTIONARY 'no_file'")
    assert expected_error in instance.query_and_get_error("SELECT dictGetInt32('no_file', 'a', toUInt64(9))")
    assert get_status("no_file") == "FAILED"

    # Creating the file source makes the dictionary able to load.
    instance.copy_file_to_container(os.path.join(SCRIPT_DIR, "configs/dictionaries/file.txt"), "/etc/clickhouse-server/config.d/no_file.txt")
    query("SYSTEM RELOAD DICTIONARY 'no_file'")
    # The comparison result used to be discarded; assert it so a wrong value fails the test.
    assert query("SELECT dictGetInt32('no_file', 'a', toUInt64(9))") == "10\n"
    assert get_status("no_file") == "LOADED"

    # Removing the file source should not spoil the loaded dictionary.
    instance.exec_in_container("rm /etc/clickhouse-server/config.d/no_file.txt")
    query("SYSTEM RELOAD DICTIONARY 'no_file'")
    assert query("SELECT dictGetInt32('no_file', 'a', toUInt64(9))") == "10\n"
    assert get_status("no_file") == "LOADED"
def test_reload_after_fail_by_timer(started_cluster):
    """Check how the periodic reload treats a dictionary whose file source appears and disappears.

    Same scenario as the SYSTEM RELOAD variant, but every state change is awaited
    with a 6 second sleep instead of an explicit reload command.
    """
    query = instance.query

    # The dictionary has not been requested yet, so it is expected to be NOT_LOADED.
    assert get_status("no_file_2") == "NOT_LOADED"

    # We expect an error because the file source doesn't exist.
    expected_error = "No such file"
    assert expected_error in instance.query_and_get_error("SELECT dictGetInt32('no_file_2', 'a', toUInt64(9))")
    assert get_status("no_file_2") == "FAILED"

    # Passed time should not change anything now, the status is still FAILED.
    time.sleep(6)
    assert expected_error in instance.query_and_get_error("SELECT dictGetInt32('no_file_2', 'a', toUInt64(9))")
    assert get_status("no_file_2") == "FAILED"

    # Creating the file source makes the dictionary able to load.
    instance.copy_file_to_container(os.path.join(SCRIPT_DIR, "configs/dictionaries/file.txt"), "/etc/clickhouse-server/config.d/no_file_2.txt")
    time.sleep(6)
    # The comparison result used to be discarded; assert it so a wrong value fails the test.
    assert query("SELECT dictGetInt32('no_file_2', 'a', toUInt64(9))") == "10\n"
    assert get_status("no_file_2") == "LOADED"

    # Removing the file source should not spoil the loaded dictionary.
    instance.exec_in_container("rm /etc/clickhouse-server/config.d/no_file_2.txt")
    time.sleep(6)
    assert query("SELECT dictGetInt32('no_file_2', 'a', toUInt64(9))") == "10\n"
    assert get_status("no_file_2") == "LOADED"
def test_reload_after_fail_in_cache_dictionary(started_cluster):
    """Check that the `cache_xypairs` dictionary survives its source table being dropped and recreated.

    Already cached keys must stay readable while the source is missing, keys that are
    not cached must report the source error, and new values must show up after the
    table is recreated.
    """
    query = instance.query
    query_and_get_error = instance.query_and_get_error

    # Can't get a value from the cache dictionary because the source (table `test.xypairs`) doesn't respond.
    expected_error = "Table test.xypairs doesn't exist"
    assert expected_error in query_and_get_error("SELECT dictGetUInt64('cache_xypairs', 'y', toUInt64(1))")
    assert get_status("cache_xypairs") == "LOADED"
    assert expected_error in get_last_exception("cache_xypairs")

    # Create table `test.xypairs`.
    query('''
DROP TABLE IF EXISTS test.xypairs;
CREATE TABLE test.xypairs (x UInt64, y UInt64) ENGINE=Log;
INSERT INTO test.xypairs VALUES (1, 56), (3, 78);
''')

    # Cache dictionary now works.
    assert_eq_with_retry(instance, "SELECT dictGet('cache_xypairs', 'y', toUInt64(1))", "56", ignore_error=True)
    # The plain comparisons below used to be discarded and also lacked the trailing
    # newline that `query` returns, so they could never hold; both are fixed here.
    assert query("SELECT dictGet('cache_xypairs', 'y', toUInt64(2))") == "0\n"
    assert get_last_exception("cache_xypairs") == ""

    # Drop table `test.xypairs`.
    query('DROP TABLE test.xypairs')

    # Values are cached so we can get them.
    assert query("SELECT dictGet('cache_xypairs', 'y', toUInt64(1))") == "56\n"
    assert query("SELECT dictGet('cache_xypairs', 'y', toUInt64(2))") == "0\n"
    assert get_last_exception("cache_xypairs") == ""

    # But we can't get a value from the source table which isn't cached.
    assert expected_error in query_and_get_error("SELECT dictGetUInt64('cache_xypairs', 'y', toUInt64(3))")
    assert expected_error in get_last_exception("cache_xypairs")

    # Passed time should not spoil the cache.
    time.sleep(5)
    assert query("SELECT dictGet('cache_xypairs', 'y', toUInt64(1))") == "56\n"
    assert query("SELECT dictGet('cache_xypairs', 'y', toUInt64(2))") == "0\n"
    assert expected_error in query_and_get_error("SELECT dictGetUInt64('cache_xypairs', 'y', toUInt64(3))")
    assert expected_error in get_last_exception("cache_xypairs")

    # Create table `test.xypairs` again with changed values.
    query('''
CREATE TABLE test.xypairs (x UInt64, y UInt64) ENGINE=Log;
INSERT INTO test.xypairs VALUES (1, 57), (3, 79);
''')

    # The cache dictionary returns new values now.
    assert_eq_with_retry(instance, "SELECT dictGet('cache_xypairs', 'y', toUInt64(1))", "57")
    assert query("SELECT dictGet('cache_xypairs', 'y', toUInt64(2))") == "0\n"
    assert query("SELECT dictGet('cache_xypairs', 'y', toUInt64(3))") == "79\n"
    assert get_last_exception("cache_xypairs") == ""

View File

@ -0,0 +1,4 @@
<?xml version="1.0"?>
<yandex>
<timezone>America/Los_Angeles</timezone>
</yandex>

View File

@ -0,0 +1,17 @@
import pytest
from helpers.cluster import ClickHouseCluster
# Single-node cluster; the node receives `configs/config.xml` as its main config.
cluster = ClickHouseCluster(__file__)
node = cluster.add_instance('node', main_configs=['configs/config.xml'])
@pytest.fixture(scope="module")
def start_cluster():
    """Module-scoped fixture: start the cluster and yield it; the cluster is always shut down afterwards."""
    try:
        cluster.start()
        yield cluster
    finally:
        cluster.shutdown()
def test_check_timezone_config(start_cluster):
    """Check that the node renders Unix timestamp 1111111111 as "2005-03-17 17:58:31"."""
    # NOTE(review): the expected value presumably reflects the <timezone> set in configs/config.xml - confirm.
    rendered = node.query("SELECT toDateTime(1111111111)")
    assert rendered == "2005-03-17 17:58:31\n"

Some files were not shown because too many files have changed in this diff Show More