From 80410ce7466711b55d1612049452e5250537fc86 Mon Sep 17 00:00:00 2001
From: Alexey Arno <af-arno@yandex-team.ru>
Date: Mon, 16 Mar 2015 13:41:58 +0300
Subject: [PATCH 01/55] dbms: Server: feature development. [#METR-15090]

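Raise DEFAULT_AIO_FILE_BLOCK_SIZE from 512 to 4096, construct the AIOContext
members of ReadBufferAIO and WriteBufferAIO with an explicit argument of 1,
stop assigning request.aio_reqprio before submitting, and put braces around
the EINTR retry loops that wrap io_submit and io_getevents. Also fixes a
grammatical error in the ReadBufferAIO class comment.
---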
 dbms/include/DB/Core/Defines.h      | 2 +-
 dbms/include/DB/IO/ReadBufferAIO.h  | 4 ++--
 dbms/include/DB/IO/WriteBufferAIO.h | 2 +-
 dbms/src/IO/ReadBufferAIO.cpp       | 5 ++++-
 dbms/src/IO/WriteBufferAIO.cpp      | 5 ++++-
 5 files changed, 12 insertions(+), 6 deletions(-)
diff --git a/dbms/include/DB/Core/Defines.h b/dbms/include/DB/Core/Defines.h
index ced6e2ddfcf..a4d2959e68c 100644
--- a/dbms/include/DB/Core/Defines.h
+++ b/dbms/include/DB/Core/Defines.h
@@ -72,7 +72,7 @@
 #define DBMS_DISTRIBUTED_DIRECTORY_MONITOR_SLEEP_TIME_MS		100
 
 /// Граница, на которых должны быть выровнены блоки для асинхронных файловых операций.
-#define DEFAULT_AIO_FILE_BLOCK_SIZE								512
+#define DEFAULT_AIO_FILE_BLOCK_SIZE								4096
 
 #define ALWAYS_INLINE 	__attribute__((__always_inline__))
 #define NO_INLINE 		__attribute__((__noinline__))
diff --git a/dbms/include/DB/IO/ReadBufferAIO.h b/dbms/include/DB/IO/ReadBufferAIO.h
index 09578707b3c..e00a5c52585 100644
--- a/dbms/include/DB/IO/ReadBufferAIO.h
+++ b/dbms/include/DB/IO/ReadBufferAIO.h
@@ -12,7 +12,7 @@
 namespace DB
 {
 
-/** Класс для асинхронной чтения данных.
+/** Класс для асинхронного чтения данных.
   * Все размеры и смещения должны быть кратны DEFAULT_AIO_FILE_BLOCK_SIZE байтам.
   */
 class ReadBufferAIO : public BufferWithOwnMemory<ReadBuffer>
@@ -47,7 +47,7 @@ private:
 	std::vector<iocb *> request_ptrs;
 	std::vector<io_event> events;
 
-	AIOContext aio_context;
+	AIOContext aio_context{1};
 
 	const std::string filename;
 
diff --git a/dbms/include/DB/IO/WriteBufferAIO.h b/dbms/include/DB/IO/WriteBufferAIO.h
index 732a45ec588..9f84cc94d89 100644
--- a/dbms/include/DB/IO/WriteBufferAIO.h
+++ b/dbms/include/DB/IO/WriteBufferAIO.h
@@ -49,7 +49,7 @@ private:
 	std::vector<iocb *> request_ptrs;
 	std::vector<io_event> events;
 
-	AIOContext aio_context;
+	AIOContext aio_context{1};
 
 	const std::string filename;
 
diff --git a/dbms/src/IO/ReadBufferAIO.cpp b/dbms/src/IO/ReadBufferAIO.cpp
index d47355bdfb2..937f0ccaa51 100644
--- a/dbms/src/IO/ReadBufferAIO.cpp
+++ b/dbms/src/IO/ReadBufferAIO.cpp
@@ -158,15 +158,16 @@ bool ReadBufferAIO::nextImpl()
 	request.aio_buf = reinterpret_cast<UInt64>(fill_buffer.internalBuffer().begin());
 	request.aio_nbytes = std::min(fill_buffer.internalBuffer().size(), max_bytes_read);
 	request.aio_offset = pos_in_file;
-	request.aio_reqprio = 0;
 
 	/// Отправить запрос.
 	while (io_submit(aio_context.ctx, request_ptrs.size(), &request_ptrs[0]) < 0)
+	{
 		if (errno != EINTR)
 		{
 			got_exception = true;
 			throw Exception("Cannot submit request for asynchronous IO on file " + filename, ErrorCodes::AIO_SUBMIT_ERROR);
 		}
+	}
 
 	is_pending_read = true;
 	return true;
@@ -177,11 +178,13 @@ void ReadBufferAIO::waitForAIOCompletion()
 	if (is_pending_read)
 	{
 		while (io_getevents(aio_context.ctx, events.size(), events.size(), &events[0], nullptr) < 0)
+		{
 			if (errno != EINTR)
 			{
 				got_exception = true;
 				throw Exception("Failed to wait for asynchronous IO completion on file " + filename, ErrorCodes::AIO_COMPLETION_ERROR);
 			}
+		}
 
 		is_pending_read = false;
 		off_t bytes_read = events[0].res;
diff --git a/dbms/src/IO/WriteBufferAIO.cpp b/dbms/src/IO/WriteBufferAIO.cpp
index 7ac26c08b53..3b4a93514eb 100644
--- a/dbms/src/IO/WriteBufferAIO.cpp
+++ b/dbms/src/IO/WriteBufferAIO.cpp
@@ -145,7 +145,6 @@ void WriteBufferAIO::nextImpl()
 	request.aio_buf = reinterpret_cast<UInt64>(flush_buffer.buffer().begin());
 	request.aio_nbytes = flush_buffer.offset();
 	request.aio_offset = pos_in_file;
-	request.aio_reqprio = 0;
 
 	if ((request.aio_nbytes % DEFAULT_AIO_FILE_BLOCK_SIZE) != 0)
 	{
@@ -155,11 +154,13 @@ void WriteBufferAIO::nextImpl()
 
 	/// Отправить запрос.
 	while (io_submit(aio_context.ctx, request_ptrs.size(), &request_ptrs[0]) < 0)
+	{
 		if (errno != EINTR)
 		{
 			got_exception = true;
 			throw Exception("Cannot submit request for asynchronous IO on file " + filename, ErrorCodes::AIO_SUBMIT_ERROR);
 		}
+	}
 
 	is_pending_write = true;
 }
@@ -169,11 +170,13 @@ void WriteBufferAIO::waitForAIOCompletion()
 	if (is_pending_write)
 	{
 		while (io_getevents(aio_context.ctx, events.size(), events.size(), &events[0], nullptr) < 0)
+		{
 			if (errno != EINTR)
 			{
 				got_exception = true;
 				throw Exception("Failed to wait for asynchronous IO completion on file " + filename, ErrorCodes::AIO_COMPLETION_ERROR);
 			}
+		}
 
 		is_pending_write = false;
 		off_t bytes_written = events[0].res;

From ee22aac2fe23189578cfceb2a77479fab3377611 Mon Sep 17 00:00:00 2001
From: Alexey Arno <af-arno@yandex-team.ru>
Date: Mon, 16 Mar 2015 13:49:27 +0300
Subject: [PATCH 02/55] dbms: Server: feature development. [#METR-15090]

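Initialize request_ptrs and events with default member initializers in the
ReadBufferAIO and WriteBufferAIO headers instead of in the constructors'
member initializer lists.
---
NOTE(review): the constructors used events(1); the headers now use events{1}.
Presumably this still selects the size constructor because io_event is not
constructible from an int, but please confirm that events holds exactly one
element, since waitForAIOCompletion reads events[0].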
 dbms/include/DB/IO/ReadBufferAIO.h  | 4 ++--
 dbms/include/DB/IO/WriteBufferAIO.h | 4 ++--
 dbms/src/IO/ReadBufferAIO.cpp       | 2 +-
 dbms/src/IO/WriteBufferAIO.cpp      | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/dbms/include/DB/IO/ReadBufferAIO.h b/dbms/include/DB/IO/ReadBufferAIO.h
index e00a5c52585..0630e5f1dc4 100644
--- a/dbms/include/DB/IO/ReadBufferAIO.h
+++ b/dbms/include/DB/IO/ReadBufferAIO.h
@@ -44,8 +44,8 @@ private:
 	BufferWithOwnMemory<ReadBuffer> fill_buffer;
 
 	iocb request;
-	std::vector<iocb *> request_ptrs;
-	std::vector<io_event> events;
+	std::vector<iocb *> request_ptrs{&request};
+	std::vector<io_event> events{1};
 
 	AIOContext aio_context{1};
 
diff --git a/dbms/include/DB/IO/WriteBufferAIO.h b/dbms/include/DB/IO/WriteBufferAIO.h
index 9f84cc94d89..274de538124 100644
--- a/dbms/include/DB/IO/WriteBufferAIO.h
+++ b/dbms/include/DB/IO/WriteBufferAIO.h
@@ -46,8 +46,8 @@ private:
 	BufferWithOwnMemory<WriteBuffer> flush_buffer;
 
 	iocb request;
-	std::vector<iocb *> request_ptrs;
-	std::vector<io_event> events;
+	std::vector<iocb *> request_ptrs{&request};
+	std::vector<io_event> events{1};
 
 	AIOContext aio_context{1};
 
diff --git a/dbms/src/IO/ReadBufferAIO.cpp b/dbms/src/IO/ReadBufferAIO.cpp
index 937f0ccaa51..28347f8252c 100644
--- a/dbms/src/IO/ReadBufferAIO.cpp
+++ b/dbms/src/IO/ReadBufferAIO.cpp
@@ -13,7 +13,7 @@ ReadBufferAIO::ReadBufferAIO(const std::string & filename_, size_t buffer_size_,
 	char * existing_memory_)
 	: BufferWithOwnMemory(buffer_size_, existing_memory_, DEFAULT_AIO_FILE_BLOCK_SIZE),
 	fill_buffer(BufferWithOwnMemory(buffer_size_, nullptr, DEFAULT_AIO_FILE_BLOCK_SIZE)),
-	request_ptrs{ &request }, events(1), filename(filename_)
+	filename(filename_)
 {
 	ProfileEvents::increment(ProfileEvents::FileOpen);
 
diff --git a/dbms/src/IO/WriteBufferAIO.cpp b/dbms/src/IO/WriteBufferAIO.cpp
index 3b4a93514eb..792b169c423 100644
--- a/dbms/src/IO/WriteBufferAIO.cpp
+++ b/dbms/src/IO/WriteBufferAIO.cpp
@@ -14,7 +14,7 @@ WriteBufferAIO::WriteBufferAIO(const std::string & filename_, size_t buffer_size
 		char * existing_memory_)
 		: BufferWithOwnMemory(buffer_size_, existing_memory_, DEFAULT_AIO_FILE_BLOCK_SIZE),
 		flush_buffer(BufferWithOwnMemory(buffer_size_, nullptr, DEFAULT_AIO_FILE_BLOCK_SIZE)),
-		request_ptrs{ &request }, events(1), filename(filename_)
+		filename(filename_)
 {
 	ProfileEvents::increment(ProfileEvents::FileOpen);
 

From 9c9adf878be21664e88c6be0a5c5941524494634 Mon Sep 17 00:00:00 2001
From: Alexey Arno <af-arno@yandex-team.ru>
Date: Mon, 16 Mar 2015 17:56:12 +0300
Subject: [PATCH 03/55] dbms: Server: feature development. [#METR-15090]

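Introduce the IReadFileOperations and IWriteFileOperations interfaces and the
ReadBufferFromFileBase / WriteBufferFromFileBase classes, which combine those
interfaces with BufferWithOwnMemory. ReadBufferAIO,
ReadBufferFromFileDescriptor, WriteBufferAIO and WriteBufferFromFileDescriptor
now derive from the new base classes and mark their file operations with
override. WriteBufferFromFileDescriptor gains getPositionInFile(), implemented
as seek(0, SEEK_CUR).
---
NOTE(review):
* ReadBufferFromFileDescriptor::seek changes its default whence from SEEK_SET
  to SEEK_CUR, and ReadBufferAIO::seek drops the SEEK_SET default from the
  header while the out-of-class definition gains a SEEK_CUR default. Existing
  one-argument seek(x) calls would silently change meaning or stop compiling;
  WriteBufferAIO and WriteBufferFromFileDescriptor keep SEEK_SET. Please
  confirm the change is intended.
* getFileName() is now noexcept in ReadBufferFromFileDescriptor and
  WriteBufferFromFileDescriptor, but it still builds its result by std::string
  concatenation, which can throw on allocation failure; under noexcept that
  terminates the process.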
 dbms/include/DB/IO/IReadFileOperations.h      | 20 +++++++++++++++++
 dbms/include/DB/IO/IWriteFileOperations.h     | 22 +++++++++++++++++++
 dbms/include/DB/IO/ReadBufferAIO.h            | 11 +++++-----
 dbms/include/DB/IO/ReadBufferFromFile.h       |  2 +-
 dbms/include/DB/IO/ReadBufferFromFileBase.h   | 18 +++++++++++++++
 .../DB/IO/ReadBufferFromFileDescriptor.h      | 13 ++++++-----
 dbms/include/DB/IO/WriteBufferAIO.h           | 15 +++++++------
 dbms/include/DB/IO/WriteBufferFromFileBase.h  | 18 +++++++++++++++
 .../DB/IO/WriteBufferFromFileDescriptor.h     | 20 +++++++++++------
 dbms/src/IO/ReadBufferAIO.cpp                 |  6 ++---
 dbms/src/IO/WriteBufferAIO.cpp                |  2 +-
 11 files changed, 117 insertions(+), 30 deletions(-)
 create mode 100644 dbms/include/DB/IO/IReadFileOperations.h
 create mode 100644 dbms/include/DB/IO/IWriteFileOperations.h
 create mode 100644 dbms/include/DB/IO/ReadBufferFromFileBase.h
 create mode 100644 dbms/include/DB/IO/WriteBufferFromFileBase.h

diff --git a/dbms/include/DB/IO/IReadFileOperations.h b/dbms/include/DB/IO/IReadFileOperations.h
new file mode 100644
index 00000000000..d370494ccda
--- /dev/null
+++ b/dbms/include/DB/IO/IReadFileOperations.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include <string>
+#include <fcntl.h>
+#include <sys/types.h>
+
+namespace DB
+{
+
+class IReadFileOperations
+{
+public:
+	virtual ~IReadFileOperations() = default;
+	virtual off_t seek(off_t off, int whence) = 0;
+	virtual off_t getPositionInFile() = 0;
+	virtual std::string getFileName() const noexcept = 0;
+	virtual int getFD() const noexcept = 0;
+};
+
+}
diff --git a/dbms/include/DB/IO/IWriteFileOperations.h b/dbms/include/DB/IO/IWriteFileOperations.h
new file mode 100644
index 00000000000..0e0a0466d5f
--- /dev/null
+++ b/dbms/include/DB/IO/IWriteFileOperations.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <string>
+#include <fcntl.h>
+#include <sys/types.h>
+
+namespace DB
+{
+
+class IWriteFileOperations
+{
+public:
+	virtual ~IWriteFileOperations() = default;
+	virtual off_t seek(off_t off, int whence) = 0;
+	virtual off_t getPositionInFile() = 0;
+	virtual void truncate(off_t length) = 0;
+	virtual void sync() = 0;
+	virtual std::string getFileName() const noexcept = 0;
+	virtual int getFD() const noexcept = 0;
+};
+
+}
diff --git a/dbms/include/DB/IO/ReadBufferAIO.h b/dbms/include/DB/IO/ReadBufferAIO.h
index 0630e5f1dc4..78752273364 100644
--- a/dbms/include/DB/IO/ReadBufferAIO.h
+++ b/dbms/include/DB/IO/ReadBufferAIO.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <DB/IO/ReadBufferFromFileBase.h>
 #include <DB/IO/ReadBuffer.h>
 #include <DB/IO/BufferWithOwnMemory.h>
 #include <statdaemons/AIO.h>
@@ -15,7 +16,7 @@ namespace DB
 /** Класс для асинхронного чтения данных.
   * Все размеры и смещения должны быть кратны DEFAULT_AIO_FILE_BLOCK_SIZE байтам.
   */
-class ReadBufferAIO : public BufferWithOwnMemory<ReadBuffer>
+class ReadBufferAIO : public ReadBufferFromFileBase
 {
 public:
 	ReadBufferAIO(const std::string & filename_, size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE, int flags_ = -1, mode_t mode_ = 0666,
@@ -26,10 +27,10 @@ public:
 	ReadBufferAIO & operator=(const ReadBufferAIO &) = delete;
 
 	void setMaxBytes(size_t max_bytes_read_);
-	off_t seek(off_t off, int whence = SEEK_SET);
-	off_t getPositionInFile();
-	std::string getFileName() const noexcept { return filename; }
-	int getFD() const noexcept { return fd; }
+	off_t seek(off_t off, int whence) override;
+	off_t getPositionInFile() override;
+	std::string getFileName() const noexcept override { return filename; }
+	int getFD() const noexcept override { return fd; }
 
 private:
 	off_t getPositionInFileRelaxed() const noexcept;
diff --git a/dbms/include/DB/IO/ReadBufferFromFile.h b/dbms/include/DB/IO/ReadBufferFromFile.h
index 69ce0fdfc7f..4ff277ee98a 100644
--- a/dbms/include/DB/IO/ReadBufferFromFile.h
+++ b/dbms/include/DB/IO/ReadBufferFromFile.h
@@ -33,7 +33,7 @@ public:
 		close(fd);
 	}
 
-	virtual std::string getFileName()
+	std::string getFileName() const noexcept override
 	{
 		return file_name;
 	}
diff --git a/dbms/include/DB/IO/ReadBufferFromFileBase.h b/dbms/include/DB/IO/ReadBufferFromFileBase.h
new file mode 100644
index 00000000000..3e8c3f63ca1
--- /dev/null
+++ b/dbms/include/DB/IO/ReadBufferFromFileBase.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include <DB/IO/IReadFileOperations.h>
+#include <DB/IO/BufferWithOwnMemory.h>
+#include <DB/IO/ReadBuffer.h>
+
+namespace DB
+{
+
+class ReadBufferFromFileBase : public IReadFileOperations, public BufferWithOwnMemory<ReadBuffer>
+{
+public:
+	ReadBufferFromFileBase(size_t buf_size, char * existing_memory, size_t alignment)
+	: BufferWithOwnMemory<ReadBuffer>(buf_size, existing_memory, alignment) {}
+	virtual ~ReadBufferFromFileBase() = default;
+};
+
+}
diff --git a/dbms/include/DB/IO/ReadBufferFromFileDescriptor.h b/dbms/include/DB/IO/ReadBufferFromFileDescriptor.h
index b412055f509..909811f57cc 100644
--- a/dbms/include/DB/IO/ReadBufferFromFileDescriptor.h
+++ b/dbms/include/DB/IO/ReadBufferFromFileDescriptor.h
@@ -8,6 +8,7 @@
 #include <DB/Core/Exception.h>
 #include <DB/Core/ErrorCodes.h>
 
+#include <DB/IO/ReadBufferFromFileBase.h>
 #include <DB/IO/ReadBuffer.h>
 #include <DB/IO/WriteHelpers.h>
 #include <DB/IO/BufferWithOwnMemory.h>
@@ -18,7 +19,7 @@ namespace DB
 
 /** Работает с готовым файловым дескриптором. Не открывает и не закрывает файл.
   */
-class ReadBufferFromFileDescriptor : public BufferWithOwnMemory<ReadBuffer>
+class ReadBufferFromFileDescriptor : public ReadBufferFromFileBase
 {
 protected:
 	int fd;
@@ -53,22 +54,22 @@ protected:
 	}
 
 	/// Имя или описание файла
-	virtual std::string getFileName()
+	virtual std::string getFileName() const noexcept override
 	{
 		return "(fd = " + toString(fd) + ")";
 	}
 
 public:
 	ReadBufferFromFileDescriptor(int fd_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0)
-		: BufferWithOwnMemory<ReadBuffer>(buf_size, existing_memory, alignment), fd(fd_), pos_in_file(0) {}
+		: ReadBufferFromFileBase(buf_size, existing_memory, alignment), fd(fd_), pos_in_file(0) {}
 
-	int getFD()
+	int getFD() const noexcept override
 	{
 		return fd;
 	}
 
 	/// Если offset такой маленький, что мы не выйдем за пределы буфера, настоящий seek по файлу не делается.
-	off_t seek(off_t offset, int whence = SEEK_SET)
+	off_t seek(off_t offset, int whence = SEEK_CUR) override
 	{
 		off_t new_pos = offset;
 		if (whence == SEEK_CUR)
@@ -99,7 +100,7 @@ public:
 		}
 	}
 
-	off_t getPositionInFile()
+	off_t getPositionInFile() override
 	{
 		return pos_in_file - (working_buffer.end() - pos);
 	}
diff --git a/dbms/include/DB/IO/WriteBufferAIO.h b/dbms/include/DB/IO/WriteBufferAIO.h
index 274de538124..5ce79db30ce 100644
--- a/dbms/include/DB/IO/WriteBufferAIO.h
+++ b/dbms/include/DB/IO/WriteBufferAIO.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <DB/IO/WriteBufferFromFileBase.h>
 #include <DB/IO/WriteBuffer.h>
 #include <DB/IO/BufferWithOwnMemory.h>
 #include <statdaemons/AIO.h>
@@ -14,7 +15,7 @@ namespace DB
 /** Класс для асинхронной записи данных.
   * Все размеры и смещения должны быть кратны DEFAULT_AIO_FILE_BLOCK_SIZE байтам.
   */
-class WriteBufferAIO : public BufferWithOwnMemory<WriteBuffer>
+class WriteBufferAIO : public WriteBufferFromFileBase
 {
 public:
 	WriteBufferAIO(const std::string & filename_, size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE, int flags_ = -1, mode_t mode_ = 0666,
@@ -24,12 +25,12 @@ public:
 	WriteBufferAIO(const WriteBufferAIO &) = delete;
 	WriteBufferAIO & operator=(const WriteBufferAIO &) = delete;
 
-	off_t seek(off_t off, int whence = SEEK_SET);
-	off_t getPositionInFile();
-	void truncate(off_t length = 0);
-	void sync();
-	std::string getFileName() const noexcept { return filename; }
-	int getFD() const noexcept { return fd; }
+	off_t seek(off_t off, int whence = SEEK_SET) override;
+	off_t getPositionInFile() override;
+	void truncate(off_t length = 0) override;
+	void sync() override;
+	std::string getFileName() const noexcept override { return filename; }
+	int getFD() const noexcept override { return fd; }
 
 private:
 	/// Если в буфере ещё остались данные - запишем их.
diff --git a/dbms/include/DB/IO/WriteBufferFromFileBase.h b/dbms/include/DB/IO/WriteBufferFromFileBase.h
new file mode 100644
index 00000000000..7a6e582e63f
--- /dev/null
+++ b/dbms/include/DB/IO/WriteBufferFromFileBase.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include <DB/IO/IWriteFileOperations.h>
+#include <DB/IO/BufferWithOwnMemory.h>
+#include <DB/IO/WriteBuffer.h>
+
+namespace DB
+{
+
+class WriteBufferFromFileBase : public IWriteFileOperations, public BufferWithOwnMemory<WriteBuffer>
+{
+public:
+	WriteBufferFromFileBase(size_t buf_size, char * existing_memory, size_t alignment)
+	: BufferWithOwnMemory<WriteBuffer>(buf_size, existing_memory, alignment) {}
+	virtual ~WriteBufferFromFileBase() = default;
+};
+
+}
diff --git a/dbms/include/DB/IO/WriteBufferFromFileDescriptor.h b/dbms/include/DB/IO/WriteBufferFromFileDescriptor.h
index 6dd76e9d642..def6f45c0a8 100644
--- a/dbms/include/DB/IO/WriteBufferFromFileDescriptor.h
+++ b/dbms/include/DB/IO/WriteBufferFromFileDescriptor.h
@@ -6,6 +6,7 @@
 #include <DB/Core/Exception.h>
 #include <DB/Core/ErrorCodes.h>
 
+#include <DB/IO/WriteBufferFromFileBase.h>
 #include <DB/IO/WriteBuffer.h>
 #include <DB/IO/WriteHelpers.h>
 #include <DB/IO/BufferWithOwnMemory.h>
@@ -16,7 +17,7 @@ namespace DB
 
 /** Работает с готовым файловым дескриптором. Не открывает и не закрывает файл.
   */
-class WriteBufferFromFileDescriptor : public BufferWithOwnMemory<WriteBuffer>
+class WriteBufferFromFileDescriptor : public WriteBufferFromFileBase
 {
 protected:
 	int fd;
@@ -40,14 +41,14 @@ protected:
 	}
 
 	/// Имя или описание файла
-	virtual std::string getFileName()
+	virtual std::string getFileName() const noexcept override
 	{
 		return "(fd = " + toString(fd) + ")";
 	}
 
 public:
 	WriteBufferFromFileDescriptor(int fd_ = -1, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0)
-		: BufferWithOwnMemory<WriteBuffer>(buf_size, existing_memory, alignment), fd(fd_) {}
+		: WriteBufferFromFileBase(buf_size, existing_memory, alignment), fd(fd_) {}
 
 	/** Можно вызывать для инициализации, если нужный fd не был передан в конструктор.
 	  * Менять fd во время работы нельзя.
@@ -69,12 +70,12 @@ public:
 		}
 	}
 
-	int getFD()
+	int getFD() const noexcept override
 	{
 		return fd;
 	}
 
-	off_t seek(off_t offset, int whence = SEEK_SET)
+	off_t seek(off_t offset, int whence = SEEK_SET) override
 	{
 		off_t res = lseek(fd, offset, whence);
 		if (-1 == res)
@@ -82,14 +83,19 @@ public:
 		return res;
 	}
 
-	void truncate(off_t length = 0)
+	off_t getPositionInFile() override
+	{
+		return seek(0, SEEK_CUR);
+	}
+
+	void truncate(off_t length = 0) override
 	{
 		int res = ftruncate(fd, length);
 		if (-1 == res)
 			throwFromErrno("Cannot truncate file " + getFileName(), ErrorCodes::CANNOT_TRUNCATE_FILE);
 	}
 
-	void sync()
+	void sync() override
 	{
 		/// Если в буфере ещё остались данные - запишем их.
 		next();
diff --git a/dbms/src/IO/ReadBufferAIO.cpp b/dbms/src/IO/ReadBufferAIO.cpp
index 28347f8252c..3cdedbf8d94 100644
--- a/dbms/src/IO/ReadBufferAIO.cpp
+++ b/dbms/src/IO/ReadBufferAIO.cpp
@@ -11,8 +11,8 @@ namespace DB
 
 ReadBufferAIO::ReadBufferAIO(const std::string & filename_, size_t buffer_size_, int flags_, mode_t mode_,
 	char * existing_memory_)
-	: BufferWithOwnMemory(buffer_size_, existing_memory_, DEFAULT_AIO_FILE_BLOCK_SIZE),
-	fill_buffer(BufferWithOwnMemory(buffer_size_, nullptr, DEFAULT_AIO_FILE_BLOCK_SIZE)),
+	: ReadBufferFromFileBase(buffer_size_, existing_memory_, DEFAULT_AIO_FILE_BLOCK_SIZE),
+	fill_buffer(BufferWithOwnMemory<ReadBuffer>(buffer_size_, nullptr, DEFAULT_AIO_FILE_BLOCK_SIZE)),
 	filename(filename_)
 {
 	ProfileEvents::increment(ProfileEvents::FileOpen);
@@ -64,7 +64,7 @@ void ReadBufferAIO::setMaxBytes(size_t max_bytes_read_)
 	max_bytes_read = max_bytes_read_;
 }
 
-off_t ReadBufferAIO::seek(off_t off, int whence)
+off_t ReadBufferAIO::seek(off_t off, int whence = SEEK_CUR)
 {
 	if ((off % DEFAULT_AIO_FILE_BLOCK_SIZE) != 0)
 		throw Exception("Invalid offset for ReadBufferAIO::seek", ErrorCodes::AIO_UNALIGNED_SIZE_ERROR);
diff --git a/dbms/src/IO/WriteBufferAIO.cpp b/dbms/src/IO/WriteBufferAIO.cpp
index 792b169c423..a4f17bfa3ff 100644
--- a/dbms/src/IO/WriteBufferAIO.cpp
+++ b/dbms/src/IO/WriteBufferAIO.cpp
@@ -12,7 +12,7 @@ namespace DB
 
 WriteBufferAIO::WriteBufferAIO(const std::string & filename_, size_t buffer_size_, int flags_, mode_t mode_,
 		char * existing_memory_)
-		: BufferWithOwnMemory(buffer_size_, existing_memory_, DEFAULT_AIO_FILE_BLOCK_SIZE),
+		: WriteBufferFromFileBase(buffer_size_, existing_memory_, DEFAULT_AIO_FILE_BLOCK_SIZE),
 		flush_buffer(BufferWithOwnMemory(buffer_size_, nullptr, DEFAULT_AIO_FILE_BLOCK_SIZE)),
 		filename(filename_)
 {

From 93ae3550c786db91f888a71059e226798e3d1314 Mon Sep 17 00:00:00 2001
From: Alexey Arno <af-arno@yandex-team.ru>
Date: Mon, 16 Mar 2015 17:56:12 +0300
Subject: [PATCH 04/55] dbms: Server: feature development. [#METR-15090]

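---
NOTE(review): the diff in this patch is the same as in PATCH 03/55 (same Date,
same diffstat, same blob ids on every "index" line). Applied in sequence, its
"new file" hunks would be expected to fail because PATCH 03/55 has already
created those files. Presumably a duplicate produced by the history export;
confirm and drop one of the two.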
 dbms/include/DB/IO/IReadFileOperations.h      | 20 +++++++++++++++++
 dbms/include/DB/IO/IWriteFileOperations.h     | 22 +++++++++++++++++++
 dbms/include/DB/IO/ReadBufferAIO.h            | 11 +++++-----
 dbms/include/DB/IO/ReadBufferFromFile.h       |  2 +-
 dbms/include/DB/IO/ReadBufferFromFileBase.h   | 18 +++++++++++++++
 .../DB/IO/ReadBufferFromFileDescriptor.h      | 13 ++++++-----
 dbms/include/DB/IO/WriteBufferAIO.h           | 15 +++++++------
 dbms/include/DB/IO/WriteBufferFromFileBase.h  | 18 +++++++++++++++
 .../DB/IO/WriteBufferFromFileDescriptor.h     | 20 +++++++++++------
 dbms/src/IO/ReadBufferAIO.cpp                 |  6 ++---
 dbms/src/IO/WriteBufferAIO.cpp                |  2 +-
 11 files changed, 117 insertions(+), 30 deletions(-)
 create mode 100644 dbms/include/DB/IO/IReadFileOperations.h
 create mode 100644 dbms/include/DB/IO/IWriteFileOperations.h
 create mode 100644 dbms/include/DB/IO/ReadBufferFromFileBase.h
 create mode 100644 dbms/include/DB/IO/WriteBufferFromFileBase.h

diff --git a/dbms/include/DB/IO/IReadFileOperations.h b/dbms/include/DB/IO/IReadFileOperations.h
new file mode 100644
index 00000000000..d370494ccda
--- /dev/null
+++ b/dbms/include/DB/IO/IReadFileOperations.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include <string>
+#include <fcntl.h>
+#include <sys/types.h>
+
+namespace DB
+{
+
+class IReadFileOperations
+{
+public:
+	virtual ~IReadFileOperations() = default;
+	virtual off_t seek(off_t off, int whence) = 0;
+	virtual off_t getPositionInFile() = 0;
+	virtual std::string getFileName() const noexcept = 0;
+	virtual int getFD() const noexcept = 0;
+};
+
+}
diff --git a/dbms/include/DB/IO/IWriteFileOperations.h b/dbms/include/DB/IO/IWriteFileOperations.h
new file mode 100644
index 00000000000..0e0a0466d5f
--- /dev/null
+++ b/dbms/include/DB/IO/IWriteFileOperations.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <string>
+#include <fcntl.h>
+#include <sys/types.h>
+
+namespace DB
+{
+
+class IWriteFileOperations
+{
+public:
+	virtual ~IWriteFileOperations() = default;
+	virtual off_t seek(off_t off, int whence) = 0;
+	virtual off_t getPositionInFile() = 0;
+	virtual void truncate(off_t length) = 0;
+	virtual void sync() = 0;
+	virtual std::string getFileName() const noexcept = 0;
+	virtual int getFD() const noexcept = 0;
+};
+
+}
diff --git a/dbms/include/DB/IO/ReadBufferAIO.h b/dbms/include/DB/IO/ReadBufferAIO.h
index 0630e5f1dc4..78752273364 100644
--- a/dbms/include/DB/IO/ReadBufferAIO.h
+++ b/dbms/include/DB/IO/ReadBufferAIO.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <DB/IO/ReadBufferFromFileBase.h>
 #include <DB/IO/ReadBuffer.h>
 #include <DB/IO/BufferWithOwnMemory.h>
 #include <statdaemons/AIO.h>
@@ -15,7 +16,7 @@ namespace DB
 /** Класс для асинхронного чтения данных.
   * Все размеры и смещения должны быть кратны DEFAULT_AIO_FILE_BLOCK_SIZE байтам.
   */
-class ReadBufferAIO : public BufferWithOwnMemory<ReadBuffer>
+class ReadBufferAIO : public ReadBufferFromFileBase
 {
 public:
 	ReadBufferAIO(const std::string & filename_, size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE, int flags_ = -1, mode_t mode_ = 0666,
@@ -26,10 +27,10 @@ public:
 	ReadBufferAIO & operator=(const ReadBufferAIO &) = delete;
 
 	void setMaxBytes(size_t max_bytes_read_);
-	off_t seek(off_t off, int whence = SEEK_SET);
-	off_t getPositionInFile();
-	std::string getFileName() const noexcept { return filename; }
-	int getFD() const noexcept { return fd; }
+	off_t seek(off_t off, int whence) override;
+	off_t getPositionInFile() override;
+	std::string getFileName() const noexcept override { return filename; }
+	int getFD() const noexcept override { return fd; }
 
 private:
 	off_t getPositionInFileRelaxed() const noexcept;
diff --git a/dbms/include/DB/IO/ReadBufferFromFile.h b/dbms/include/DB/IO/ReadBufferFromFile.h
index 69ce0fdfc7f..4ff277ee98a 100644
--- a/dbms/include/DB/IO/ReadBufferFromFile.h
+++ b/dbms/include/DB/IO/ReadBufferFromFile.h
@@ -33,7 +33,7 @@ public:
 		close(fd);
 	}
 
-	virtual std::string getFileName()
+	std::string getFileName() const noexcept override
 	{
 		return file_name;
 	}
diff --git a/dbms/include/DB/IO/ReadBufferFromFileBase.h b/dbms/include/DB/IO/ReadBufferFromFileBase.h
new file mode 100644
index 00000000000..3e8c3f63ca1
--- /dev/null
+++ b/dbms/include/DB/IO/ReadBufferFromFileBase.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include <DB/IO/IReadFileOperations.h>
+#include <DB/IO/BufferWithOwnMemory.h>
+#include <DB/IO/ReadBuffer.h>
+
+namespace DB
+{
+
+class ReadBufferFromFileBase : public IReadFileOperations, public BufferWithOwnMemory<ReadBuffer>
+{
+public:
+	ReadBufferFromFileBase(size_t buf_size, char * existing_memory, size_t alignment)
+	: BufferWithOwnMemory<ReadBuffer>(buf_size, existing_memory, alignment) {}
+	virtual ~ReadBufferFromFileBase() = default;
+};
+
+}
diff --git a/dbms/include/DB/IO/ReadBufferFromFileDescriptor.h b/dbms/include/DB/IO/ReadBufferFromFileDescriptor.h
index b412055f509..909811f57cc 100644
--- a/dbms/include/DB/IO/ReadBufferFromFileDescriptor.h
+++ b/dbms/include/DB/IO/ReadBufferFromFileDescriptor.h
@@ -8,6 +8,7 @@
 #include <DB/Core/Exception.h>
 #include <DB/Core/ErrorCodes.h>
 
+#include <DB/IO/ReadBufferFromFileBase.h>
 #include <DB/IO/ReadBuffer.h>
 #include <DB/IO/WriteHelpers.h>
 #include <DB/IO/BufferWithOwnMemory.h>
@@ -18,7 +19,7 @@ namespace DB
 
 /** Работает с готовым файловым дескриптором. Не открывает и не закрывает файл.
   */
-class ReadBufferFromFileDescriptor : public BufferWithOwnMemory<ReadBuffer>
+class ReadBufferFromFileDescriptor : public ReadBufferFromFileBase
 {
 protected:
 	int fd;
@@ -53,22 +54,22 @@ protected:
 	}
 
 	/// Имя или описание файла
-	virtual std::string getFileName()
+	virtual std::string getFileName() const noexcept override
 	{
 		return "(fd = " + toString(fd) + ")";
 	}
 
 public:
 	ReadBufferFromFileDescriptor(int fd_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0)
-		: BufferWithOwnMemory<ReadBuffer>(buf_size, existing_memory, alignment), fd(fd_), pos_in_file(0) {}
+		: ReadBufferFromFileBase(buf_size, existing_memory, alignment), fd(fd_), pos_in_file(0) {}
 
-	int getFD()
+	int getFD() const noexcept override
 	{
 		return fd;
 	}
 
 	/// Если offset такой маленький, что мы не выйдем за пределы буфера, настоящий seek по файлу не делается.
-	off_t seek(off_t offset, int whence = SEEK_SET)
+	off_t seek(off_t offset, int whence = SEEK_CUR) override
 	{
 		off_t new_pos = offset;
 		if (whence == SEEK_CUR)
@@ -99,7 +100,7 @@ public:
 		}
 	}
 
-	off_t getPositionInFile()
+	off_t getPositionInFile() override
 	{
 		return pos_in_file - (working_buffer.end() - pos);
 	}
diff --git a/dbms/include/DB/IO/WriteBufferAIO.h b/dbms/include/DB/IO/WriteBufferAIO.h
index 274de538124..5ce79db30ce 100644
--- a/dbms/include/DB/IO/WriteBufferAIO.h
+++ b/dbms/include/DB/IO/WriteBufferAIO.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <DB/IO/WriteBufferFromFileBase.h>
 #include <DB/IO/WriteBuffer.h>
 #include <DB/IO/BufferWithOwnMemory.h>
 #include <statdaemons/AIO.h>
@@ -14,7 +15,7 @@ namespace DB
 /** Класс для асинхронной записи данных.
   * Все размеры и смещения должны быть кратны DEFAULT_AIO_FILE_BLOCK_SIZE байтам.
   */
-class WriteBufferAIO : public BufferWithOwnMemory<WriteBuffer>
+class WriteBufferAIO : public WriteBufferFromFileBase
 {
 public:
 	WriteBufferAIO(const std::string & filename_, size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE, int flags_ = -1, mode_t mode_ = 0666,
@@ -24,12 +25,12 @@ public:
 	WriteBufferAIO(const WriteBufferAIO &) = delete;
 	WriteBufferAIO & operator=(const WriteBufferAIO &) = delete;
 
-	off_t seek(off_t off, int whence = SEEK_SET);
-	off_t getPositionInFile();
-	void truncate(off_t length = 0);
-	void sync();
-	std::string getFileName() const noexcept { return filename; }
-	int getFD() const noexcept { return fd; }
+	off_t seek(off_t off, int whence = SEEK_SET) override;
+	off_t getPositionInFile() override;
+	void truncate(off_t length = 0) override;
+	void sync() override;
+	std::string getFileName() const noexcept override { return filename; }
+	int getFD() const noexcept override { return fd; }
 
 private:
 	/// Если в буфере ещё остались данные - запишем их.
diff --git a/dbms/include/DB/IO/WriteBufferFromFileBase.h b/dbms/include/DB/IO/WriteBufferFromFileBase.h
new file mode 100644
index 00000000000..7a6e582e63f
--- /dev/null
+++ b/dbms/include/DB/IO/WriteBufferFromFileBase.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include <DB/IO/IWriteFileOperations.h>
+#include <DB/IO/BufferWithOwnMemory.h>
+#include <DB/IO/WriteBuffer.h>
+
+namespace DB
+{
+
+class WriteBufferFromFileBase : public IWriteFileOperations, public BufferWithOwnMemory<WriteBuffer>
+{
+public:
+	WriteBufferFromFileBase(size_t buf_size, char * existing_memory, size_t alignment)
+	: BufferWithOwnMemory<WriteBuffer>(buf_size, existing_memory, alignment) {}
+	virtual ~WriteBufferFromFileBase() = default;
+};
+
+}
diff --git a/dbms/include/DB/IO/WriteBufferFromFileDescriptor.h b/dbms/include/DB/IO/WriteBufferFromFileDescriptor.h
index 6dd76e9d642..def6f45c0a8 100644
--- a/dbms/include/DB/IO/WriteBufferFromFileDescriptor.h
+++ b/dbms/include/DB/IO/WriteBufferFromFileDescriptor.h
@@ -6,6 +6,7 @@
 #include <DB/Core/Exception.h>
 #include <DB/Core/ErrorCodes.h>
 
+#include <DB/IO/WriteBufferFromFileBase.h>
 #include <DB/IO/WriteBuffer.h>
 #include <DB/IO/WriteHelpers.h>
 #include <DB/IO/BufferWithOwnMemory.h>
@@ -16,7 +17,7 @@ namespace DB
 
 /** Работает с готовым файловым дескриптором. Не открывает и не закрывает файл.
   */
-class WriteBufferFromFileDescriptor : public BufferWithOwnMemory<WriteBuffer>
+class WriteBufferFromFileDescriptor : public WriteBufferFromFileBase
 {
 protected:
 	int fd;
@@ -40,14 +41,14 @@ protected:
 	}
 
 	/// Имя или описание файла
-	virtual std::string getFileName()
+	virtual std::string getFileName() const noexcept override
 	{
 		return "(fd = " + toString(fd) + ")";
 	}
 
 public:
 	WriteBufferFromFileDescriptor(int fd_ = -1, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0)
-		: BufferWithOwnMemory<WriteBuffer>(buf_size, existing_memory, alignment), fd(fd_) {}
+		: WriteBufferFromFileBase(buf_size, existing_memory, alignment), fd(fd_) {}
 
 	/** Можно вызывать для инициализации, если нужный fd не был передан в конструктор.
 	  * Менять fd во время работы нельзя.
@@ -69,12 +70,12 @@ public:
 		}
 	}
 
-	int getFD()
+	int getFD() const noexcept override
 	{
 		return fd;
 	}
 
-	off_t seek(off_t offset, int whence = SEEK_SET)
+	off_t seek(off_t offset, int whence = SEEK_SET) override
 	{
 		off_t res = lseek(fd, offset, whence);
 		if (-1 == res)
@@ -82,14 +83,19 @@ public:
 		return res;
 	}
 
-	void truncate(off_t length = 0)
+	off_t getPositionInFile() override
+	{
+		return seek(0, SEEK_CUR);
+	}
+
+	void truncate(off_t length = 0) override
 	{
 		int res = ftruncate(fd, length);
 		if (-1 == res)
 			throwFromErrno("Cannot truncate file " + getFileName(), ErrorCodes::CANNOT_TRUNCATE_FILE);
 	}
 
-	void sync()
+	void sync() override
 	{
 		/// Если в буфере ещё остались данные - запишем их.
 		next();
diff --git a/dbms/src/IO/ReadBufferAIO.cpp b/dbms/src/IO/ReadBufferAIO.cpp
index 28347f8252c..3cdedbf8d94 100644
--- a/dbms/src/IO/ReadBufferAIO.cpp
+++ b/dbms/src/IO/ReadBufferAIO.cpp
@@ -11,8 +11,8 @@ namespace DB
 
 ReadBufferAIO::ReadBufferAIO(const std::string & filename_, size_t buffer_size_, int flags_, mode_t mode_,
 	char * existing_memory_)
-	: BufferWithOwnMemory(buffer_size_, existing_memory_, DEFAULT_AIO_FILE_BLOCK_SIZE),
-	fill_buffer(BufferWithOwnMemory(buffer_size_, nullptr, DEFAULT_AIO_FILE_BLOCK_SIZE)),
+	: ReadBufferFromFileBase(buffer_size_, existing_memory_, DEFAULT_AIO_FILE_BLOCK_SIZE),
+	fill_buffer(BufferWithOwnMemory<ReadBuffer>(buffer_size_, nullptr, DEFAULT_AIO_FILE_BLOCK_SIZE)),
 	filename(filename_)
 {
 	ProfileEvents::increment(ProfileEvents::FileOpen);
@@ -64,7 +64,7 @@ void ReadBufferAIO::setMaxBytes(size_t max_bytes_read_)
 	max_bytes_read = max_bytes_read_;
 }
 
-off_t ReadBufferAIO::seek(off_t off, int whence)
+off_t ReadBufferAIO::seek(off_t off, int whence = SEEK_CUR)
 {
 	if ((off % DEFAULT_AIO_FILE_BLOCK_SIZE) != 0)
 		throw Exception("Invalid offset for ReadBufferAIO::seek", ErrorCodes::AIO_UNALIGNED_SIZE_ERROR);
diff --git a/dbms/src/IO/WriteBufferAIO.cpp b/dbms/src/IO/WriteBufferAIO.cpp
index 792b169c423..a4f17bfa3ff 100644
--- a/dbms/src/IO/WriteBufferAIO.cpp
+++ b/dbms/src/IO/WriteBufferAIO.cpp
@@ -12,7 +12,7 @@ namespace DB
 
 WriteBufferAIO::WriteBufferAIO(const std::string & filename_, size_t buffer_size_, int flags_, mode_t mode_,
 		char * existing_memory_)
-		: BufferWithOwnMemory(buffer_size_, existing_memory_, DEFAULT_AIO_FILE_BLOCK_SIZE),
+		: WriteBufferFromFileBase(buffer_size_, existing_memory_, DEFAULT_AIO_FILE_BLOCK_SIZE),
 		flush_buffer(BufferWithOwnMemory(buffer_size_, nullptr, DEFAULT_AIO_FILE_BLOCK_SIZE)),
 		filename(filename_)
 {

From bb1b66162960b1e75474254d7a12cc2134433a8e Mon Sep 17 00:00:00 2001
From: Alexey Arno <af-arno@yandex-team.ru>
Date: Tue, 17 Mar 2015 00:05:44 +0300
Subject: [PATCH 05/55] dbms: Server: feature development. [#METR-15090]

---
 dbms/include/DB/IO/ReadBufferAIO.h            | 11 +++++-----
 dbms/include/DB/IO/ReadBufferFromFile.h       |  2 +-
 .../DB/IO/ReadBufferFromFileDescriptor.h      | 13 ++++++------
 dbms/include/DB/IO/WriteBufferAIO.h           | 15 +++++++-------
 .../DB/IO/WriteBufferFromFileDescriptor.h     | 20 +++++++------------
 dbms/src/IO/ReadBufferAIO.cpp                 |  6 +++---
 dbms/src/IO/WriteBufferAIO.cpp                |  2 +-
 7 files changed, 30 insertions(+), 39 deletions(-)

diff --git a/dbms/include/DB/IO/ReadBufferAIO.h b/dbms/include/DB/IO/ReadBufferAIO.h
index 78752273364..0630e5f1dc4 100644
--- a/dbms/include/DB/IO/ReadBufferAIO.h
+++ b/dbms/include/DB/IO/ReadBufferAIO.h
@@ -1,6 +1,5 @@
 #pragma once
 
-#include <DB/IO/ReadBufferFromFileBase.h>
 #include <DB/IO/ReadBuffer.h>
 #include <DB/IO/BufferWithOwnMemory.h>
 #include <statdaemons/AIO.h>
@@ -16,7 +15,7 @@ namespace DB
 /** Класс для асинхронного чтения данных.
   * Все размеры и смещения должны быть кратны DEFAULT_AIO_FILE_BLOCK_SIZE байтам.
   */
-class ReadBufferAIO : public ReadBufferFromFileBase
+class ReadBufferAIO : public BufferWithOwnMemory<ReadBuffer>
 {
 public:
 	ReadBufferAIO(const std::string & filename_, size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE, int flags_ = -1, mode_t mode_ = 0666,
@@ -27,10 +26,10 @@ public:
 	ReadBufferAIO & operator=(const ReadBufferAIO &) = delete;
 
 	void setMaxBytes(size_t max_bytes_read_);
-	off_t seek(off_t off, int whence) override;
-	off_t getPositionInFile() override;
-	std::string getFileName() const noexcept override { return filename; }
-	int getFD() const noexcept override { return fd; }
+	off_t seek(off_t off, int whence = SEEK_SET);
+	off_t getPositionInFile();
+	std::string getFileName() const noexcept { return filename; }
+	int getFD() const noexcept { return fd; }
 
 private:
 	off_t getPositionInFileRelaxed() const noexcept;
diff --git a/dbms/include/DB/IO/ReadBufferFromFile.h b/dbms/include/DB/IO/ReadBufferFromFile.h
index 4ff277ee98a..69ce0fdfc7f 100644
--- a/dbms/include/DB/IO/ReadBufferFromFile.h
+++ b/dbms/include/DB/IO/ReadBufferFromFile.h
@@ -33,7 +33,7 @@ public:
 		close(fd);
 	}
 
-	std::string getFileName() const noexcept override
+	virtual std::string getFileName()
 	{
 		return file_name;
 	}
diff --git a/dbms/include/DB/IO/ReadBufferFromFileDescriptor.h b/dbms/include/DB/IO/ReadBufferFromFileDescriptor.h
index 909811f57cc..b412055f509 100644
--- a/dbms/include/DB/IO/ReadBufferFromFileDescriptor.h
+++ b/dbms/include/DB/IO/ReadBufferFromFileDescriptor.h
@@ -8,7 +8,6 @@
 #include <DB/Core/Exception.h>
 #include <DB/Core/ErrorCodes.h>
 
-#include <DB/IO/ReadBufferFromFileBase.h>
 #include <DB/IO/ReadBuffer.h>
 #include <DB/IO/WriteHelpers.h>
 #include <DB/IO/BufferWithOwnMemory.h>
@@ -19,7 +18,7 @@ namespace DB
 
 /** Работает с готовым файловым дескриптором. Не открывает и не закрывает файл.
   */
-class ReadBufferFromFileDescriptor : public ReadBufferFromFileBase
+class ReadBufferFromFileDescriptor : public BufferWithOwnMemory<ReadBuffer>
 {
 protected:
 	int fd;
@@ -54,22 +53,22 @@ protected:
 	}
 
 	/// Имя или описание файла
-	virtual std::string getFileName() const noexcept override
+	virtual std::string getFileName()
 	{
 		return "(fd = " + toString(fd) + ")";
 	}
 
 public:
 	ReadBufferFromFileDescriptor(int fd_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0)
-		: ReadBufferFromFileBase(buf_size, existing_memory, alignment), fd(fd_), pos_in_file(0) {}
+		: BufferWithOwnMemory<ReadBuffer>(buf_size, existing_memory, alignment), fd(fd_), pos_in_file(0) {}
 
-	int getFD() const noexcept override
+	int getFD()
 	{
 		return fd;
 	}
 
 	/// Если offset такой маленький, что мы не выйдем за пределы буфера, настоящий seek по файлу не делается.
-	off_t seek(off_t offset, int whence = SEEK_CUR) override
+	off_t seek(off_t offset, int whence = SEEK_SET)
 	{
 		off_t new_pos = offset;
 		if (whence == SEEK_CUR)
@@ -100,7 +99,7 @@ public:
 		}
 	}
 
-	off_t getPositionInFile() override
+	off_t getPositionInFile()
 	{
 		return pos_in_file - (working_buffer.end() - pos);
 	}
diff --git a/dbms/include/DB/IO/WriteBufferAIO.h b/dbms/include/DB/IO/WriteBufferAIO.h
index 5ce79db30ce..274de538124 100644
--- a/dbms/include/DB/IO/WriteBufferAIO.h
+++ b/dbms/include/DB/IO/WriteBufferAIO.h
@@ -1,6 +1,5 @@
 #pragma once
 
-#include <DB/IO/WriteBufferFromFileBase.h>
 #include <DB/IO/WriteBuffer.h>
 #include <DB/IO/BufferWithOwnMemory.h>
 #include <statdaemons/AIO.h>
@@ -15,7 +14,7 @@ namespace DB
 /** Класс для асинхронной записи данных.
   * Все размеры и смещения должны быть кратны DEFAULT_AIO_FILE_BLOCK_SIZE байтам.
   */
-class WriteBufferAIO : public WriteBufferFromFileBase
+class WriteBufferAIO : public BufferWithOwnMemory<WriteBuffer>
 {
 public:
 	WriteBufferAIO(const std::string & filename_, size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE, int flags_ = -1, mode_t mode_ = 0666,
@@ -25,12 +24,12 @@ public:
 	WriteBufferAIO(const WriteBufferAIO &) = delete;
 	WriteBufferAIO & operator=(const WriteBufferAIO &) = delete;
 
-	off_t seek(off_t off, int whence = SEEK_SET) override;
-	off_t getPositionInFile() override;
-	void truncate(off_t length = 0) override;
-	void sync() override;
-	std::string getFileName() const noexcept override { return filename; }
-	int getFD() const noexcept override { return fd; }
+	off_t seek(off_t off, int whence = SEEK_SET);
+	off_t getPositionInFile();
+	void truncate(off_t length = 0);
+	void sync();
+	std::string getFileName() const noexcept { return filename; }
+	int getFD() const noexcept { return fd; }
 
 private:
 	/// Если в буфере ещё остались данные - запишем их.
diff --git a/dbms/include/DB/IO/WriteBufferFromFileDescriptor.h b/dbms/include/DB/IO/WriteBufferFromFileDescriptor.h
index def6f45c0a8..6dd76e9d642 100644
--- a/dbms/include/DB/IO/WriteBufferFromFileDescriptor.h
+++ b/dbms/include/DB/IO/WriteBufferFromFileDescriptor.h
@@ -6,7 +6,6 @@
 #include <DB/Core/Exception.h>
 #include <DB/Core/ErrorCodes.h>
 
-#include <DB/IO/WriteBufferFromFileBase.h>
 #include <DB/IO/WriteBuffer.h>
 #include <DB/IO/WriteHelpers.h>
 #include <DB/IO/BufferWithOwnMemory.h>
@@ -17,7 +16,7 @@ namespace DB
 
 /** Работает с готовым файловым дескриптором. Не открывает и не закрывает файл.
   */
-class WriteBufferFromFileDescriptor : public WriteBufferFromFileBase
+class WriteBufferFromFileDescriptor : public BufferWithOwnMemory<WriteBuffer>
 {
 protected:
 	int fd;
@@ -41,14 +40,14 @@ protected:
 	}
 
 	/// Имя или описание файла
-	virtual std::string getFileName() const noexcept override
+	virtual std::string getFileName()
 	{
 		return "(fd = " + toString(fd) + ")";
 	}
 
 public:
 	WriteBufferFromFileDescriptor(int fd_ = -1, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0)
-		: WriteBufferFromFileBase(buf_size, existing_memory, alignment), fd(fd_) {}
+		: BufferWithOwnMemory<WriteBuffer>(buf_size, existing_memory, alignment), fd(fd_) {}
 
 	/** Можно вызывать для инициализации, если нужный fd не был передан в конструктор.
 	  * Менять fd во время работы нельзя.
@@ -70,12 +69,12 @@ public:
 		}
 	}
 
-	int getFD() const noexcept override
+	int getFD()
 	{
 		return fd;
 	}
 
-	off_t seek(off_t offset, int whence = SEEK_SET) override
+	off_t seek(off_t offset, int whence = SEEK_SET)
 	{
 		off_t res = lseek(fd, offset, whence);
 		if (-1 == res)
@@ -83,19 +82,14 @@ public:
 		return res;
 	}
 
-	off_t getPositionInFile() override
-	{
-		return seek(0, SEEK_CUR);
-	}
-
-	void truncate(off_t length = 0) override
+	void truncate(off_t length = 0)
 	{
 		int res = ftruncate(fd, length);
 		if (-1 == res)
 			throwFromErrno("Cannot truncate file " + getFileName(), ErrorCodes::CANNOT_TRUNCATE_FILE);
 	}
 
-	void sync() override
+	void sync()
 	{
 		/// Если в буфере ещё остались данные - запишем их.
 		next();
diff --git a/dbms/src/IO/ReadBufferAIO.cpp b/dbms/src/IO/ReadBufferAIO.cpp
index 3cdedbf8d94..28347f8252c 100644
--- a/dbms/src/IO/ReadBufferAIO.cpp
+++ b/dbms/src/IO/ReadBufferAIO.cpp
@@ -11,8 +11,8 @@ namespace DB
 
 ReadBufferAIO::ReadBufferAIO(const std::string & filename_, size_t buffer_size_, int flags_, mode_t mode_,
 	char * existing_memory_)
-	: ReadBufferFromFileBase(buffer_size_, existing_memory_, DEFAULT_AIO_FILE_BLOCK_SIZE),
-	fill_buffer(BufferWithOwnMemory<ReadBuffer>(buffer_size_, nullptr, DEFAULT_AIO_FILE_BLOCK_SIZE)),
+	: BufferWithOwnMemory(buffer_size_, existing_memory_, DEFAULT_AIO_FILE_BLOCK_SIZE),
+	fill_buffer(BufferWithOwnMemory(buffer_size_, nullptr, DEFAULT_AIO_FILE_BLOCK_SIZE)),
 	filename(filename_)
 {
 	ProfileEvents::increment(ProfileEvents::FileOpen);
@@ -64,7 +64,7 @@ void ReadBufferAIO::setMaxBytes(size_t max_bytes_read_)
 	max_bytes_read = max_bytes_read_;
 }
 
-off_t ReadBufferAIO::seek(off_t off, int whence = SEEK_CUR)
+off_t ReadBufferAIO::seek(off_t off, int whence)
 {
 	if ((off % DEFAULT_AIO_FILE_BLOCK_SIZE) != 0)
 		throw Exception("Invalid offset for ReadBufferAIO::seek", ErrorCodes::AIO_UNALIGNED_SIZE_ERROR);
diff --git a/dbms/src/IO/WriteBufferAIO.cpp b/dbms/src/IO/WriteBufferAIO.cpp
index a4f17bfa3ff..792b169c423 100644
--- a/dbms/src/IO/WriteBufferAIO.cpp
+++ b/dbms/src/IO/WriteBufferAIO.cpp
@@ -12,7 +12,7 @@ namespace DB
 
 WriteBufferAIO::WriteBufferAIO(const std::string & filename_, size_t buffer_size_, int flags_, mode_t mode_,
 		char * existing_memory_)
-		: WriteBufferFromFileBase(buffer_size_, existing_memory_, DEFAULT_AIO_FILE_BLOCK_SIZE),
+		: BufferWithOwnMemory(buffer_size_, existing_memory_, DEFAULT_AIO_FILE_BLOCK_SIZE),
 		flush_buffer(BufferWithOwnMemory(buffer_size_, nullptr, DEFAULT_AIO_FILE_BLOCK_SIZE)),
 		filename(filename_)
 {

From 73f0eac93bddc01257939caf1bf7c1757a23dee1 Mon Sep 17 00:00:00 2001
From: Alexey Arno <af-arno@yandex-team.ru>
Date: Tue, 17 Mar 2015 00:06:38 +0300
Subject: [PATCH 06/55] dbms: Server: feature development. [#METR-15090]

---
 dbms/include/DB/IO/IReadFileOperations.h     | 20 ------------------
 dbms/include/DB/IO/IWriteFileOperations.h    | 22 --------------------
 dbms/include/DB/IO/ReadBufferFromFileBase.h  | 18 ----------------
 dbms/include/DB/IO/WriteBufferFromFileBase.h | 18 ----------------
 4 files changed, 78 deletions(-)
 delete mode 100644 dbms/include/DB/IO/IReadFileOperations.h
 delete mode 100644 dbms/include/DB/IO/IWriteFileOperations.h
 delete mode 100644 dbms/include/DB/IO/ReadBufferFromFileBase.h
 delete mode 100644 dbms/include/DB/IO/WriteBufferFromFileBase.h

diff --git a/dbms/include/DB/IO/IReadFileOperations.h b/dbms/include/DB/IO/IReadFileOperations.h
deleted file mode 100644
index d370494ccda..00000000000
--- a/dbms/include/DB/IO/IReadFileOperations.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#pragma once
-
-#include <string>
-#include <fcntl.h>
-#include <sys/types.h>
-
-namespace DB
-{
-
-class IReadFileOperations
-{
-public:
-	virtual ~IReadFileOperations() = default;
-	virtual off_t seek(off_t off, int whence) = 0;
-	virtual off_t getPositionInFile() = 0;
-	virtual std::string getFileName() const noexcept = 0;
-	virtual int getFD() const noexcept = 0;
-};
-
-}
diff --git a/dbms/include/DB/IO/IWriteFileOperations.h b/dbms/include/DB/IO/IWriteFileOperations.h
deleted file mode 100644
index 0e0a0466d5f..00000000000
--- a/dbms/include/DB/IO/IWriteFileOperations.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#pragma once
-
-#include <string>
-#include <fcntl.h>
-#include <sys/types.h>
-
-namespace DB
-{
-
-class IWriteFileOperations
-{
-public:
-	virtual ~IWriteFileOperations() = default;
-	virtual off_t seek(off_t off, int whence) = 0;
-	virtual off_t getPositionInFile() = 0;
-	virtual void truncate(off_t length) = 0;
-	virtual void sync() = 0;
-	virtual std::string getFileName() const noexcept = 0;
-	virtual int getFD() const noexcept = 0;
-};
-
-}
diff --git a/dbms/include/DB/IO/ReadBufferFromFileBase.h b/dbms/include/DB/IO/ReadBufferFromFileBase.h
deleted file mode 100644
index 3e8c3f63ca1..00000000000
--- a/dbms/include/DB/IO/ReadBufferFromFileBase.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#pragma once
-
-#include <DB/IO/IReadFileOperations.h>
-#include <DB/IO/BufferWithOwnMemory.h>
-#include <DB/IO/ReadBuffer.h>
-
-namespace DB
-{
-
-class ReadBufferFromFileBase : public IReadFileOperations, public BufferWithOwnMemory<ReadBuffer>
-{
-public:
-	ReadBufferFromFileBase(size_t buf_size, char * existing_memory, size_t alignment)
-	: BufferWithOwnMemory<ReadBuffer>(buf_size, existing_memory, alignment) {}
-	virtual ~ReadBufferFromFileBase() = default;
-};
-
-}
diff --git a/dbms/include/DB/IO/WriteBufferFromFileBase.h b/dbms/include/DB/IO/WriteBufferFromFileBase.h
deleted file mode 100644
index 7a6e582e63f..00000000000
--- a/dbms/include/DB/IO/WriteBufferFromFileBase.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#pragma once
-
-#include <DB/IO/IWriteFileOperations.h>
-#include <DB/IO/BufferWithOwnMemory.h>
-#include <DB/IO/WriteBuffer.h>
-
-namespace DB
-{
-
-class WriteBufferFromFileBase : public IWriteFileOperations, public BufferWithOwnMemory<WriteBuffer>
-{
-public:
-	WriteBufferFromFileBase(size_t buf_size, char * existing_memory, size_t alignment)
-	: BufferWithOwnMemory<WriteBuffer>(buf_size, existing_memory, alignment) {}
-	virtual ~WriteBufferFromFileBase() = default;
-};
-
-}

From 53c1253ffe162145a6e6635a9b3d5708eecbbbc2 Mon Sep 17 00:00:00 2001
From: Alexey Arno <af-arno@yandex-team.ru>
Date: Tue, 17 Mar 2015 00:39:27 +0300
Subject: [PATCH 07/55] dbms: Server: feature development. [#METR-15090]

---
 dbms/src/IO/ReadBufferAIO.cpp  | 2 +-
 dbms/src/IO/WriteBufferAIO.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dbms/src/IO/ReadBufferAIO.cpp b/dbms/src/IO/ReadBufferAIO.cpp
index 28347f8252c..64a18237604 100644
--- a/dbms/src/IO/ReadBufferAIO.cpp
+++ b/dbms/src/IO/ReadBufferAIO.cpp
@@ -11,7 +11,7 @@ namespace DB
 
 ReadBufferAIO::ReadBufferAIO(const std::string & filename_, size_t buffer_size_, int flags_, mode_t mode_,
 	char * existing_memory_)
-	: BufferWithOwnMemory(buffer_size_, existing_memory_, DEFAULT_AIO_FILE_BLOCK_SIZE),
+	: BufferWithOwnMemory<ReadBuffer>(buffer_size_, existing_memory_, DEFAULT_AIO_FILE_BLOCK_SIZE),
 	fill_buffer(BufferWithOwnMemory(buffer_size_, nullptr, DEFAULT_AIO_FILE_BLOCK_SIZE)),
 	filename(filename_)
 {
diff --git a/dbms/src/IO/WriteBufferAIO.cpp b/dbms/src/IO/WriteBufferAIO.cpp
index 792b169c423..4c086b1ba56 100644
--- a/dbms/src/IO/WriteBufferAIO.cpp
+++ b/dbms/src/IO/WriteBufferAIO.cpp
@@ -12,7 +12,7 @@ namespace DB
 
 WriteBufferAIO::WriteBufferAIO(const std::string & filename_, size_t buffer_size_, int flags_, mode_t mode_,
 		char * existing_memory_)
-		: BufferWithOwnMemory(buffer_size_, existing_memory_, DEFAULT_AIO_FILE_BLOCK_SIZE),
+		: BufferWithOwnMemory<WriteBuffer>(buffer_size_, existing_memory_, DEFAULT_AIO_FILE_BLOCK_SIZE),
 		flush_buffer(BufferWithOwnMemory(buffer_size_, nullptr, DEFAULT_AIO_FILE_BLOCK_SIZE)),
 		filename(filename_)
 {

From 9ab8c585ac056f96f96a79c3a0c8f2753badd15f Mon Sep 17 00:00:00 2001
From: Alexey Arno <af-arno@yandex-team.ru>
Date: Tue, 17 Mar 2015 14:30:23 +0300
Subject: [PATCH 08/55] dbms: Server: feature development. [#METR-15090]

---
 dbms/include/DB/IO/BufferBase.h       |  2 +
 dbms/include/DB/IO/ReadBuffer.h       |  3 +-
 dbms/src/IO/ReadBufferAIO.cpp         | 22 +++++------
 dbms/src/IO/tests/read_buffer_aio.cpp | 57 ++++++++++++++++++---------
 4 files changed, 53 insertions(+), 31 deletions(-)

diff --git a/dbms/include/DB/IO/BufferBase.h b/dbms/include/DB/IO/BufferBase.h
index a986e3218bb..0fd3e41e533 100644
--- a/dbms/include/DB/IO/BufferBase.h
+++ b/dbms/include/DB/IO/BufferBase.h
@@ -99,6 +99,8 @@ protected:
 	  */
 	Buffer working_buffer;
 
+	size_t working_buffer_offset = 0;
+
 	/// Позиция чтения/записи.
 	Position pos;
 
diff --git a/dbms/include/DB/IO/ReadBuffer.h b/dbms/include/DB/IO/ReadBuffer.h
index 4e97039fce9..e089a02cc6a 100644
--- a/dbms/include/DB/IO/ReadBuffer.h
+++ b/dbms/include/DB/IO/ReadBuffer.h
@@ -47,7 +47,8 @@ public:
 		if (!res)
 			working_buffer.resize(0);
 
-		pos = working_buffer.begin();
+		pos = working_buffer.begin() + working_buffer_offset;
+		working_buffer_offset = 0;
 		return res;
 	}
 
diff --git a/dbms/src/IO/ReadBufferAIO.cpp b/dbms/src/IO/ReadBufferAIO.cpp
index 64a18237604..ea680e4b44c 100644
--- a/dbms/src/IO/ReadBufferAIO.cpp
+++ b/dbms/src/IO/ReadBufferAIO.cpp
@@ -66,9 +66,6 @@ void ReadBufferAIO::setMaxBytes(size_t max_bytes_read_)
 
 off_t ReadBufferAIO::seek(off_t off, int whence)
 {
-	if ((off % DEFAULT_AIO_FILE_BLOCK_SIZE) != 0)
-		throw Exception("Invalid offset for ReadBufferAIO::seek", ErrorCodes::AIO_UNALIGNED_SIZE_ERROR);
-
 	waitForAIOCompletion();
 
 	off_t new_pos;
@@ -157,7 +154,7 @@ bool ReadBufferAIO::nextImpl()
 	request.aio_fildes = fd;
 	request.aio_buf = reinterpret_cast<UInt64>(fill_buffer.internalBuffer().begin());
 	request.aio_nbytes = std::min(fill_buffer.internalBuffer().size(), max_bytes_read);
-	request.aio_offset = pos_in_file;
+	request.aio_offset = pos_in_file - (pos_in_file % DEFAULT_AIO_FILE_BLOCK_SIZE);
 
 	/// Отправить запрос.
 	while (io_submit(aio_context.ctx, request_ptrs.size(), &request_ptrs[0]) < 0)
@@ -194,23 +191,24 @@ void ReadBufferAIO::waitForAIOCompletion()
 			got_exception = true;
 			throw Exception("Asynchronous read error on file " + filename, ErrorCodes::AIO_READ_ERROR);
 		}
-		if ((bytes_read % DEFAULT_AIO_FILE_BLOCK_SIZE) != 0)
-		{
-			got_exception = true;
-			throw Exception("Received unaligned number of bytes from file " + filename, ErrorCodes::AIO_UNALIGNED_SIZE_ERROR);
-		}
 		if (pos_in_file > (std::numeric_limits<off_t>::max() - bytes_read))
 		{
 			got_exception = true;
 			throw Exception("File position overflowed", ErrorCodes::LOGICAL_ERROR);
 		}
 
+		if (bytes_read > 0)
+			fill_buffer.buffer().resize(bytes_read);
+		if (static_cast<size_t>(bytes_read) < fill_buffer.internalBuffer().size())
+			is_eof = true;
+
+		working_buffer_offset = pos_in_file % DEFAULT_AIO_FILE_BLOCK_SIZE;
+		bytes_read -= working_buffer_offset;
+
 		pos_in_file += bytes_read;
 		total_bytes_read += bytes_read;
 
-		if (bytes_read > 0)
-			fill_buffer.buffer().resize(bytes_read);
-		if ((static_cast<size_t>(bytes_read) < fill_buffer.internalBuffer().size()) || (total_bytes_read == max_bytes_read))
+		if (total_bytes_read == max_bytes_read)
 			is_eof = true;
 	}
 }
diff --git a/dbms/src/IO/tests/read_buffer_aio.cpp b/dbms/src/IO/tests/read_buffer_aio.cpp
index 3dae95e9f7e..4d76569c9ad 100644
--- a/dbms/src/IO/tests/read_buffer_aio.cpp
+++ b/dbms/src/IO/tests/read_buffer_aio.cpp
@@ -14,7 +14,7 @@ namespace
 {
 
 void run();
-void prepare(std::string & directory, std::string  & filename, std::string & buf);
+void prepare(size_t s, std::string & directory, std::string  & filename, std::string & buf);
 void die(const std::string & msg);
 void run_test(unsigned int num, const std::function<bool()> func);
 
@@ -25,11 +25,12 @@ bool test4(const std::string & filename, const std::string & buf);
 bool test5(const std::string & filename);
 bool test6(const std::string & filename, const std::string & buf);
 bool test7(const std::string & filename, const std::string & buf);
-bool test8(const std::string & filename);
+bool test8(const std::string & filename, const std::string & buf);
 bool test9(const std::string & filename, const std::string & buf);
 bool test10(const std::string & filename, const std::string & buf);
 bool test11(const std::string & filename);
 bool test12(const std::string & filename, const std::string & buf);
+bool test13(const std::string & filename, const std::string & buf);
 
 void run()
 {
@@ -38,7 +39,12 @@ void run()
 	std::string directory;
 	std::string filename;
 	std::string buf;
-	prepare(directory, filename, buf);
+	prepare(10 * DEFAULT_AIO_FILE_BLOCK_SIZE, directory, filename, buf);
+
+	std::string directory2;
+	std::string filename2;
+	std::string buf2;
+	prepare(2 * DEFAULT_AIO_FILE_BLOCK_SIZE - 2, directory2, filename2, buf2);
 
 	const std::vector<std::function<bool()> > tests =
 	{
@@ -49,11 +55,12 @@ void run()
 		std::bind(test5, std::ref(filename)),
 		std::bind(test6, std::ref(filename), std::ref(buf)),
 		std::bind(test7, std::ref(filename), std::ref(buf)),
-		std::bind(test8, std::ref(filename)),
+		std::bind(test8, std::ref(filename), std::ref(buf)),
 		std::bind(test9, std::ref(filename), std::ref(buf)),
 		std::bind(test10, std::ref(filename), std::ref(buf)),
 		std::bind(test11, std::ref(filename)),
-		std::bind(test12, std::ref(filename), std::ref(buf))
+		std::bind(test12, std::ref(filename), std::ref(buf)),
+		std::bind(test13, std::ref(filename2), std::ref(buf2))
 	};
 
 	unsigned int num = 0;
@@ -64,9 +71,10 @@ void run()
 	}
 
 	fs::remove_all(directory);
+	fs::remove_all(directory2);
 }
 
-void prepare(std::string & directory, std::string  & filename, std::string & buf)
+void prepare(size_t s, std::string & directory, std::string  & filename, std::string & buf)
 {
 	static const std::string symbols = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
 
@@ -222,21 +230,22 @@ bool test7(const std::string & filename, const std::string & buf)
 	return (newbuf == buf.substr(DEFAULT_AIO_FILE_BLOCK_SIZE));
 }
 
-bool test8(const std::string & filename)
+bool test8(const std::string & filename, const std::string & buf)
 {
-	bool ok = false;
+	std::string newbuf;
+	newbuf.resize(DEFAULT_AIO_FILE_BLOCK_SIZE - 1);
 
-	try
-	{
-		DB::ReadBufferAIO in(filename, 3 * DEFAULT_AIO_FILE_BLOCK_SIZE);
-		(void) in.seek(DEFAULT_AIO_FILE_BLOCK_SIZE + 1, SEEK_CUR);
-	}
-	catch (const DB::Exception &)
-	{
-		ok = true;
-	}
+	DB::ReadBufferAIO in(filename, 3 * DEFAULT_AIO_FILE_BLOCK_SIZE);
+	(void) in.seek(DEFAULT_AIO_FILE_BLOCK_SIZE + 1, SEEK_CUR);
+	size_t count = in.read(&newbuf[0], newbuf.length());
 
-	return ok;
+	if (count != newbuf.length())
+		return false;
+
+	if (newbuf != buf.substr(DEFAULT_AIO_FILE_BLOCK_SIZE + 1, newbuf.length()))
+		return false;
+
+	return true;
 }
 
 bool test9(const std::string & filename, const std::string & buf)
@@ -328,6 +337,18 @@ bool test12(const std::string & filename, const std::string & buf)
 	return ok;
 }
 
+bool test13(const std::string & filename, const std::string & buf)
+{
+	std::string newbuf;
+	newbuf.resize(2 * DEFAULT_AIO_FILE_BLOCK_SIZE - 2);
+
+	DB::ReadBufferAIO in(filename, DEFAULT_AIO_FILE_BLOCK_SIZE);
+	size_t count1 = in.read(&newbuf[0], newbuf.length());
+	if (count1 != newbuf.size())
+		return false;
+	return true;
+}
+
 }
 
 int main()

From f4b7889f90cd4580c27aa383ee3e24df7c0de22b Mon Sep 17 00:00:00 2001
From: Alexey Arno <af-arno@yandex-team.ru>
Date: Tue, 17 Mar 2015 15:44:49 +0300
Subject: [PATCH 09/55] dbms: Server: feature development. [#METR-15090]

---
 dbms/include/DB/IO/ReadBufferAIO.h    |  2 +-
 dbms/src/IO/ReadBufferAIO.cpp         | 15 ++++---
 dbms/src/IO/tests/read_buffer_aio.cpp | 57 +++++++++++++++++++--------
 3 files changed, 50 insertions(+), 24 deletions(-)

diff --git a/dbms/include/DB/IO/ReadBufferAIO.h b/dbms/include/DB/IO/ReadBufferAIO.h
index 0630e5f1dc4..4d27a99efa8 100644
--- a/dbms/include/DB/IO/ReadBufferAIO.h
+++ b/dbms/include/DB/IO/ReadBufferAIO.h
@@ -13,7 +13,6 @@ namespace DB
 {
 
 /** Класс для асинхронного чтения данных.
-  * Все размеры и смещения должны быть кратны DEFAULT_AIO_FILE_BLOCK_SIZE байтам.
   */
 class ReadBufferAIO : public BufferWithOwnMemory<ReadBuffer>
 {
@@ -53,6 +52,7 @@ private:
 
 	size_t max_bytes_read = std::numeric_limits<size_t>::max();
 	size_t total_bytes_read = 0;
+	size_t requested_byte_count = 0;
 	off_t pos_in_file = 0;
 	int fd = -1;
 
diff --git a/dbms/src/IO/ReadBufferAIO.cpp b/dbms/src/IO/ReadBufferAIO.cpp
index ea680e4b44c..f1e22da886b 100644
--- a/dbms/src/IO/ReadBufferAIO.cpp
+++ b/dbms/src/IO/ReadBufferAIO.cpp
@@ -56,11 +56,6 @@ void ReadBufferAIO::setMaxBytes(size_t max_bytes_read_)
 		got_exception = true;
 		throw Exception("Illegal attempt to set the maximum number of bytes to read from file " + filename, ErrorCodes::LOGICAL_ERROR);
 	}
-	if ((max_bytes_read_ % DEFAULT_AIO_FILE_BLOCK_SIZE) != 0)
-	{
-		got_exception = true;
-		throw Exception("Invalid maximum number of bytes to read from file " + filename, ErrorCodes::AIO_UNALIGNED_SIZE_ERROR);
-	}
 	max_bytes_read = max_bytes_read_;
 }
 
@@ -153,7 +148,13 @@ bool ReadBufferAIO::nextImpl()
 	request.aio_lio_opcode = IOCB_CMD_PREAD;
 	request.aio_fildes = fd;
 	request.aio_buf = reinterpret_cast<UInt64>(fill_buffer.internalBuffer().begin());
-	request.aio_nbytes = std::min(fill_buffer.internalBuffer().size(), max_bytes_read);
+
+	requested_byte_count = std::min(fill_buffer.internalBuffer().size(), max_bytes_read);		
+
+	request.aio_nbytes = requested_byte_count;
+	if ((request.aio_nbytes % DEFAULT_AIO_FILE_BLOCK_SIZE) != 0)
+		request.aio_nbytes += DEFAULT_AIO_FILE_BLOCK_SIZE - (request.aio_nbytes % DEFAULT_AIO_FILE_BLOCK_SIZE);
+
 	request.aio_offset = pos_in_file - (pos_in_file % DEFAULT_AIO_FILE_BLOCK_SIZE);
 
 	/// Отправить запрос.
@@ -197,6 +198,8 @@ void ReadBufferAIO::waitForAIOCompletion()
 			throw Exception("File position overflowed", ErrorCodes::LOGICAL_ERROR);
 		}
 
+		bytes_read = std::min(bytes_read, static_cast<off_t>(requested_byte_count));
+
 		if (bytes_read > 0)
 			fill_buffer.buffer().resize(bytes_read);
 		if (static_cast<size_t>(bytes_read) < fill_buffer.internalBuffer().size())
diff --git a/dbms/src/IO/tests/read_buffer_aio.cpp b/dbms/src/IO/tests/read_buffer_aio.cpp
index 4d76569c9ad..286fd6a662f 100644
--- a/dbms/src/IO/tests/read_buffer_aio.cpp
+++ b/dbms/src/IO/tests/read_buffer_aio.cpp
@@ -22,7 +22,7 @@ bool test1(const std::string & filename);
 bool test2(const std::string & filename, const std::string & buf);
 bool test3(const std::string & filename, const std::string & buf);
 bool test4(const std::string & filename, const std::string & buf);
-bool test5(const std::string & filename);
+bool test5(const std::string & filename, const std::string & buf);
 bool test6(const std::string & filename, const std::string & buf);
 bool test7(const std::string & filename, const std::string & buf);
 bool test8(const std::string & filename, const std::string & buf);
@@ -31,6 +31,7 @@ bool test10(const std::string & filename, const std::string & buf);
 bool test11(const std::string & filename);
 bool test12(const std::string & filename, const std::string & buf);
 bool test13(const std::string & filename, const std::string & buf);
+bool test14(const std::string & filename, const std::string & buf);
 
 void run()
 {
@@ -44,7 +45,7 @@ void run()
 	std::string directory2;
 	std::string filename2;
 	std::string buf2;
-	prepare(2 * DEFAULT_AIO_FILE_BLOCK_SIZE - 2, directory2, filename2, buf2);
+	prepare(2 * DEFAULT_AIO_FILE_BLOCK_SIZE - 3, directory2, filename2, buf2);
 
 	const std::vector<std::function<bool()> > tests =
 	{
@@ -52,7 +53,7 @@ void run()
 		std::bind(test2, std::ref(filename), std::ref(buf)),
 		std::bind(test3, std::ref(filename), std::ref(buf)),
 		std::bind(test4, std::ref(filename), std::ref(buf)),
-		std::bind(test5, std::ref(filename)),
+		std::bind(test5, std::ref(filename), std::ref(buf)),
 		std::bind(test6, std::ref(filename), std::ref(buf)),
 		std::bind(test7, std::ref(filename), std::ref(buf)),
 		std::bind(test8, std::ref(filename), std::ref(buf)),
@@ -60,7 +61,8 @@ void run()
 		std::bind(test10, std::ref(filename), std::ref(buf)),
 		std::bind(test11, std::ref(filename)),
 		std::bind(test12, std::ref(filename), std::ref(buf)),
-		std::bind(test13, std::ref(filename2), std::ref(buf2))
+		std::bind(test13, std::ref(filename2), std::ref(buf2)),
+		std::bind(test14, std::ref(filename), std::ref(buf))
 	};
 
 	unsigned int num = 0;
@@ -180,20 +182,22 @@ bool test4(const std::string & filename, const std::string & buf)
 	return n_read == 0;
 }
 
-bool test5(const std::string & filename)
+bool test5(const std::string & filename, const std::string & buf)
 {
-	bool ok = false;
+	std::string newbuf;
+	newbuf.resize(1 + (DEFAULT_AIO_FILE_BLOCK_SIZE >> 1));
 
-	try
-	{
-		DB::ReadBufferAIO in(filename, 3 * DEFAULT_AIO_FILE_BLOCK_SIZE);
-		in.setMaxBytes(DEFAULT_AIO_FILE_BLOCK_SIZE >> 1);
-	}
-	catch (const DB::Exception &)
-	{
-		ok = true;
-	}
-	return ok;
+	DB::ReadBufferAIO in(filename, DEFAULT_AIO_FILE_BLOCK_SIZE);
+	in.setMaxBytes(1 + (DEFAULT_AIO_FILE_BLOCK_SIZE >> 1));
+
+	size_t count = in.read(&newbuf[0], newbuf.length());
+	if (count != newbuf.length())
+		return false;
+
+	if (newbuf != buf.substr(0, newbuf.size()))
+		return false;
+
+	return true;
 }
 
 bool test6(const std::string & filename, const std::string & buf)
@@ -340,7 +344,7 @@ bool test12(const std::string & filename, const std::string & buf)
 bool test13(const std::string & filename, const std::string & buf)
 {
 	std::string newbuf;
-	newbuf.resize(2 * DEFAULT_AIO_FILE_BLOCK_SIZE - 2);
+	newbuf.resize(2 * DEFAULT_AIO_FILE_BLOCK_SIZE - 3);
 
 	DB::ReadBufferAIO in(filename, DEFAULT_AIO_FILE_BLOCK_SIZE);
 	size_t count1 = in.read(&newbuf[0], newbuf.length());
@@ -349,6 +353,25 @@ bool test13(const std::string & filename, const std::string & buf)
 	return true;
 }
 
+bool test14(const std::string & filename, const std::string & buf)
+{
+	std::string newbuf;
+	newbuf.resize(1 + (DEFAULT_AIO_FILE_BLOCK_SIZE >> 1));
+
+	DB::ReadBufferAIO in(filename, DEFAULT_AIO_FILE_BLOCK_SIZE);
+	(void) in.seek(2, SEEK_SET);
+	in.setMaxBytes(3 + (DEFAULT_AIO_FILE_BLOCK_SIZE >> 1));
+
+	size_t count = in.read(&newbuf[0], newbuf.length());
+	if (count != newbuf.length())
+		return false;
+
+	if (newbuf != buf.substr(2, newbuf.length()))
+		return false;
+
+	return true;
+}
+
 }
 
 int main()

From 6c3bd8759d2b8aa0d1dcd10ae01cf08fc4311402 Mon Sep 17 00:00:00 2001
From: Alexey Arno <af-arno@yandex-team.ru>
Date: Tue, 17 Mar 2015 16:57:24 +0300
Subject: [PATCH 10/55] dbms: Server: feature development. [#METR-15090]

---
 dbms/include/DB/IO/BufferBase.h       |  1 +
 dbms/src/IO/ReadBufferAIO.cpp         | 23 +++++++++++++++--------
 dbms/src/IO/tests/read_buffer_aio.cpp |  4 ++--
 3 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/dbms/include/DB/IO/BufferBase.h b/dbms/include/DB/IO/BufferBase.h
index 0fd3e41e533..b8f0f1a0bfc 100644
--- a/dbms/include/DB/IO/BufferBase.h
+++ b/dbms/include/DB/IO/BufferBase.h
@@ -99,6 +99,7 @@ protected:
 	  */
 	Buffer working_buffer;
 
+	/// Количество игнорируемых байтов с начальной позиции буфера working_buffer.
 	size_t working_buffer_offset = 0;
 
 	/// Позиция чтения/записи.
diff --git a/dbms/src/IO/ReadBufferAIO.cpp b/dbms/src/IO/ReadBufferAIO.cpp
index f1e22da886b..5eb114fc0bc 100644
--- a/dbms/src/IO/ReadBufferAIO.cpp
+++ b/dbms/src/IO/ReadBufferAIO.cpp
@@ -144,18 +144,23 @@ bool ReadBufferAIO::nextImpl()
 	if (is_eof)
 		return true;
 
+	/// Количество запрашиваемых байтов.
+	requested_byte_count = std::min(fill_buffer.internalBuffer().size(), max_bytes_read);
+
+	/// Для запроса выравниваем количество запрашиваемых байтов на границе следующего блока.
+	size_t effective_byte_count = requested_byte_count;
+	if ((effective_byte_count % DEFAULT_AIO_FILE_BLOCK_SIZE) != 0)
+		effective_byte_count += DEFAULT_AIO_FILE_BLOCK_SIZE - (effective_byte_count % DEFAULT_AIO_FILE_BLOCK_SIZE);
+
+	/// Также выравниваем позицию в файле на границе предыдущего блока.
+	off_t effective_pos_in_file = pos_in_file - (pos_in_file % DEFAULT_AIO_FILE_BLOCK_SIZE);
+
 	/// Создать запрос.
 	request.aio_lio_opcode = IOCB_CMD_PREAD;
 	request.aio_fildes = fd;
 	request.aio_buf = reinterpret_cast<UInt64>(fill_buffer.internalBuffer().begin());
-
-	requested_byte_count = std::min(fill_buffer.internalBuffer().size(), max_bytes_read);		
-
-	request.aio_nbytes = requested_byte_count;
-	if ((request.aio_nbytes % DEFAULT_AIO_FILE_BLOCK_SIZE) != 0)
-		request.aio_nbytes += DEFAULT_AIO_FILE_BLOCK_SIZE - (request.aio_nbytes % DEFAULT_AIO_FILE_BLOCK_SIZE);
-
-	request.aio_offset = pos_in_file - (pos_in_file % DEFAULT_AIO_FILE_BLOCK_SIZE);
+	request.aio_nbytes = effective_byte_count;
+	request.aio_offset = effective_pos_in_file;
 
 	/// Отправить запрос.
 	while (io_submit(aio_context.ctx, request_ptrs.size(), &request_ptrs[0]) < 0)
@@ -198,6 +203,7 @@ void ReadBufferAIO::waitForAIOCompletion()
 			throw Exception("File position overflowed", ErrorCodes::LOGICAL_ERROR);
 		}
 
+		/// Игнорируем излишние байты справа.
 		bytes_read = std::min(bytes_read, static_cast<off_t>(requested_byte_count));
 
 		if (bytes_read > 0)
@@ -205,6 +211,7 @@ void ReadBufferAIO::waitForAIOCompletion()
 		if (static_cast<size_t>(bytes_read) < fill_buffer.internalBuffer().size())
 			is_eof = true;
 
+		/// Игнорируем излишние байты слева.
 		working_buffer_offset = pos_in_file % DEFAULT_AIO_FILE_BLOCK_SIZE;
 		bytes_read -= working_buffer_offset;
 
diff --git a/dbms/src/IO/tests/read_buffer_aio.cpp b/dbms/src/IO/tests/read_buffer_aio.cpp
index 286fd6a662f..faf8c60947a 100644
--- a/dbms/src/IO/tests/read_buffer_aio.cpp
+++ b/dbms/src/IO/tests/read_buffer_aio.cpp
@@ -194,7 +194,7 @@ bool test5(const std::string & filename, const std::string & buf)
 	if (count != newbuf.length())
 		return false;
 
-	if (newbuf != buf.substr(0, newbuf.size()))
+	if (newbuf != buf.substr(0, newbuf.length()))
 		return false;
 
 	return true;
@@ -348,7 +348,7 @@ bool test13(const std::string & filename, const std::string & buf)
 
 	DB::ReadBufferAIO in(filename, DEFAULT_AIO_FILE_BLOCK_SIZE);
 	size_t count1 = in.read(&newbuf[0], newbuf.length());
-	if (count1 != newbuf.size())
+	if (count1 != newbuf.length())
 		return false;
 	return true;
 }

From 2ec5cd0506e15cdbd5d901dcc440099c1c8763c0 Mon Sep 17 00:00:00 2001
From: Alexey Arno <af-arno@yandex-team.ru>
Date: Thu, 19 Mar 2015 13:06:11 +0300
Subject: [PATCH 11/55] dbms: Server: feature development. [#METR-15090]

---
 dbms/include/DB/IO/WriteBufferAIO.h |   7 ++
 dbms/src/IO/WriteBufferAIO.cpp      | 143 ++++++++++++++++++++++++----
 2 files changed, 134 insertions(+), 16 deletions(-)

diff --git a/dbms/include/DB/IO/WriteBufferAIO.h b/dbms/include/DB/IO/WriteBufferAIO.h
index 274de538124..7fb66f96255 100644
--- a/dbms/include/DB/IO/WriteBufferAIO.h
+++ b/dbms/include/DB/IO/WriteBufferAIO.h
@@ -2,11 +2,13 @@
 
 #include <DB/IO/WriteBuffer.h>
 #include <DB/IO/BufferWithOwnMemory.h>
+#include <DB/Core/Defines.h>
 #include <statdaemons/AIO.h>
 
 #include <string>
 #include <unistd.h>
 #include <fcntl.h>
+#include <sys/uio.h>
 
 namespace DB
 {
@@ -51,6 +53,11 @@ private:
 
 	AIOContext aio_context{1};
 
+	iovec iov[3];
+
+	Memory left_page{DEFAULT_AIO_FILE_BLOCK_SIZE, DEFAULT_AIO_FILE_BLOCK_SIZE};
+	Memory right_page{DEFAULT_AIO_FILE_BLOCK_SIZE, DEFAULT_AIO_FILE_BLOCK_SIZE};
+
 	const std::string filename;
 
 	off_t pos_in_file = 0;
diff --git a/dbms/src/IO/WriteBufferAIO.cpp b/dbms/src/IO/WriteBufferAIO.cpp
index 4c086b1ba56..7b5ba09e871 100644
--- a/dbms/src/IO/WriteBufferAIO.cpp
+++ b/dbms/src/IO/WriteBufferAIO.cpp
@@ -1,7 +1,6 @@
 #include <DB/IO/WriteBufferAIO.h>
 #include <DB/Common/ProfileEvents.h>
 #include <DB/Core/ErrorCodes.h>
-#include <DB/Core/Defines.h>
 
 #include <limits>
 #include <sys/types.h>
@@ -52,9 +51,6 @@ WriteBufferAIO::~WriteBufferAIO()
 
 off_t WriteBufferAIO::seek(off_t off, int whence)
 {
-	if ((off % DEFAULT_AIO_FILE_BLOCK_SIZE) != 0)
-		throw Exception("Invalid offset for WriteBufferAIO::seek", ErrorCodes::AIO_UNALIGNED_SIZE_ERROR);
-
 	flush();
 
 	if (whence == SEEK_SET)
@@ -99,9 +95,6 @@ off_t WriteBufferAIO::getPositionInFile()
 
 void WriteBufferAIO::truncate(off_t length)
 {
-	if ((length % DEFAULT_AIO_FILE_BLOCK_SIZE) != 0)
-		throw Exception("Invalid length for WriteBufferAIO::ftruncate", ErrorCodes::AIO_UNALIGNED_SIZE_ERROR);
-
 	flush();
 
 	int res = ::ftruncate(fd, length);
@@ -139,18 +132,136 @@ void WriteBufferAIO::nextImpl()
 	waitForAIOCompletion();
 	swapBuffers();
 
-	/// Создать запрос.
-	request.aio_lio_opcode = IOCB_CMD_PWRITE;
-	request.aio_fildes = fd;
-	request.aio_buf = reinterpret_cast<UInt64>(flush_buffer.buffer().begin());
-	request.aio_nbytes = flush_buffer.offset();
-	request.aio_offset = pos_in_file;
+	/// Input parameters: fd, pos_in_file, flush_buffer
 
-	if ((request.aio_nbytes % DEFAULT_AIO_FILE_BLOCK_SIZE) != 0)
+	/*
+			region_aligned_begin     region_begin                             region_end      region_aligned_end
+			|                           |                                          |                           |
+			|     +---------------------+                                          +----------------------+    |
+			|     |                                                                                       |    |
+			+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+
+			|XXXXX*  :        :        :        :        :        :        :        :        :        :   *XXXX|
+	+-------|XXXXX*  :        :        :        :        :        :        :        :        :        :   *XXXX|-------+
+	|		|XXXXX*  :        :        :        :        :        :        :        :        :        :   *XXXX|       |
+	|		|XXXXX*  :        :        :        :        :        :        :        :        :        :   *XXXX|       |
+	|		+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+       |
+(1)	|           |                                       ^                                                 |            |(1)
+read|			+---- left padded disk page             |                      right padded disk page ----+            |read
+    |                                                   |                                                              |
+	|		+--------+ (left padded page)               |                        (right padded page)  +--------+       |
+	|		|XXXXX*YY|                                  |                                             |ZZZ*XXXX|       |
+	|		|XXXXX*YY|--------------------------------->+<--------------------------------------------|ZZZ*XXXX|       |
+	+------>|XXXXX*YY|<--+                              | (3) scattered write           +------------>|ZZZ*XXXX|<------+
+			|XXXXX*YY|   |                              |                               |             |ZZZ*XXXX|
+			+--------+   |(2)copy                       |                               |(2)copy      +--------+
+                         |                              |                               |
+	+--------------------+                              |                               +--------------------+
+	|                                                   |                                                    |
+	|		buffer_begin     aligned_buffer_begin.......+...........aligned_buffer_end     buffer_end        |
+	|		|                        |                                   |                          |        |
+	|		|  +---------------------+                                   +----------------------+   |        |
+	|		|  |                                                                                |   |        |
+	|		---+--------+--------+--------+--------+--------+--------+--------+--------+--------+----        |
+	|		*YY:        :        :        :        :        :        :        :        :        :ZZZ*        |
+	|		*YY:        :        :        :        :        :        :        :        :        :ZZZ*        |
+	+-------*YY:        :        :        :        :        :        :        :        :        :ZZZ*--------+
+			*YY:        :        :        :        :        :        :        :        :        :ZZZ*
+			---+--------+--------+--------+--------+--------+--------+--------+--------+--------+----
+
+	 */
+
+	//
+	// 1. Determine the enclosing page-aligned disk region.
+	//
+
+	/// Disk region we want to write to.
+	size_t region_begin = pos_in_file;
+	size_t region_end = pos_in_file + flush_buffer.offset();
+
+	/// Page-aligned disk region.
+	size_t region_aligned_begin = region_begin - (region_begin % DEFAULT_AIO_FILE_BLOCK_SIZE);
+	size_t region_aligned_end = region_end;
+	if ((region_aligned_end % DEFAULT_AIO_FILE_BLOCK_SIZE) != 0)
+		region_aligned_end += DEFAULT_AIO_FILE_BLOCK_SIZE - (region_aligned_end % DEFAULT_AIO_FILE_BLOCK_SIZE);
+
+	bool has_left_padding = (region_aligned_begin != region_begin);
+	bool has_right_padding = (region_aligned_end != region_end);
+
+	//
+	// 2. Read needed data from disk into padded pages.
+	//
+
+	if (has_left_padding)
 	{
-		got_exception = true;
-		throw Exception("Illegal attempt to write unaligned data to file " + filename, ErrorCodes::AIO_UNALIGNED_SIZE_ERROR);
+		/// Left-side padding disk region.
+		ssize_t read_count = ::pread(fd, &left_page[0], DEFAULT_AIO_FILE_BLOCK_SIZE, region_aligned_begin);
+		if (read_count < 0)
+			throw Exception("Read error");
 	}
+	if (has_right_padding)
+	{
+		/// Right-side padding disk region.
+		ssize_t read_count = ::pread(fd, &right_page[0], DEFAULT_AIO_FILE_BLOCK_SIZE, (region_aligned_end - DEFAULT_AIO_FILE_BLOCK_SIZE));
+		if (read_count < 0)
+			throw Exception("Read error");
+	}
+
+	//
+	// 3. Copy padding data (2 user-space copies) from the buffer into the padded pages.
+	//
+
+	/// Buffer we want to write to disk.
+	Position buffer_begin = flush_buffer.buffer().begin();
+	Position buffer_end = buffer_begin + flush_buffer.offset();
+
+	/// Subset of the buffer that is page-aligned.
+	Position aligned_buffer_begin = buffer_begin;
+	Position aligned_buffer_end = buffer_end;
+
+	if (has_left_padding)
+	{
+		size_t left_page_unmodified_size = region_begin - region_aligned_begin;
+		size_t left_page_modified_size = DEFAULT_AIO_FILE_BLOCK_SIZE - left_page_unmodified_size;
+		aligned_buffer_begin += left_page_modified_size;
+		::memcpy(&left_page[0] + left_page_unmodified_size, buffer_begin, left_page_modified_size);
+	}
+	if (has_right_padding)
+	{
+		size_t right_page_begin = region_aligned_end - DEFAULT_AIO_FILE_BLOCK_SIZE;
+		size_t right_page_modified_size = region_end - right_page_begin;
+		aligned_buffer_end -= right_page_modified_size;
+		::memcpy(&right_page[0], (buffer_end - right_page_modified_size), right_page_modified_size);
+	}
+
+	//
+	// 4. Create requests.
+	//
+
+	size_t i = 0;
+
+	if (has_left_padding)
+	{
+		iov[i].iov_base = &left_page[0];
+		iov[i].iov_len = DEFAULT_AIO_FILE_BLOCK_SIZE;
+		++i;
+	}
+
+	iov[i].iov_base = aligned_buffer_begin;
+	iov[i].iov_len = aligned_buffer_end - aligned_buffer_begin;
+	++i;
+
+	if (has_right_padding)
+	{
+		iov[i].iov_base = &right_page[0];
+		iov[i].iov_len = DEFAULT_AIO_FILE_BLOCK_SIZE;
+	}
+
+	/// Send requests (1 syscall).
+	request.aio_lio_opcode = IOCB_CMD_PWRITEV;
+	request.aio_fildes = fd;
+	request.aio_buf = reinterpret_cast<UInt64>(iov);
+	request.aio_nbytes = i + 1;
+	request.aio_offset = region_aligned_begin;
 
 	/// Отправить запрос.
 	while (io_submit(aio_context.ctx, request_ptrs.size(), &request_ptrs[0]) < 0)

From b88431e1ad449a8046579e0c97545f4dc74165b4 Mon Sep 17 00:00:00 2001
From: Alexey Arno <af-arno@yandex-team.ru>
Date: Fri, 20 Mar 2015 19:58:57 +0300
Subject: [PATCH 12/55] dbms: Server: feature development. [#METR-15090]

---
 dbms/include/DB/Core/ErrorCodes.h   |  1 -
 dbms/include/DB/IO/WriteBufferAIO.h |  2 ++
 dbms/src/IO/WriteBufferAIO.cpp      | 23 ++++++++++++++++++++---
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/dbms/include/DB/Core/ErrorCodes.h b/dbms/include/DB/Core/ErrorCodes.h
index 61bb4c93382..e294bda746e 100644
--- a/dbms/include/DB/Core/ErrorCodes.h
+++ b/dbms/include/DB/Core/ErrorCodes.h
@@ -283,7 +283,6 @@ namespace ErrorCodes
 		AIO_COMPLETION_ERROR,
 		AIO_READ_ERROR,
 		AIO_WRITE_ERROR,
-		AIO_UNALIGNED_SIZE_ERROR,
 
 		POCO_EXCEPTION = 1000,
 		STD_EXCEPTION,
diff --git a/dbms/include/DB/IO/WriteBufferAIO.h b/dbms/include/DB/IO/WriteBufferAIO.h
index 7fb66f96255..26dffd26e80 100644
--- a/dbms/include/DB/IO/WriteBufferAIO.h
+++ b/dbms/include/DB/IO/WriteBufferAIO.h
@@ -60,6 +60,8 @@ private:
 
 	const std::string filename;
 
+	off_t truncate_count = 0;
+
 	off_t pos_in_file = 0;
 	int fd = -1;
 
diff --git a/dbms/src/IO/WriteBufferAIO.cpp b/dbms/src/IO/WriteBufferAIO.cpp
index 7b5ba09e871..94398899587 100644
--- a/dbms/src/IO/WriteBufferAIO.cpp
+++ b/dbms/src/IO/WriteBufferAIO.cpp
@@ -17,7 +17,8 @@ WriteBufferAIO::WriteBufferAIO(const std::string & filename_, size_t buffer_size
 {
 	ProfileEvents::increment(ProfileEvents::FileOpen);
 
-	int open_flags = (flags_ == -1) ? (O_WRONLY | O_TRUNC | O_CREAT) : flags_;
+	/// About O_RDWR: yep, we really mean it.
+	int open_flags = (flags_ == -1) ? (O_RDWR | O_TRUNC | O_CREAT) : flags_;
 	open_flags |= O_DIRECT;
 
 	fd = ::open(filename.c_str(), open_flags, mode_);
@@ -132,6 +133,8 @@ void WriteBufferAIO::nextImpl()
 	waitForAIOCompletion();
 	swapBuffers();
 
+	truncate_count = 0;
+
 	/// Input parameters: fd, pos_in_file, flush_buffer
 
 	/*
@@ -196,14 +199,15 @@ read|			+---- left padded disk page             |                      right pad
 		/// Left-side padding disk region.
 		ssize_t read_count = ::pread(fd, &left_page[0], DEFAULT_AIO_FILE_BLOCK_SIZE, region_aligned_begin);
 		if (read_count < 0)
-			throw Exception("Read error");
+			throw Exception("Read error", ErrorCodes::AIO_READ_ERROR);
 	}
 	if (has_right_padding)
 	{
 		/// Right-side padding disk region.
 		ssize_t read_count = ::pread(fd, &right_page[0], DEFAULT_AIO_FILE_BLOCK_SIZE, (region_aligned_end - DEFAULT_AIO_FILE_BLOCK_SIZE));
 		if (read_count < 0)
-			throw Exception("Read error");
+			throw Exception("Read error", ErrorCodes::AIO_WRITE_ERROR);
+		truncate_count = DEFAULT_AIO_FILE_BLOCK_SIZE - read_count;
 	}
 
 	//
@@ -303,6 +307,19 @@ void WriteBufferAIO::waitForAIOCompletion()
 			throw Exception("File position overflowed", ErrorCodes::LOGICAL_ERROR);
 		}
 
+		bytes_written -= truncate_count;
+
+		// Delete the trailing zeroes that were added for alignment purposes.
+		if (truncate_count > 0)
+		{
+			int res = ::ftruncate(fd, truncate_count);
+			if (res == -1)
+			{
+				got_exception = true;
+				throwFromErrno("Cannot truncate file " + filename, ErrorCodes::CANNOT_TRUNCATE_FILE);
+			}
+		}
+
 		pos_in_file += bytes_written;
 	}
 }

From f97565add9b6b4df84003982da9b7830f1ded6a8 Mon Sep 17 00:00:00 2001
From: Alexey Arno <af-arno@yandex-team.ru>
Date: Mon, 23 Mar 2015 12:43:06 +0300
Subject: [PATCH 13/55] dbms: Server: feature development. [#METR-15090]

---
 dbms/include/DB/IO/WriteBufferAIO.h |  1 +
 dbms/src/IO/WriteBufferAIO.cpp      | 38 ++++++++++++++++++++---------
 2 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/dbms/include/DB/IO/WriteBufferAIO.h b/dbms/include/DB/IO/WriteBufferAIO.h
index 26dffd26e80..caa6791e1e2 100644
--- a/dbms/include/DB/IO/WriteBufferAIO.h
+++ b/dbms/include/DB/IO/WriteBufferAIO.h
@@ -60,6 +60,7 @@ private:
 
 	const std::string filename;
 
+	off_t bytes_to_write = 0;
 	off_t truncate_count = 0;
 
 	off_t pos_in_file = 0;
diff --git a/dbms/src/IO/WriteBufferAIO.cpp b/dbms/src/IO/WriteBufferAIO.cpp
index 94398899587..7667e902876 100644
--- a/dbms/src/IO/WriteBufferAIO.cpp
+++ b/dbms/src/IO/WriteBufferAIO.cpp
@@ -187,8 +187,8 @@ read|			+---- left padded disk page             |                      right pad
 	if ((region_aligned_end % DEFAULT_AIO_FILE_BLOCK_SIZE) != 0)
 		region_aligned_end += DEFAULT_AIO_FILE_BLOCK_SIZE - (region_aligned_end % DEFAULT_AIO_FILE_BLOCK_SIZE);
 
-	bool has_left_padding = (region_aligned_begin != region_begin);
-	bool has_right_padding = (region_aligned_end != region_end);
+	bool has_left_padding = (region_aligned_begin < region_begin);
+	bool has_right_padding = (region_aligned_end > region_end);
 
 	//
 	// 2. Read needed data from disk into padded pages.
@@ -197,6 +197,7 @@ read|			+---- left padded disk page             |                      right pad
 	if (has_left_padding)
 	{
 		/// Left-side padding disk region.
+		::memset(&left_page[0], 0, left_page.size());
 		ssize_t read_count = ::pread(fd, &left_page[0], DEFAULT_AIO_FILE_BLOCK_SIZE, region_aligned_begin);
 		if (read_count < 0)
 			throw Exception("Read error", ErrorCodes::AIO_READ_ERROR);
@@ -204,6 +205,7 @@ read|			+---- left padded disk page             |                      right pad
 	if (has_right_padding)
 	{
 		/// Right-side padding disk region.
+		::memset(&right_page[0], 0, right_page.size());
 		ssize_t read_count = ::pread(fd, &right_page[0], DEFAULT_AIO_FILE_BLOCK_SIZE, (region_aligned_end - DEFAULT_AIO_FILE_BLOCK_SIZE));
 		if (read_count < 0)
 			throw Exception("Read error", ErrorCodes::AIO_WRITE_ERROR);
@@ -258,13 +260,26 @@ read|			+---- left padded disk page             |                      right pad
 	{
 		iov[i].iov_base = &right_page[0];
 		iov[i].iov_len = DEFAULT_AIO_FILE_BLOCK_SIZE;
+		++i;
+	}
+
+	bytes_to_write = 0;
+	for (size_t j = 0; j < i; ++j)
+	{
+		if ((iov[i].iov_len > std::numeric_limits<off_t>::max()) ||
+			(static_cast<off_t>(iov[i].iov_len) > (std::numeric_limits<off_t>::max() - bytes_to_write)))
+		{
+			got_exception = true;
+			throw Exception("Overflow on bytes to write", ErrorCodes::LOGICAL_ERROR);
+		}
+		bytes_to_write += iov[i].iov_len;
 	}
 
 	/// Send requests (1 syscall).
 	request.aio_lio_opcode = IOCB_CMD_PWRITEV;
 	request.aio_fildes = fd;
 	request.aio_buf = reinterpret_cast<UInt64>(iov);
-	request.aio_nbytes = i + 1;
+	request.aio_nbytes = i;
 	request.aio_offset = region_aligned_begin;
 
 	/// Отправить запрос.
@@ -296,22 +311,17 @@ void WriteBufferAIO::waitForAIOCompletion()
 		is_pending_write = false;
 		off_t bytes_written = events[0].res;
 
-		if ((bytes_written < 0) || (static_cast<size_t>(bytes_written) < flush_buffer.offset()))
+		if (bytes_written < bytes_to_write)
 		{
 			got_exception = true;
 			throw Exception("Asynchronous write error on file " + filename, ErrorCodes::AIO_WRITE_ERROR);
 		}
-		if (pos_in_file > (std::numeric_limits<off_t>::max() - bytes_written))
-		{
-			got_exception = true;
-			throw Exception("File position overflowed", ErrorCodes::LOGICAL_ERROR);
-		}
-
-		bytes_written -= truncate_count;
 
 		// Delete the trailing zeroes that were added for alignment purposes.
 		if (truncate_count > 0)
 		{
+			bytes_written -= truncate_count;
+
 			int res = ::ftruncate(fd, truncate_count);
 			if (res == -1)
 			{
@@ -320,6 +330,12 @@ void WriteBufferAIO::waitForAIOCompletion()
 			}
 		}
 
+		if (pos_in_file > (std::numeric_limits<off_t>::max() - bytes_written))
+		{
+			got_exception = true;
+			throw Exception("File position overflowed", ErrorCodes::LOGICAL_ERROR);
+		}
+
 		pos_in_file += bytes_written;
 	}
 }

From 1ef29358d64c7d03c94057ce9072b7b15ed8d970 Mon Sep 17 00:00:00 2001
From: Alexey Arno <af-arno@yandex-team.ru>
Date: Mon, 23 Mar 2015 12:52:13 +0300
Subject: [PATCH 14/55] dbms: Server: feature development. [#METR-15090]

---
 dbms/src/IO/WriteBufferAIO.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dbms/src/IO/WriteBufferAIO.cpp b/dbms/src/IO/WriteBufferAIO.cpp
index 7667e902876..f1c052ce653 100644
--- a/dbms/src/IO/WriteBufferAIO.cpp
+++ b/dbms/src/IO/WriteBufferAIO.cpp
@@ -266,13 +266,13 @@ read|			+---- left padded disk page             |                      right pad
 	bytes_to_write = 0;
 	for (size_t j = 0; j < i; ++j)
 	{
-		if ((iov[i].iov_len > std::numeric_limits<off_t>::max()) ||
-			(static_cast<off_t>(iov[i].iov_len) > (std::numeric_limits<off_t>::max() - bytes_to_write)))
+		if ((iov[j].iov_len > std::numeric_limits<off_t>::max()) ||
+			(static_cast<off_t>(iov[j].iov_len) > (std::numeric_limits<off_t>::max() - bytes_to_write)))
 		{
 			got_exception = true;
 			throw Exception("Overflow on bytes to write", ErrorCodes::LOGICAL_ERROR);
 		}
-		bytes_to_write += iov[i].iov_len;
+		bytes_to_write += iov[j].iov_len;
 	}
 
 	/// Send requests (1 syscall).

From 06a4b8c5f8e19a138e2b5836b14334128b82d1a9 Mon Sep 17 00:00:00 2001
From: Alexey Arno <af-arno@yandex-team.ru>
Date: Tue, 24 Mar 2015 14:03:26 +0300
Subject: [PATCH 15/55] dbms: Server: feature development. [#METR-15090]

---
 dbms/include/DB/IO/WriteBufferAIO.h    |   5 +-
 dbms/src/IO/WriteBufferAIO.cpp         | 180 ++++++++++++-------------
 dbms/src/IO/tests/write_buffer_aio.cpp |  50 ++++++-
 3 files changed, 142 insertions(+), 93 deletions(-)

diff --git a/dbms/include/DB/IO/WriteBufferAIO.h b/dbms/include/DB/IO/WriteBufferAIO.h
index caa6791e1e2..1a31772daa3 100644
--- a/dbms/include/DB/IO/WriteBufferAIO.h
+++ b/dbms/include/DB/IO/WriteBufferAIO.h
@@ -55,8 +55,7 @@ private:
 
 	iovec iov[3];
 
-	Memory left_page{DEFAULT_AIO_FILE_BLOCK_SIZE, DEFAULT_AIO_FILE_BLOCK_SIZE};
-	Memory right_page{DEFAULT_AIO_FILE_BLOCK_SIZE, DEFAULT_AIO_FILE_BLOCK_SIZE};
+	Memory memory_page{DEFAULT_AIO_FILE_BLOCK_SIZE, DEFAULT_AIO_FILE_BLOCK_SIZE};
 
 	const std::string filename;
 
@@ -64,7 +63,9 @@ private:
 	off_t truncate_count = 0;
 
 	off_t pos_in_file = 0;
+	off_t max_pos = 0;
 	int fd = -1;
+	int fd2 = -1;
 
 	/// Асинхронная операция записи ещё не завершилась.
 	bool is_pending_write = false;
diff --git a/dbms/src/IO/WriteBufferAIO.cpp b/dbms/src/IO/WriteBufferAIO.cpp
index f1c052ce653..a925d2a6f67 100644
--- a/dbms/src/IO/WriteBufferAIO.cpp
+++ b/dbms/src/IO/WriteBufferAIO.cpp
@@ -17,8 +17,7 @@ WriteBufferAIO::WriteBufferAIO(const std::string & filename_, size_t buffer_size
 {
 	ProfileEvents::increment(ProfileEvents::FileOpen);
 
-	/// About O_RDWR: yep, we really mean it.
-	int open_flags = (flags_ == -1) ? (O_RDWR | O_TRUNC | O_CREAT) : flags_;
+	int open_flags = (flags_ == -1) ? (O_WRONLY | O_TRUNC | O_CREAT) : flags_;
 	open_flags |= O_DIRECT;
 
 	fd = ::open(filename.c_str(), open_flags, mode_);
@@ -29,6 +28,14 @@ WriteBufferAIO::WriteBufferAIO(const std::string & filename_, size_t buffer_size
 		throwFromErrno("Cannot open file " + filename, error_code);
 	}
 
+	fd2 = ::open(filename.c_str(), O_RDONLY, mode_);
+	if (fd2 == -1)
+	{
+		got_exception = true;
+		auto error_code = (errno == ENOENT) ? ErrorCodes::FILE_DOESNT_EXIST : ErrorCodes::CANNOT_OPEN_FILE;
+		throwFromErrno("Cannot open file " + filename, error_code);
+	}
+
 	::memset(&request, 0, sizeof(request));
 }
 
@@ -135,8 +142,6 @@ void WriteBufferAIO::nextImpl()
 
 	truncate_count = 0;
 
-	/// Input parameters: fd, pos_in_file, flush_buffer
-
 	/*
 			region_aligned_begin     region_begin                             region_end      region_aligned_end
 			|                           |                                          |                           |
@@ -149,7 +154,7 @@ void WriteBufferAIO::nextImpl()
 	|		|XXXXX*  :        :        :        :        :        :        :        :        :        :   *XXXX|       |
 	|		+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+       |
 (1)	|           |                                       ^                                                 |            |(1)
-read|			+---- left padded disk page             |                      right padded disk page ----+            |read
+read|			+---- region left padding               |                      region right padding   ----+            |read
     |                                                   |                                                              |
 	|		+--------+ (left padded page)               |                        (right padded page)  +--------+       |
 	|		|XXXXX*YY|                                  |                                             |ZZZ*XXXX|       |
@@ -173,92 +178,85 @@ read|			+---- left padded disk page             |                      right pad
 
 	 */
 
-	//
-	// 1. Determine the enclosing page-aligned disk region.
-	//
+	/// Регион диска, на который хотим записать данные.
+	off_t region_begin = pos_in_file;
+	off_t region_end = pos_in_file + flush_buffer.offset();
+	size_t region_size = region_end - region_begin;
 
-	/// Disk region we want to write to.
-	size_t region_begin = pos_in_file;
-	size_t region_end = pos_in_file + flush_buffer.offset();
+	/// Регион диска, на который действительно записываем данные.
+	size_t region_left_padding = region_begin % DEFAULT_AIO_FILE_BLOCK_SIZE;
+	size_t region_right_padding = 0;
+	if (region_end % DEFAULT_AIO_FILE_BLOCK_SIZE != 0)
+		region_right_padding = DEFAULT_AIO_FILE_BLOCK_SIZE - (region_end % DEFAULT_AIO_FILE_BLOCK_SIZE);
 
-	/// Page-aligned disk region.
-	size_t region_aligned_begin = region_begin - (region_begin % DEFAULT_AIO_FILE_BLOCK_SIZE);
-	size_t region_aligned_end = region_end;
-	if ((region_aligned_end % DEFAULT_AIO_FILE_BLOCK_SIZE) != 0)
-		region_aligned_end += DEFAULT_AIO_FILE_BLOCK_SIZE - (region_aligned_end % DEFAULT_AIO_FILE_BLOCK_SIZE);
+	off_t region_aligned_begin = region_begin - region_left_padding;
+	off_t region_aligned_end = region_end + region_right_padding;
+	size_t region_aligned_size = region_aligned_end - region_aligned_begin;
 
-	bool has_left_padding = (region_aligned_begin < region_begin);
-	bool has_right_padding = (region_aligned_end > region_end);
-
-	//
-	// 2. Read needed data from disk into padded pages.
-	//
-
-	if (has_left_padding)
-	{
-		/// Left-side padding disk region.
-		::memset(&left_page[0], 0, left_page.size());
-		ssize_t read_count = ::pread(fd, &left_page[0], DEFAULT_AIO_FILE_BLOCK_SIZE, region_aligned_begin);
-		if (read_count < 0)
-			throw Exception("Read error", ErrorCodes::AIO_READ_ERROR);
-	}
-	if (has_right_padding)
-	{
-		/// Right-side padding disk region.
-		::memset(&right_page[0], 0, right_page.size());
-		ssize_t read_count = ::pread(fd, &right_page[0], DEFAULT_AIO_FILE_BLOCK_SIZE, (region_aligned_end - DEFAULT_AIO_FILE_BLOCK_SIZE));
-		if (read_count < 0)
-			throw Exception("Read error", ErrorCodes::AIO_WRITE_ERROR);
-		truncate_count = DEFAULT_AIO_FILE_BLOCK_SIZE - read_count;
-	}
-
-	//
-	// 3. Copy padding data (2 user-space copies) from the buffer into the padded pages.
-	//
-
-	/// Buffer we want to write to disk.
+	/// Буфер данных, которые хотим записать на диск.
 	Position buffer_begin = flush_buffer.buffer().begin();
-	Position buffer_end = buffer_begin + flush_buffer.offset();
+	Position buffer_end = buffer_begin + region_size;
+	size_t buffer_size = buffer_end - buffer_begin;
+	size_t buffer_capacity = flush_buffer.buffer().size();
 
-	/// Subset of the buffer that is page-aligned.
-	Position aligned_buffer_begin = buffer_begin;
-	Position aligned_buffer_end = buffer_end;
+	/// Обработать буфер, чтобы он оторажал структуру региона диска.
 
-	if (has_left_padding)
+	// Process the left side.
+	bool has_excess_buffer = false;
+	if (region_left_padding > 0)
 	{
-		size_t left_page_unmodified_size = region_begin - region_aligned_begin;
-		size_t left_page_modified_size = DEFAULT_AIO_FILE_BLOCK_SIZE - left_page_unmodified_size;
-		aligned_buffer_begin += left_page_modified_size;
-		::memcpy(&left_page[0] + left_page_unmodified_size, buffer_begin, left_page_modified_size);
-	}
-	if (has_right_padding)
-	{
-		size_t right_page_begin = region_aligned_end - DEFAULT_AIO_FILE_BLOCK_SIZE;
-		size_t right_page_modified_size = region_end - right_page_begin;
-		aligned_buffer_end -= right_page_modified_size;
-		::memcpy(&right_page[0], (buffer_end - right_page_modified_size), right_page_modified_size);
+		if ((region_left_padding + buffer_size) > buffer_capacity)
+		{
+			has_excess_buffer = true;
+			::memset(&memory_page[0], 0, memory_page.size());
+			::memcpy(&memory_page[0], buffer_end - region_left_padding, region_left_padding);
+			buffer_end = buffer_begin + buffer_capacity;
+			buffer_size = buffer_capacity;
+		}
+		else
+		{
+			buffer_size += region_left_padding;
+			buffer_end = buffer_begin + buffer_size;
+		}
+
+		::memmove(buffer_begin + region_left_padding, buffer_begin, buffer_size - region_left_padding);
+		::memset(buffer_begin, 0, region_left_padding);
+
+		ssize_t read_count = ::pread(fd2, buffer_begin, region_left_padding, region_aligned_begin);
+		if (read_count < 0)
+		{
+			got_exception = true;
+			throw Exception("Read error", ErrorCodes::AIO_READ_ERROR);
+		}
 	}
 
-	//
-	// 4. Create requests.
-	//
+	Position end_ptr;
+	if (has_excess_buffer)
+		end_ptr = &memory_page[region_left_padding];
+	else
+		end_ptr = buffer_end;
 
-	size_t i = 0;
-
-	if (has_left_padding)
+	// Process the right side.
+	if (region_right_padding > 0)
 	{
-		iov[i].iov_base = &left_page[0];
-		iov[i].iov_len = DEFAULT_AIO_FILE_BLOCK_SIZE;
-		++i;
+		::memset(end_ptr, 0, region_right_padding);
+		ssize_t read_count = ::pread(fd2, end_ptr, region_right_padding, region_end);
+		if (read_count < 0)
+			read_count = 0;
+		truncate_count = DEFAULT_AIO_FILE_BLOCK_SIZE - (region_left_padding + read_count);
 	}
 
-	iov[i].iov_base = aligned_buffer_begin;
-	iov[i].iov_len = aligned_buffer_end - aligned_buffer_begin;
+	/// Создать запрос на асинхронную запись.
+
+	size_t i =  0;
+
+	iov[i].iov_base = buffer_begin;
+	iov[i].iov_len = (has_excess_buffer ? buffer_capacity : region_aligned_size);
 	++i;
 
-	if (has_right_padding)
+	if (has_excess_buffer)
 	{
-		iov[i].iov_base = &right_page[0];
+		iov[i].iov_base = &memory_page[0];
 		iov[i].iov_len = DEFAULT_AIO_FILE_BLOCK_SIZE;
 		++i;
 	}
@@ -275,7 +273,6 @@ read|			+---- left padded disk page             |                      right pad
 		bytes_to_write += iov[j].iov_len;
 	}
 
-	/// Send requests (1 syscall).
 	request.aio_lio_opcode = IOCB_CMD_PWRITEV;
 	request.aio_fildes = fd;
 	request.aio_buf = reinterpret_cast<UInt64>(iov);
@@ -317,26 +314,29 @@ void WriteBufferAIO::waitForAIOCompletion()
 			throw Exception("Asynchronous write error on file " + filename, ErrorCodes::AIO_WRITE_ERROR);
 		}
 
-		// Delete the trailing zeroes that were added for alignment purposes.
-		if (truncate_count > 0)
-		{
-			bytes_written -= truncate_count;
-
-			int res = ::ftruncate(fd, truncate_count);
-			if (res == -1)
-			{
-				got_exception = true;
-				throwFromErrno("Cannot truncate file " + filename, ErrorCodes::CANNOT_TRUNCATE_FILE);
-			}
-		}
-
+		bytes_written -= truncate_count;
 		if (pos_in_file > (std::numeric_limits<off_t>::max() - bytes_written))
 		{
 			got_exception = true;
 			throw Exception("File position overflowed", ErrorCodes::LOGICAL_ERROR);
 		}
 
-		pos_in_file += bytes_written;
+		off_t delta = pos_in_file - request.aio_offset;
+		pos_in_file += bytes_written - delta;
+
+		if (pos_in_file > max_pos)
+			max_pos = pos_in_file;
+
+		if (truncate_count > 0)
+		{
+			// Delete the trailing zeroes that were added for alignment purposes.
+			int res = ::ftruncate(fd, max_pos);
+			if (res == -1)
+			{
+				got_exception = true;
+				throwFromErrno("Cannot truncate file " + filename, ErrorCodes::CANNOT_TRUNCATE_FILE);
+			}
+		}
 	}
 }
 
diff --git a/dbms/src/IO/tests/write_buffer_aio.cpp b/dbms/src/IO/tests/write_buffer_aio.cpp
index 3dbdbd0e522..5f13361a297 100644
--- a/dbms/src/IO/tests/write_buffer_aio.cpp
+++ b/dbms/src/IO/tests/write_buffer_aio.cpp
@@ -19,6 +19,7 @@ bool test1();
 bool test2();
 bool test3();
 bool test4();
+bool test5();
 
 void run()
 {
@@ -27,7 +28,8 @@ void run()
 		test1,
 		test2,
 		test3,
-		test4
+		test4,
+		test5
 	};
 
 	unsigned int num = 0;
@@ -282,6 +284,52 @@ bool test4()
 	return true;
 }
 
+bool test5()
+{ // Writes 10 blocks of data starting at file offset 1 and checks that they read back unchanged.
+	namespace fs = boost::filesystem;
+
+	static const std::string symbols = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+
+	char pattern[] = "/tmp/fileXXXXXX";
+	char * dir = ::mkdtemp(pattern);
+	if (dir == nullptr)
+		die("Could not create directory");
+
+	const std::string directory = std::string(dir);
+	const std::string filename = directory + "/foo";
+
+	size_t n = 10 * DEFAULT_AIO_FILE_BLOCK_SIZE; // payload size: 10 blocks
+
+	std::string buf;
+	buf.reserve(n);
+
+	for (size_t i = 0; i < n; ++i)
+		buf += symbols[i % symbols.length()];
+
+	{
+		DB::WriteBufferAIO out(filename, 3 * DEFAULT_AIO_FILE_BLOCK_SIZE); // 3-block buffer, smaller than the payload
+
+		if (out.getFileName() != filename)
+			return false;
+		if (out.getFD() == -1)
+			return false;
+
+		out.seek(1, SEEK_SET); // start at offset 1, which is not a multiple of the block size
+		out.write(&buf[0], buf.length());
+	}
+
+	std::ifstream in(filename.c_str());
+	if (!in.is_open())
+		die("Could not open file");
+
+	std::string received{ std::istreambuf_iterator<char>(in), std::istreambuf_iterator<char>() };
+
+	in.close();
+	fs::remove_all(directory); // temporary directory is removed before the result is evaluated
+
+	return received.substr(1) == buf;
+}
+
 }
 
 int main()

From 1d0bd25480fd69bd05888d8e534985bf820ec922 Mon Sep 17 00:00:00 2001
From: Alexey Arno <af-arno@yandex-team.ru>
Date: Tue, 24 Mar 2015 14:05:49 +0300
Subject: [PATCH 16/55] dbms: Server: feature development. [#METR-15090]

---
 dbms/src/IO/WriteBufferAIO.cpp | 36 ----------------------------------
 1 file changed, 36 deletions(-)

diff --git a/dbms/src/IO/WriteBufferAIO.cpp b/dbms/src/IO/WriteBufferAIO.cpp
index a925d2a6f67..d2bd7f43aee 100644
--- a/dbms/src/IO/WriteBufferAIO.cpp
+++ b/dbms/src/IO/WriteBufferAIO.cpp
@@ -142,42 +142,6 @@ void WriteBufferAIO::nextImpl()
 
 	truncate_count = 0;
 
-	/*
-			region_aligned_begin     region_begin                             region_end      region_aligned_end
-			|                           |                                          |                           |
-			|     +---------------------+                                          +----------------------+    |
-			|     |                                                                                       |    |
-			+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+
-			|XXXXX*  :        :        :        :        :        :        :        :        :        :   *XXXX|
-	+-------|XXXXX*  :        :        :        :        :        :        :        :        :        :   *XXXX|-------+
-	|		|XXXXX*  :        :        :        :        :        :        :        :        :        :   *XXXX|       |
-	|		|XXXXX*  :        :        :        :        :        :        :        :        :        :   *XXXX|       |
-	|		+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+       |
-(1)	|           |                                       ^                                                 |            |(1)
-read|			+---- region left padding               |                      region right padding   ----+            |read
-    |                                                   |                                                              |
-	|		+--------+ (left padded page)               |                        (right padded page)  +--------+       |
-	|		|XXXXX*YY|                                  |                                             |ZZZ*XXXX|       |
-	|		|XXXXX*YY|--------------------------------->+<--------------------------------------------|ZZZ*XXXX|       |
-	+------>|XXXXX*YY|<--+                              | (3) scattered write           +------------>|ZZZ*XXXX|<------+
-			|XXXXX*YY|   |                              |                               |             |ZZZ*XXXX|
-			+--------+   |(2)copy                       |                               |(2)copy      +--------+
-                         |                              |                               |
-	+--------------------+                              |                               +--------------------+
-	|                                                   |                                                    |
-	|		buffer_begin     aligned_buffer_begin.......+...........aligned_buffer_end     buffer_end        |
-	|		|                        |                                   |                          |        |
-	|		|  +---------------------+                                   +----------------------+   |        |
-	|		|  |                                                                                |   |        |
-	|		---+--------+--------+--------+--------+--------+--------+--------+--------+--------+----        |
-	|		*YY:        :        :        :        :        :        :        :        :        :ZZZ*        |
-	|		*YY:        :        :        :        :        :        :        :        :        :ZZZ*        |
-	+-------*YY:        :        :        :        :        :        :        :        :        :ZZZ*--------+
-			*YY:        :        :        :        :        :        :        :        :        :ZZZ*
-			---+--------+--------+--------+--------+--------+--------+--------+--------+--------+----
-
-	 */
-
 	/// Регион диска, на который хотим записать данные.
 	off_t region_begin = pos_in_file;
 	off_t region_end = pos_in_file + flush_buffer.offset();

From 33460bb5ea4832db9719e184d2c2b5d991efb93c Mon Sep 17 00:00:00 2001
From: Alexey Arno <af-arno@yandex-team.ru>
Date: Tue, 24 Mar 2015 17:01:53 +0300
Subject: [PATCH 17/55] dbms: Server: feature development. [#METR-15090]

---
 dbms/src/IO/tests/write_buffer_aio.cpp | 63 +++++++++++++++++++++++++-
 1 file changed, 62 insertions(+), 1 deletion(-)

diff --git a/dbms/src/IO/tests/write_buffer_aio.cpp b/dbms/src/IO/tests/write_buffer_aio.cpp
index 5f13361a297..f5f8ee60ccd 100644
--- a/dbms/src/IO/tests/write_buffer_aio.cpp
+++ b/dbms/src/IO/tests/write_buffer_aio.cpp
@@ -20,6 +20,7 @@ bool test2();
 bool test3();
 bool test4();
 bool test5();
+bool test6();
 
 void run()
 {
@@ -29,7 +30,8 @@ void run()
 		test2,
 		test3,
 		test4,
-		test5
+		test5,
+		test6
 	};
 
 	unsigned int num = 0;
@@ -330,6 +332,65 @@ bool test5()
 	return received.substr(1) == buf;
 }
 
+bool test6()
+{ // Overwrites 10 bytes inside previously written data and checks the bytes around them are preserved.
+	namespace fs = boost::filesystem;
+
+	static const std::string symbols = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+
+	char pattern[] = "/tmp/fileXXXXXX";
+	char * dir = ::mkdtemp(pattern);
+	if (dir == nullptr)
+		die("Could not create directory");
+
+	const std::string directory = std::string(dir);
+	const std::string filename = directory + "/foo";
+
+	size_t n = 10 * DEFAULT_AIO_FILE_BLOCK_SIZE; // payload size: 10 blocks
+
+	std::string buf;
+	buf.reserve(n);
+
+	for (size_t i = 0; i < n; ++i)
+		buf += symbols[i % symbols.length()];
+
+	std::string buf2 = "1111111111"; // 10 bytes used for the overwrite
+
+	{
+		DB::WriteBufferAIO out(filename, 3 * DEFAULT_AIO_FILE_BLOCK_SIZE);
+
+		if (out.getFileName() != filename)
+			return false;
+		if (out.getFD() == -1)
+			return false;
+
+		out.seek(3, SEEK_SET); // start at offset 3, which is not a multiple of the block size
+		out.write(&buf[0], buf.length());
+		out.seek(-2 * DEFAULT_AIO_FILE_BLOCK_SIZE, SEEK_CUR); // step back two blocks from the current position
+		out.write(&buf2[0], buf2.length());
+	}
+
+	std::ifstream in(filename.c_str());
+	if (!in.is_open())
+		die("Could not open file");
+
+	std::string received{ std::istreambuf_iterator<char>(in), std::istreambuf_iterator<char>() };
+
+	in.close();
+	fs::remove_all(directory);
+
+	if (received.substr(3, 8 * DEFAULT_AIO_FILE_BLOCK_SIZE) != buf.substr(0, 8 * DEFAULT_AIO_FILE_BLOCK_SIZE)) // data before the overwrite
+		return false;
+
+	if (received.substr(3 + 8 * DEFAULT_AIO_FILE_BLOCK_SIZE, 10) != buf2) // the 10 overwritten bytes
+		return false;
+
+	if (received.substr(13 + 8 * DEFAULT_AIO_FILE_BLOCK_SIZE) != buf.substr(10 + 8 * DEFAULT_AIO_FILE_BLOCK_SIZE)) // data after the overwrite
+		return false;
+
+	return true;
+}
+
 }
 
 int main()

From f0d6526834a7638c42dd60c92f5b61fa92568b11 Mon Sep 17 00:00:00 2001
From: Pavel Kartavyy <kartavyy@yandex-team.ru>
Date: Tue, 24 Mar 2015 18:28:02 +0300
Subject: [PATCH 18/55] init script: sudo is not required for status

---
 tools/init.d/template | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/tools/init.d/template b/tools/init.d/template
index ca290798493..171ac9aaa6d 100755
--- a/tools/init.d/template
+++ b/tools/init.d/template
@@ -217,17 +217,6 @@ main()
 	stop)
 		disable_cron && stop
 		;;
-	status)
-		if [[ $(running_processes) -eq $NUMBER_OF_PROCESSES ]]; then
-			echo "$PROGRAM service is running"
-		else
-			if is_cron_disabled; then
-				echo "$PROGRAM service is stopped";
-			else
-				echo "$PROGRAM: $(($NUMBER_OF_PROCESSES - $(running_processes))) of $NUMBER_OF_PROCESSES processes unexpectedly terminated"
-			fi
-		fi
-		;;
 	restart)
 		restart && enable_cron
 		;;
@@ -260,6 +249,23 @@ main()
 	exit $EXIT_STATUS
 }
 
+status()
+{ # Prints one line: all processes running, service stopped, or how many processes are missing.
+	if [[ $(running_processes) -eq $NUMBER_OF_PROCESSES ]]; then
+		echo "$PROGRAM service is running"
+	else
+		if is_cron_disabled; then # reported as stopped when is_cron_disabled succeeds
+			echo "$PROGRAM service is stopped";
+		else
+			echo "$PROGRAM: $(($NUMBER_OF_PROCESSES - $(running_processes))) of $NUMBER_OF_PROCESSES processes unexpectedly terminated"
+		fi
+	fi
+}
+
+if [[ "$1" == "status" ]]; then
+	status
+	exit 0
+fi
 (
 	if flock -n 9; then
 		main "$@"

From db117fe7e8ad5c59b18ebc30a44288f3ff101129 Mon Sep 17 00:00:00 2001
From: Pavel Kartavyy <kartavyy@yandex-team.ru>
Date: Wed, 25 Mar 2015 14:36:46 +0300
Subject: [PATCH 19/55] init script: sudo is not required for status

---
 tools/init.d/template | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/tools/init.d/template b/tools/init.d/template
index 171ac9aaa6d..20b61081705 100755
--- a/tools/init.d/template
+++ b/tools/init.d/template
@@ -34,6 +34,12 @@ PIDDIR=/var/run/$PROGRAM
 PIDFILE_PREFIX=$PIDDIR/$PROGRAM
 PIDFILE_RE="$PIDFILE_PREFIX[0-9]*.pid"
 
+SUPPORTED_COMMANDS="{start|stop|status|restart|forcestop|forcerestart|reload|condstart|condstop|condrestart|condreload}"
+is_supported_command()
+{ # Succeeds when "$1", delimited by '{', '|' or '}', is found in $SUPPORTED_COMMANDS.
+	echo $SUPPORTED_COMMANDS | grep -E "(\{|\|)$1(\||})" &> /dev/null # NOTE(review): $1 is inserted into the regex unescaped, so an argument such as '.*' also matches - confirm this is acceptable
+}
+
 generate_program_name()
 {
 	if [ $NUMBER_OF_PROCESSES -eq 1 ]; then
@@ -241,9 +247,6 @@ main()
 	condreload)
 		any_runs && restart
 		;;
-	*)
-		echo "Usage: ${0##*/} {start|stop|status|restart|forcestop|forcerestart|reload|condstart|condstop|condrestart|condreload}"
-		EXIT_STATUS=2
 	esac
 
 	exit $EXIT_STATUS
@@ -262,10 +265,17 @@ status()
 	fi
 }
 
+# выполняем команды, не нуждающиеся в блокировке
+if ! is_supported_command "$1"; then
+	echo "Usage: ${0##*/} $SUPPORTED_COMMANDS"
+	exit 2
+fi
+
 if [[ "$1" == "status" ]]; then
 	status
 	exit 0
 fi
+
 (
 	if flock -n 9; then
 		main "$@"

From 9f6d66a28533ade41a7cd44cad7b878e1b4e3a48 Mon Sep 17 00:00:00 2001
From: Alexey Arno <af-arno@yandex-team.ru>
Date: Wed, 25 Mar 2015 17:14:06 +0300
Subject: [PATCH 20/55] dbms: Server: feature development. [#METR-15090]

---
 dbms/src/IO/WriteBufferAIO.cpp         |  40 +++--
 dbms/src/IO/tests/write_buffer_aio.cpp | 223 ++++++++++++++++++++++++-
 2 files changed, 244 insertions(+), 19 deletions(-)

diff --git a/dbms/src/IO/WriteBufferAIO.cpp b/dbms/src/IO/WriteBufferAIO.cpp
index d2bd7f43aee..7315166fd3f 100644
--- a/dbms/src/IO/WriteBufferAIO.cpp
+++ b/dbms/src/IO/WriteBufferAIO.cpp
@@ -93,6 +93,9 @@ off_t WriteBufferAIO::seek(off_t off, int whence)
 		throw Exception("WriteBufferAIO::seek expects SEEK_SET or SEEK_CUR as whence", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
 	}
 
+	if (pos_in_file > max_pos)
+		max_pos = pos_in_file;
+
 	return pos_in_file;
 }
 
@@ -142,12 +145,12 @@ void WriteBufferAIO::nextImpl()
 
 	truncate_count = 0;
 
-	/// Регион диска, на который хотим записать данные.
+	/// Регион диска, в который хотим записать данные.
 	off_t region_begin = pos_in_file;
 	off_t region_end = pos_in_file + flush_buffer.offset();
 	size_t region_size = region_end - region_begin;
 
-	/// Регион диска, на который действительно записываем данные.
+	/// Регион диска, в который действительно записываем данные.
 	size_t region_left_padding = region_begin % DEFAULT_AIO_FILE_BLOCK_SIZE;
 	size_t region_right_padding = 0;
 	if (region_end % DEFAULT_AIO_FILE_BLOCK_SIZE != 0)
@@ -165,15 +168,14 @@ void WriteBufferAIO::nextImpl()
 
 	/// Обработать буфер, чтобы он оторажал структуру региона диска.
 
-	// Process the left side.
-	bool has_excess_buffer = false;
+	size_t excess = 0;
 	if (region_left_padding > 0)
 	{
 		if ((region_left_padding + buffer_size) > buffer_capacity)
 		{
-			has_excess_buffer = true;
+			excess = region_left_padding + buffer_size - buffer_capacity;
 			::memset(&memory_page[0], 0, memory_page.size());
-			::memcpy(&memory_page[0], buffer_end - region_left_padding, region_left_padding);
+			::memcpy(&memory_page[0], buffer_end - excess, excess);
 			buffer_end = buffer_begin + buffer_capacity;
 			buffer_size = buffer_capacity;
 		}
@@ -194,20 +196,22 @@ void WriteBufferAIO::nextImpl()
 		}
 	}
 
-	Position end_ptr;
-	if (has_excess_buffer)
-		end_ptr = &memory_page[region_left_padding];
-	else
-		end_ptr = buffer_end;
-
-	// Process the right side.
 	if (region_right_padding > 0)
 	{
+		Position end_ptr;
+		if (excess > 0)
+			end_ptr = &memory_page[excess];
+		else
+			end_ptr = buffer_end;
+
 		::memset(end_ptr, 0, region_right_padding);
 		ssize_t read_count = ::pread(fd2, end_ptr, region_right_padding, region_end);
 		if (read_count < 0)
-			read_count = 0;
-		truncate_count = DEFAULT_AIO_FILE_BLOCK_SIZE - (region_left_padding + read_count);
+		{
+			got_exception = true;
+			throw Exception("Read error", ErrorCodes::AIO_READ_ERROR);
+		}
+		truncate_count = region_right_padding - read_count;
 	}
 
 	/// Создать запрос на асинхронную запись.
@@ -215,13 +219,13 @@ void WriteBufferAIO::nextImpl()
 	size_t i =  0;
 
 	iov[i].iov_base = buffer_begin;
-	iov[i].iov_len = (has_excess_buffer ? buffer_capacity : region_aligned_size);
+	iov[i].iov_len = ((excess > 0) ? buffer_capacity : region_aligned_size);
 	++i;
 
-	if (has_excess_buffer)
+	if (excess > 0)
 	{
 		iov[i].iov_base = &memory_page[0];
-		iov[i].iov_len = DEFAULT_AIO_FILE_BLOCK_SIZE;
+		iov[i].iov_len = memory_page.size();
 		++i;
 	}
 
diff --git a/dbms/src/IO/tests/write_buffer_aio.cpp b/dbms/src/IO/tests/write_buffer_aio.cpp
index f5f8ee60ccd..799c29dd8c9 100644
--- a/dbms/src/IO/tests/write_buffer_aio.cpp
+++ b/dbms/src/IO/tests/write_buffer_aio.cpp
@@ -21,6 +21,10 @@ bool test3();
 bool test4();
 bool test5();
 bool test6();
+bool test7();
+bool test8();
+bool test9();
+bool test10();
 
 void run()
 {
@@ -31,7 +35,11 @@ void run()
 		test3,
 		test4,
 		test5,
-		test6
+		test6,
+		test7,
+		test8,
+		test9,
+		test10
 	};
 
 	unsigned int num = 0;
@@ -391,6 +399,219 @@ bool test6()
 	return true;
 }
 
+bool test7()
+{
+	namespace fs = boost::filesystem;
+
+	static const std::string symbols = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+
+	char pattern[] = "/tmp/fileXXXXXX";
+	char * dir = ::mkdtemp(pattern);
+	if (dir == nullptr)
+		die("Could not create directory");
+
+	const std::string directory = std::string(dir);
+	const std::string filename = directory + "/foo";
+
+	size_t n = DEFAULT_AIO_FILE_BLOCK_SIZE;
+
+	std::string buf;
+	buf.reserve(n);
+
+	for (size_t i = 0; i < n; ++i)
+		buf += symbols[i % symbols.length()];
+
+	std::string buf2 = "1111111111";
+
+	{
+		DB::WriteBufferAIO out(filename, DEFAULT_AIO_FILE_BLOCK_SIZE);
+
+		if (out.getFileName() != filename)
+			return false;
+		if (out.getFD() == -1)
+			return false;
+
+		out.seek(3, SEEK_SET); // leave a 3-byte hole at the start of the file
+		out.write(&buf[0], buf.length());
+		out.seek(3, SEEK_CUR); // leave a second 3-byte hole between the two writes
+		out.write(&buf2[0], buf2.length());
+	}
+
+	std::ifstream in(filename.c_str());
+	if (!in.is_open())
+		die("Could not open file");
+
+	std::string received{ std::istreambuf_iterator<char>(in), std::istreambuf_iterator<char>() };
+	// Clean up before validating so that a failed check does not leak the temporary directory.
+	in.close();
+	fs::remove_all(directory);
+
+	if (received.length() != (6 + buf.length() + buf2.length()))
+		return false;
+	if (received.substr(0, 3) != std::string(3, '\0'))
+		return false;
+	if (received.substr(3, buf.length()) != buf)
+		return false;
+	if (received.substr(3 + buf.length(), 3) != std::string(3, '\0'))
+		return false;
+	if (received.substr(6 + buf.length()) != buf2)
+		return false;
+
+	return true;
+}
+
+bool test8()
+{
+	// Unaligned write straddling the first block boundary; the bytes before it must read back as zeros.
+	namespace fs = boost::filesystem;
+
+	char pattern[] = "/tmp/fileXXXXXX";
+	char * dir = ::mkdtemp(pattern);
+	if (dir == nullptr)
+		die("Could not create directory");
+
+	const std::string directory = std::string(dir);
+	const std::string filename = directory + "/foo";
+
+	std::string buf2 = "11111111112222222222";
+	const size_t offset = DEFAULT_AIO_FILE_BLOCK_SIZE - (buf2.length() / 2);
+
+	{
+		// Minimal buffer size = 2 pages.
+		DB::WriteBufferAIO out(filename, 2 * DEFAULT_AIO_FILE_BLOCK_SIZE);
+
+		if (out.getFileName() != filename)
+			return false;
+		if (out.getFD() == -1)
+			return false;
+
+		out.seek(offset, SEEK_SET);
+		out.write(&buf2[0], buf2.length());
+	}
+
+	std::ifstream in(filename.c_str());
+	if (!in.is_open())
+		die("Could not open file");
+
+	std::string received{ std::istreambuf_iterator<char>(in), std::istreambuf_iterator<char>() };
+	// Clean up before validating so that a failed check does not leak the temporary directory.
+	in.close();
+	fs::remove_all(directory);
+
+	if (received.length() != offset + buf2.length())
+		return false;
+	if (received.substr(0, offset) != std::string(offset, '\0'))
+		return false;
+	if (received.substr(offset, buf2.length()) != buf2)
+		return false;
+
+	return true;
+}
+
+bool test9()
+{
+	// Unaligned write straddling the boundary between the second and third blocks; the bytes before it must read back as zeros.
+	namespace fs = boost::filesystem;
+
+	char pattern[] = "/tmp/fileXXXXXX";
+	char * dir = ::mkdtemp(pattern);
+	if (dir == nullptr)
+		die("Could not create directory");
+
+	const std::string directory = std::string(dir);
+	const std::string filename = directory + "/foo";
+
+	std::string buf2 = "11111111112222222222";
+	const size_t offset = 2 * DEFAULT_AIO_FILE_BLOCK_SIZE - (buf2.length() / 2);
+
+	{
+		// Minimal buffer size = 2 pages.
+		DB::WriteBufferAIO out(filename, 2 * DEFAULT_AIO_FILE_BLOCK_SIZE);
+
+		if (out.getFileName() != filename)
+			return false;
+		if (out.getFD() == -1)
+			return false;
+
+		out.seek(offset, SEEK_SET);
+		out.write(&buf2[0], buf2.length());
+	}
+
+	std::ifstream in(filename.c_str());
+	if (!in.is_open())
+		die("Could not open file");
+
+	std::string received{ std::istreambuf_iterator<char>(in), std::istreambuf_iterator<char>() };
+	// Clean up before validating so that a failed check does not leak the temporary directory.
+	in.close();
+	fs::remove_all(directory);
+
+	if (received.length() != offset + buf2.length())
+		return false;
+	if (received.substr(0, offset) != std::string(offset, '\0'))
+		return false;
+	if (received.substr(offset, buf2.length()) != buf2)
+		return false;
+
+	return true;
+}
+
+bool test10()
+{ // Steps back one block after an unaligned write and overwrites with data that extends past the previous end.
+	namespace fs = boost::filesystem;
+
+	static const std::string symbols = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+
+	char pattern[] = "/tmp/fileXXXXXX";
+	char * dir = ::mkdtemp(pattern);
+	if (dir == nullptr)
+		die("Could not create directory");
+
+	const std::string directory = std::string(dir);
+	const std::string filename = directory + "/foo";
+
+	size_t n = 3 * DEFAULT_AIO_FILE_BLOCK_SIZE; // first payload: 3 blocks
+
+	std::string buf;
+	buf.reserve(n);
+
+	for (size_t i = 0; i < n; ++i)
+		buf += symbols[i % symbols.length()];
+
+	std::string buf2(DEFAULT_AIO_FILE_BLOCK_SIZE + 10, '1'); // second payload: one block plus 10 bytes
+
+	{
+		DB::WriteBufferAIO out(filename, 2 * DEFAULT_AIO_FILE_BLOCK_SIZE);
+
+		if (out.getFileName() != filename)
+			return false;
+		if (out.getFD() == -1)
+			return false;
+
+		out.seek(3, SEEK_SET); // start at offset 3, which is not a multiple of the block size
+		out.write(&buf[0], buf.length());
+		out.seek(-DEFAULT_AIO_FILE_BLOCK_SIZE, SEEK_CUR); // step back one block; buf2 is 10 bytes longer than that
+		out.write(&buf2[0], buf2.length());
+	}
+
+	std::ifstream in(filename.c_str());
+	if (!in.is_open())
+		die("Could not open file");
+
+	std::string received{ std::istreambuf_iterator<char>(in), std::istreambuf_iterator<char>() };
+
+	in.close();
+	fs::remove_all(directory);
+
+	if (received.substr(3, 2 * DEFAULT_AIO_FILE_BLOCK_SIZE) != buf.substr(0, 2 * DEFAULT_AIO_FILE_BLOCK_SIZE)) // first two blocks are untouched
+		return false;
+
+	if (received.substr(3 + 2 * DEFAULT_AIO_FILE_BLOCK_SIZE, DEFAULT_AIO_FILE_BLOCK_SIZE + 10) != buf2) // overwritten and appended range
+		return false;
+
+	return true;
+}
+
 }
 
 int main()

From 6789e6327c366cbbd6b9e66469dd50d2374d0f6f Mon Sep 17 00:00:00 2001
From: Alexey Arno <af-arno@yandex-team.ru>
Date: Wed, 25 Mar 2015 17:15:50 +0300
Subject: [PATCH 21/55] dbms: Server: feature development. [#METR-15090]

---
 dbms/src/IO/WriteBufferAIO.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dbms/src/IO/WriteBufferAIO.cpp b/dbms/src/IO/WriteBufferAIO.cpp
index 7315166fd3f..64a853c1973 100644
--- a/dbms/src/IO/WriteBufferAIO.cpp
+++ b/dbms/src/IO/WriteBufferAIO.cpp
@@ -55,6 +55,8 @@ WriteBufferAIO::~WriteBufferAIO()
 
 	if (fd != -1)
 		::close(fd);
+	if (fd2 != -1)
+		::close(fd2);
 }
 
 off_t WriteBufferAIO::seek(off_t off, int whence)

From 3c65c6d1c5450b4b4af59b3c1cc282cd3afac1d9 Mon Sep 17 00:00:00 2001
From: Alexey Arno <af-arno@yandex-team.ru>
Date: Wed, 25 Mar 2015 17:17:51 +0300
Subject: [PATCH 22/55] dbms: Server: feature development. [#METR-15090]

---
 dbms/src/IO/WriteBufferAIO.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbms/src/IO/WriteBufferAIO.cpp b/dbms/src/IO/WriteBufferAIO.cpp
index 64a853c1973..6c7a2267514 100644
--- a/dbms/src/IO/WriteBufferAIO.cpp
+++ b/dbms/src/IO/WriteBufferAIO.cpp
@@ -168,7 +168,7 @@ void WriteBufferAIO::nextImpl()
 	size_t buffer_size = buffer_end - buffer_begin;
 	size_t buffer_capacity = flush_buffer.buffer().size();
 
-	/// Обработать буфер, чтобы он оторажал структуру региона диска.
+	/// Обработать буфер, чтобы он отражал структуру региона диска.
 
 	size_t excess = 0;
 	if (region_left_padding > 0)

From 3a2fa12708b1a397e34ab04f0910f039fcee9dca Mon Sep 17 00:00:00 2001
From: Alexey Arno <af-arno@yandex-team.ru>
Date: Wed, 25 Mar 2015 17:44:55 +0300
Subject: [PATCH 23/55] dbms: Server: feature development. [#METR-15090]

---
 dbms/src/IO/WriteBufferAIO.cpp | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/dbms/src/IO/WriteBufferAIO.cpp b/dbms/src/IO/WriteBufferAIO.cpp
index 6c7a2267514..1f1fef245e5 100644
--- a/dbms/src/IO/WriteBufferAIO.cpp
+++ b/dbms/src/IO/WriteBufferAIO.cpp
@@ -176,8 +176,8 @@ void WriteBufferAIO::nextImpl()
 		if ((region_left_padding + buffer_size) > buffer_capacity)
 		{
 			excess = region_left_padding + buffer_size - buffer_capacity;
-			::memset(&memory_page[0], 0, memory_page.size());
 			::memcpy(&memory_page[0], buffer_end - excess, excess);
+			::memset(&memory_page[excess], 0, memory_page.size() - excess);
 			buffer_end = buffer_begin + buffer_capacity;
 			buffer_size = buffer_capacity;
 		}
@@ -188,7 +188,6 @@ void WriteBufferAIO::nextImpl()
 		}
 
 		::memmove(buffer_begin + region_left_padding, buffer_begin, buffer_size - region_left_padding);
-		::memset(buffer_begin, 0, region_left_padding);
 
 		ssize_t read_count = ::pread(fd2, buffer_begin, region_left_padding, region_aligned_begin);
 		if (read_count < 0)
@@ -196,24 +195,29 @@ void WriteBufferAIO::nextImpl()
 			got_exception = true;
 			throw Exception("Read error", ErrorCodes::AIO_READ_ERROR);
 		}
+
+		::memset(buffer_begin + read_count, 0, region_left_padding - read_count);
 	}
 
 	if (region_right_padding > 0)
 	{
-		Position end_ptr;
+		Position from;
 		if (excess > 0)
-			end_ptr = &memory_page[excess];
+			from = &memory_page[excess];
 		else
-			end_ptr = buffer_end;
+			from = buffer_end;
 
-		::memset(end_ptr, 0, region_right_padding);
-		ssize_t read_count = ::pread(fd2, end_ptr, region_right_padding, region_end);
+		ssize_t read_count = ::pread(fd2, from, region_right_padding, region_end);
 		if (read_count < 0)
 		{
 			got_exception = true;
 			throw Exception("Read error", ErrorCodes::AIO_READ_ERROR);
 		}
+
 		truncate_count = region_right_padding - read_count;
+
+		if (from == buffer_end)
+			::memset(from + read_count, 0, truncate_count);
 	}
 
 	/// Создать запрос на асинхронную запись.

From 8cdeeaa5a4be444dddecc576349f425592b0860b Mon Sep 17 00:00:00 2001
From: Alexey Arno <af-arno@yandex-team.ru>
Date: Wed, 25 Mar 2015 17:58:23 +0300
Subject: [PATCH 24/55] dbms: Server: feature development. [#METR-15090]

---
 dbms/include/DB/IO/WriteBufferAIO.h |  7 ++++++-
 dbms/src/IO/WriteBufferAIO.cpp      | 12 +++++++-----
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/dbms/include/DB/IO/WriteBufferAIO.h b/dbms/include/DB/IO/WriteBufferAIO.h
index 1a31772daa3..1d1f6f6bf77 100644
--- a/dbms/include/DB/IO/WriteBufferAIO.h
+++ b/dbms/include/DB/IO/WriteBufferAIO.h
@@ -62,9 +62,14 @@ private:
 	off_t bytes_to_write = 0;
 	off_t truncate_count = 0;
 
+	/// Текущая позиция в файле.
 	off_t pos_in_file = 0;
-	off_t max_pos = 0;
+	/// Максимальная достигнутая позиция в файле.
+	off_t max_pos_in_file = 0;
+
+	/// Файловый дескриптор для записи.
 	int fd = -1;
+	/// Файловый дескриптор для чтения. Употребляется для невыровненных записей.
 	int fd2 = -1;
 
 	/// Асинхронная операция записи ещё не завершилась.
diff --git a/dbms/src/IO/WriteBufferAIO.cpp b/dbms/src/IO/WriteBufferAIO.cpp
index 1f1fef245e5..9b6a85f5a21 100644
--- a/dbms/src/IO/WriteBufferAIO.cpp
+++ b/dbms/src/IO/WriteBufferAIO.cpp
@@ -28,6 +28,8 @@ WriteBufferAIO::WriteBufferAIO(const std::string & filename_, size_t buffer_size
 		throwFromErrno("Cannot open file " + filename, error_code);
 	}
 
+	ProfileEvents::increment(ProfileEvents::FileOpen);
+
 	fd2 = ::open(filename.c_str(), O_RDONLY, mode_);
 	if (fd2 == -1)
 	{
@@ -95,8 +97,8 @@ off_t WriteBufferAIO::seek(off_t off, int whence)
 		throw Exception("WriteBufferAIO::seek expects SEEK_SET or SEEK_CUR as whence", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
 	}
 
-	if (pos_in_file > max_pos)
-		max_pos = pos_in_file;
+	if (pos_in_file > max_pos_in_file)
+		max_pos_in_file = pos_in_file;
 
 	return pos_in_file;
 }
@@ -298,13 +300,13 @@ void WriteBufferAIO::waitForAIOCompletion()
 		off_t delta = pos_in_file - request.aio_offset;
 		pos_in_file += bytes_written - delta;
 
-		if (pos_in_file > max_pos)
-			max_pos = pos_in_file;
+		if (pos_in_file > max_pos_in_file)
+			max_pos_in_file = pos_in_file;
 
 		if (truncate_count > 0)
 		{
 			// Delete the trailing zeroes that were added for alignment purposes.
-			int res = ::ftruncate(fd, max_pos);
+			int res = ::ftruncate(fd, max_pos_in_file);
 			if (res == -1)
 			{
 				got_exception = true;

From 21259c6f3e4f82f29cc6871037f59b8f55a52632 Mon Sep 17 00:00:00 2001
From: Alexey Arno <af-arno@yandex-team.ru>
Date: Wed, 25 Mar 2015 18:30:48 +0300
Subject: [PATCH 25/55] dbms: Server: feature development. [#METR-15090]

---
 dbms/src/IO/WriteBufferAIO.cpp | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/dbms/src/IO/WriteBufferAIO.cpp b/dbms/src/IO/WriteBufferAIO.cpp
index 9b6a85f5a21..a6d39b764bf 100644
--- a/dbms/src/IO/WriteBufferAIO.cpp
+++ b/dbms/src/IO/WriteBufferAIO.cpp
@@ -150,29 +150,28 @@ void WriteBufferAIO::nextImpl()
 	truncate_count = 0;
 
 	/// Регион диска, в который хотим записать данные.
-	off_t region_begin = pos_in_file;
-	off_t region_end = pos_in_file + flush_buffer.offset();
-	size_t region_size = region_end - region_begin;
+	const off_t region_begin = pos_in_file;
+	const off_t region_end = pos_in_file + flush_buffer.offset();
+	const size_t region_size = region_end - region_begin;
 
 	/// Регион диска, в который действительно записываем данные.
-	size_t region_left_padding = region_begin % DEFAULT_AIO_FILE_BLOCK_SIZE;
-	size_t region_right_padding = 0;
-	if (region_end % DEFAULT_AIO_FILE_BLOCK_SIZE != 0)
-		region_right_padding = DEFAULT_AIO_FILE_BLOCK_SIZE - (region_end % DEFAULT_AIO_FILE_BLOCK_SIZE);
+	const size_t region_left_padding = region_begin % DEFAULT_AIO_FILE_BLOCK_SIZE;
+	const size_t region_right_padding = (DEFAULT_AIO_FILE_BLOCK_SIZE - (region_end % DEFAULT_AIO_FILE_BLOCK_SIZE)) % DEFAULT_AIO_FILE_BLOCK_SIZE;
 
-	off_t region_aligned_begin = region_begin - region_left_padding;
-	off_t region_aligned_end = region_end + region_right_padding;
-	size_t region_aligned_size = region_aligned_end - region_aligned_begin;
+	const off_t region_aligned_begin = region_begin - region_left_padding;
+	const off_t region_aligned_end = region_end + region_right_padding;
+	const size_t region_aligned_size = region_aligned_end - region_aligned_begin;
 
 	/// Буфер данных, которые хотим записать на диск.
-	Position buffer_begin = flush_buffer.buffer().begin();
+	const Position buffer_begin = flush_buffer.buffer().begin();
 	Position buffer_end = buffer_begin + region_size;
 	size_t buffer_size = buffer_end - buffer_begin;
-	size_t buffer_capacity = flush_buffer.buffer().size();
+	const size_t buffer_capacity = flush_buffer.buffer().size();
 
 	/// Обработать буфер, чтобы он отражал структуру региона диска.
 
 	size_t excess = 0;
+
 	if (region_left_padding > 0)
 	{
 		if ((region_left_padding + buffer_size) > buffer_capacity)
@@ -180,14 +179,12 @@ void WriteBufferAIO::nextImpl()
 			excess = region_left_padding + buffer_size - buffer_capacity;
 			::memcpy(&memory_page[0], buffer_end - excess, excess);
 			::memset(&memory_page[excess], 0, memory_page.size() - excess);
-			buffer_end = buffer_begin + buffer_capacity;
 			buffer_size = buffer_capacity;
 		}
 		else
-		{
 			buffer_size += region_left_padding;
-			buffer_end = buffer_begin + buffer_size;
-		}
+
+		buffer_end = buffer_begin + buffer_size;
 
 		::memmove(buffer_begin + region_left_padding, buffer_begin, buffer_size - region_left_padding);
 

From cac04c89b223daa369a21e44b33c65d23ab7b379 Mon Sep 17 00:00:00 2001
From: Pavel Kartavyy <kartavyy@yandex-team.ru>
Date: Wed, 25 Mar 2015 18:49:11 +0300
Subject: [PATCH 26/55] init script: fix warning when daemon installed first
 time

---
 tools/init.d/template | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/init.d/template b/tools/init.d/template
index 20b61081705..cbb87aaed65 100755
--- a/tools/init.d/template
+++ b/tools/init.d/template
@@ -72,7 +72,7 @@ specific_log_file_for_each_process()
 
 find_pid_files()
 {
-	find $PIDDIR -regex "$PIDFILE_RE"
+	[[ -e $PIDDIR ]] && find $PIDDIR -regex "$PIDFILE_RE" # run find only when $PIDDIR exists; otherwise the function returns non-zero
 }
 
 is_running()

From ae5fd4dbc9ed3a0419902a8eac2977ff6d6e4133 Mon Sep 17 00:00:00 2001
From: Alexey Arno <af-arno@yandex-team.ru>
Date: Wed, 25 Mar 2015 19:44:30 +0300
Subject: [PATCH 27/55] dbms: Server: feature development. [#METR-15090]

---
 dbms/include/DB/IO/WriteBufferAIO.h |  2 +-
 dbms/src/IO/WriteBufferAIO.cpp      | 92 ++++++++++++++---------------
 2 files changed, 47 insertions(+), 47 deletions(-)

diff --git a/dbms/include/DB/IO/WriteBufferAIO.h b/dbms/include/DB/IO/WriteBufferAIO.h
index 1d1f6f6bf77..5fed79c17d0 100644
--- a/dbms/include/DB/IO/WriteBufferAIO.h
+++ b/dbms/include/DB/IO/WriteBufferAIO.h
@@ -60,7 +60,7 @@ private:
 	const std::string filename;
 
 	off_t bytes_to_write = 0;
-	off_t truncate_count = 0;
+	off_t truncation_count = 0;
 
 	/// Текущая позиция в файле.
 	off_t pos_in_file = 0;
diff --git a/dbms/src/IO/WriteBufferAIO.cpp b/dbms/src/IO/WriteBufferAIO.cpp
index a6d39b764bf..0103626c0cc 100644
--- a/dbms/src/IO/WriteBufferAIO.cpp
+++ b/dbms/src/IO/WriteBufferAIO.cpp
@@ -147,7 +147,7 @@ void WriteBufferAIO::nextImpl()
 	waitForAIOCompletion();
 	swapBuffers();
 
-	truncate_count = 0;
+	truncation_count = 0;
 
 	/// Регион диска, в который хотим записать данные.
 	const off_t region_begin = pos_in_file;
@@ -170,15 +170,15 @@ void WriteBufferAIO::nextImpl()
 
 	/// Обработать буфер, чтобы он отражал структуру региона диска.
 
-	size_t excess = 0;
+	size_t excess_count = 0;
 
 	if (region_left_padding > 0)
 	{
 		if ((region_left_padding + buffer_size) > buffer_capacity)
 		{
-			excess = region_left_padding + buffer_size - buffer_capacity;
-			::memcpy(&memory_page[0], buffer_end - excess, excess);
-			::memset(&memory_page[excess], 0, memory_page.size() - excess);
+			excess_count = region_left_padding + buffer_size - buffer_capacity;
+			::memcpy(&memory_page[0], buffer_end - excess_count, excess_count);
+			::memset(&memory_page[excess_count], 0, memory_page.size() - excess_count);
 			buffer_size = buffer_capacity;
 		}
 		else
@@ -201,8 +201,8 @@ void WriteBufferAIO::nextImpl()
 	if (region_right_padding > 0)
 	{
 		Position from;
-		if (excess > 0)
-			from = &memory_page[excess];
+		if (excess_count > 0)
+			from = &memory_page[excess_count];
 		else
 			from = buffer_end;
 
@@ -213,10 +213,10 @@ void WriteBufferAIO::nextImpl()
 			throw Exception("Read error", ErrorCodes::AIO_READ_ERROR);
 		}
 
-		truncate_count = region_right_padding - read_count;
+		truncation_count = region_right_padding - read_count;
 
 		if (from == buffer_end)
-			::memset(from + read_count, 0, truncate_count);
+			::memset(from + read_count, 0, truncation_count);
 	}
 
 	/// Создать запрос на асинхронную запись.
@@ -224,10 +224,10 @@ void WriteBufferAIO::nextImpl()
 	size_t i =  0;
 
 	iov[i].iov_base = buffer_begin;
-	iov[i].iov_len = ((excess > 0) ? buffer_capacity : region_aligned_size);
+	iov[i].iov_len = ((excess_count > 0) ? buffer_capacity : region_aligned_size);
 	++i;
 
-	if (excess > 0)
+	if (excess_count > 0)
 	{
 		iov[i].iov_base = &memory_page[0];
 		iov[i].iov_len = memory_page.size();
@@ -267,48 +267,48 @@ void WriteBufferAIO::nextImpl()
 
 void WriteBufferAIO::waitForAIOCompletion()
 {
-	if (is_pending_write)
+	if (!is_pending_write)
+		return;
+
+	while (io_getevents(aio_context.ctx, events.size(), events.size(), &events[0], nullptr) < 0)
 	{
-		while (io_getevents(aio_context.ctx, events.size(), events.size(), &events[0], nullptr) < 0)
-		{
-			if (errno != EINTR)
-			{
-				got_exception = true;
-				throw Exception("Failed to wait for asynchronous IO completion on file " + filename, ErrorCodes::AIO_COMPLETION_ERROR);
-			}
-		}
-
-		is_pending_write = false;
-		off_t bytes_written = events[0].res;
-
-		if (bytes_written < bytes_to_write)
+		if (errno != EINTR)
 		{
 			got_exception = true;
-			throw Exception("Asynchronous write error on file " + filename, ErrorCodes::AIO_WRITE_ERROR);
+			throw Exception("Failed to wait for asynchronous IO completion on file " + filename, ErrorCodes::AIO_COMPLETION_ERROR);
 		}
+	}
 
-		bytes_written -= truncate_count;
-		if (pos_in_file > (std::numeric_limits<off_t>::max() - bytes_written))
+	is_pending_write = false;
+	off_t bytes_written = events[0].res;
+
+	if (bytes_written < bytes_to_write)
+	{
+		got_exception = true;
+		throw Exception("Asynchronous write error on file " + filename, ErrorCodes::AIO_WRITE_ERROR);
+	}
+
+	bytes_written -= truncation_count;
+
+	off_t pos_offset = bytes_written - (pos_in_file - request.aio_offset);
+	if (pos_in_file > (std::numeric_limits<off_t>::max() - pos_offset))
+	{
+		got_exception = true;
+		throw Exception("File position overflowed", ErrorCodes::LOGICAL_ERROR);
+	}
+	pos_in_file += pos_offset;
+
+	if (pos_in_file > max_pos_in_file)
+		max_pos_in_file = pos_in_file;
+
+	if (truncation_count > 0)
+	{
+		/// Delete the trailing zeroes that were added for alignment purposes.
+		int res = ::ftruncate(fd, max_pos_in_file);
+		if (res == -1)
 		{
 			got_exception = true;
-			throw Exception("File position overflowed", ErrorCodes::LOGICAL_ERROR);
-		}
-
-		off_t delta = pos_in_file - request.aio_offset;
-		pos_in_file += bytes_written - delta;
-
-		if (pos_in_file > max_pos_in_file)
-			max_pos_in_file = pos_in_file;
-
-		if (truncate_count > 0)
-		{
-			// Delete the trailing zeroes that were added for alignment purposes.
-			int res = ::ftruncate(fd, max_pos_in_file);
-			if (res == -1)
-			{
-				got_exception = true;
-				throwFromErrno("Cannot truncate file " + filename, ErrorCodes::CANNOT_TRUNCATE_FILE);
-			}
+			throwFromErrno("Cannot truncate file " + filename, ErrorCodes::CANNOT_TRUNCATE_FILE);
 		}
 	}
 }

From 9cc2e5204744742fc13193711f3eced6065fd93d Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Thu, 26 Mar 2015 03:38:17 +0300
Subject: [PATCH 28/55] dbms: string search: improved performance in fallback
 case [#METR-15690].

---
 dbms/include/DB/Common/Volnitsky.h | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/dbms/include/DB/Common/Volnitsky.h b/dbms/include/DB/Common/Volnitsky.h
index c6b5fb8ac5c..d1507f1d3be 100644
--- a/dbms/include/DB/Common/Volnitsky.h
+++ b/dbms/include/DB/Common/Volnitsky.h
@@ -2,7 +2,6 @@
 
 #include <stdint.h>
 #include <string.h>
-#include <algorithm>
 
 
 /** Поиск подстроки в строке по алгоритму Вольницкого:
@@ -40,6 +39,22 @@ private:
 
 	bool fallback;				/// Нужно ли использовать fallback алгоритм.
 
+	/// Fallback search: returns the first occurrence of needle in haystack, or haystack + haystack_size if there is none.
+	static const char * naive_memmem(const char * haystack, size_t haystack_size, const char * needle, size_t needle_size)
+	{
+		if (needle_size == 0)	/// An empty needle matches at the very start (as std::search did); never read needle[0].
+			return haystack;
+		const char * pos = haystack;
+		const char * const end = haystack + haystack_size;
+		/// Jump to each candidate first byte; compare sizes (not pointers) so that nothing past 'end' is ever formed.
+		while (nullptr != (pos = reinterpret_cast<const char *>(memchr(pos, needle[0], end - pos))) && needle_size <= static_cast<size_t>(end - pos))
+		{
+			if (0 == memcmp(pos, needle, needle_size))
+				return pos;
+			++pos;	/// Not a match here; continue from the next byte.
+		}
+		return end;
+	}
+
 public:
 	/** haystack_size_hint - ожидаемый суммарный размер haystack при вызовах search. Можно не указывать.
 	  * Если указать его достаточно маленьким, то будет использован fallback алгоритм,
@@ -83,8 +98,7 @@ public:
 		}
 		if (fallback || haystack_size <= needle_size)
 		{
-			/// Как ни странно, std::search работает намного быстрее memmem из eglibc.
-			return std::search(haystack, haystack_end, needle, needle_end);
+			return naive_memmem(haystack, haystack_size, needle, needle_size);
 		}
 
 		/// Будем "прикладывать" needle к haystack и сравнивать n-грам из конца needle.
@@ -106,7 +120,7 @@ public:
 		}
 
 		/// Оставшийся хвостик.
-		return std::search(pos - step + 1, haystack_end, needle, needle_end);
+		return naive_memmem(pos - step + 1, haystack_end - (pos - step + 1), needle, needle_size);
 	}
 
 	const unsigned char * search(const unsigned char * haystack, size_t haystack_size) const

From f3f935e3d8425dc75bda34a49268d383f9645a43 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Thu, 26 Mar 2015 05:54:16 +0300
Subject: [PATCH 29/55] dbms: toDateTime: improvement [#METR-2944].

---
 dbms/include/DB/Functions/FunctionsConversion.h | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/dbms/include/DB/Functions/FunctionsConversion.h b/dbms/include/DB/Functions/FunctionsConversion.h
index 4c6b41b3545..985bcf5ae97 100644
--- a/dbms/include/DB/Functions/FunctionsConversion.h
+++ b/dbms/include/DB/Functions/FunctionsConversion.h
@@ -134,7 +134,7 @@ struct ConvertImpl<DataTypeDateTime, DataTypeDate, Name>
 };
 
 
-/** Отдельный случай для преобразования UInt32 или UInt64 в Date.
+/** Отдельный случай для преобразования (U)Int32 или (U)Int64 в Date.
   * Если число меньше 65536, то оно понимается, как DayNum, а если больше - как unix timestamp.
   * Немного нелогично, что мы, по сути, помещаем две разные функции в одну.
   * Но зато это позволяет поддержать распространённый случай,
@@ -142,7 +142,7 @@ struct ConvertImpl<DataTypeDateTime, DataTypeDate, Name>
   *  (иначе такое использование было бы распространённой ошибкой).
   */
 template <typename FromDataType, typename Name>
-struct ConvertImplUInt32Or64ToDate
+struct ConvertImpl32Or64ToDate
 {
 	typedef typename FromDataType::FieldType FromFieldType;
 	typedef DataTypeDate::FieldType ToFieldType;
@@ -186,11 +186,10 @@ struct ConvertImplUInt32Or64ToDate
 	}
 };
 
-template <typename Name>
-struct ConvertImpl<DataTypeUInt32, DataTypeDate, Name> : ConvertImplUInt32Or64ToDate<DataTypeUInt32, Name> {};
-
-template <typename Name>
-struct ConvertImpl<DataTypeUInt64, DataTypeDate, Name> : ConvertImplUInt32Or64ToDate<DataTypeUInt64, Name> {};
+template <typename Name> struct ConvertImpl<DataTypeUInt32, DataTypeDate, Name> : ConvertImpl32Or64ToDate<DataTypeUInt32, Name> {};
+template <typename Name> struct ConvertImpl<DataTypeUInt64, DataTypeDate, Name> : ConvertImpl32Or64ToDate<DataTypeUInt64, Name> {};
+template <typename Name> struct ConvertImpl<DataTypeInt32, DataTypeDate, Name> : ConvertImpl32Or64ToDate<DataTypeInt32, Name> {};
+template <typename Name> struct ConvertImpl<DataTypeInt64, DataTypeDate, Name> : ConvertImpl32Or64ToDate<DataTypeInt64, Name> {};
 
 
 /** Преобразование чисел, дат, дат-с-временем в строки: через форматирование.

From 732a80d833f94d9b603465168a8edf4634581c9b Mon Sep 17 00:00:00 2001
From: Alexey Arno <af-arno@yandex-team.ru>
Date: Thu, 26 Mar 2015 17:53:35 +0300
Subject: [PATCH 30/55] dbms: Server: Fixed race condition in
 PoolWithFailoverBase in a more elegant way. [#METR-15531]

---
 dbms/include/DB/Client/ConnectionPoolWithFailover.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dbms/include/DB/Client/ConnectionPoolWithFailover.h b/dbms/include/DB/Client/ConnectionPoolWithFailover.h
index 4057cd51675..ad61ad5c262 100644
--- a/dbms/include/DB/Client/ConnectionPoolWithFailover.h
+++ b/dbms/include/DB/Client/ConnectionPoolWithFailover.h
@@ -98,11 +98,11 @@ private:
 		for (size_t i = 0; i < nested_pools.size(); ++i)
 		{
 			if (load_balancing == LoadBalancing::NEAREST_HOSTNAME)
-				nested_pools[i].priority = hostname_differences[i];
+				nested_pools[i].state.priority = hostname_differences[i];
 			else if (load_balancing == LoadBalancing::RANDOM)
-				nested_pools[i].priority = 0;
+				nested_pools[i].state.priority = 0;
 			else if (load_balancing == LoadBalancing::IN_ORDER)
-				nested_pools[i].priority = i;
+				nested_pools[i].state.priority = i;
 			else
 				throw Exception("Unknown load_balancing_mode: " + toString(static_cast<int>(load_balancing)), ErrorCodes::LOGICAL_ERROR);
 		}

From 72b2c6355744525579b5eab9b8d104cf690253b2 Mon Sep 17 00:00:00 2001
From: Andrey Mironov <hertz@yandex-team.ru>
Date: Thu, 26 Mar 2015 17:57:41 +0300
Subject: [PATCH 31/55] dbms: system.dictionaries: use ext::map for the purpose
 it was designed for. [#METR-15569]

---
 dbms/src/Storages/StorageSystemDictionaries.cpp | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/dbms/src/Storages/StorageSystemDictionaries.cpp b/dbms/src/Storages/StorageSystemDictionaries.cpp
index 47819fcd1e7..f32e147fb2f 100644
--- a/dbms/src/Storages/StorageSystemDictionaries.cpp
+++ b/dbms/src/Storages/StorageSystemDictionaries.cpp
@@ -10,6 +10,7 @@
 #include <DB/Dictionaries/IDictionary.h>
 #include <DB/Dictionaries/IDictionarySource.h>
 #include <DB/Dictionaries/DictionaryStructure.h>
+#include <statdaemons/ext/map.hpp>
 #include <mutex>
 
 namespace DB
@@ -87,15 +88,12 @@ BlockInputStreams StorageSystemDictionaries::read(
 		col_origin.column->insert(dict_info.second.second);
 
 		const auto & dict_struct = dict_ptr->getStructure();
-		Array attribute_names;
-		Array attribute_types;
-		for (const auto & attribute : dict_struct.attributes)
-		{
-			attribute_names.push_back(attribute.name);
-			attribute_types.push_back(attribute.type->getName());
-		}
-		col_attribute_names.column->insert(attribute_names);
-		col_attribute_types.column->insert(attribute_types);
+		col_attribute_names.column->insert(ext::map<Array>(dict_struct.attributes, [] (auto & attr) -> decltype(auto) {
+			return attr.name;
+		}));
+		col_attribute_types.column->insert(ext::map<Array>(dict_struct.attributes, [] (auto & attr) -> decltype(auto) {
+			return attr.type->getName();
+		}));
 		col_has_hierarchy.column->insert(UInt64{dict_ptr->hasHierarchy()});
 		col_bytes_allocated.column->insert(dict_ptr->getBytesAllocated());
 		col_hit_rate.column->insert(dict_ptr->getHitRate());

From bf4bd070651b29f90c9ae623ec63bc0765dbaa81 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Fri, 27 Mar 2015 02:32:16 +0300
Subject: [PATCH 32/55] dbms: tiny modifications [#METR-2944].

---
 dbms/src/Interpreters/tests/expression.cpp   | 10 ++++---
 dbms/src/Storages/StorageSystemDatabases.cpp |  3 +-
 dbms/src/Storages/StorageSystemEvents.cpp    |  9 ++++--
 dbms/src/Storages/StorageSystemNumbers.cpp   |  3 +-
 dbms/src/Storages/StorageSystemOne.cpp       |  3 +-
 dbms/src/Storages/StorageSystemParts.cpp     | 31 +++++++++++---------
 dbms/src/Storages/StorageSystemTables.cpp    | 18 +++++++-----
 7 files changed, 42 insertions(+), 35 deletions(-)

diff --git a/dbms/src/Interpreters/tests/expression.cpp b/dbms/src/Interpreters/tests/expression.cpp
index 35446a6b039..85396f665da 100644
--- a/dbms/src/Interpreters/tests/expression.cpp
+++ b/dbms/src/Interpreters/tests/expression.cpp
@@ -59,10 +59,12 @@ int main(int argc, char ** argv)
 		}
 
 		Context context;
-		NamesAndTypesList columns;
-		columns.emplace_back("x", new DataTypeInt16);
-		columns.emplace_back("s1", new DataTypeString);
-		columns.emplace_back("s2", new DataTypeString);
+		NamesAndTypesList columns
+		{
+			{"x", new DataTypeInt16},
+			{"s1", new DataTypeString},
+			{"s2", new DataTypeString}
+		};
 		context.setColumns(columns);
 
 		ExpressionAnalyzer analyzer(ast, context, context.getColumns());
diff --git a/dbms/src/Storages/StorageSystemDatabases.cpp b/dbms/src/Storages/StorageSystemDatabases.cpp
index c9f3d02d37c..9d61eda4287 100644
--- a/dbms/src/Storages/StorageSystemDatabases.cpp
+++ b/dbms/src/Storages/StorageSystemDatabases.cpp
@@ -9,9 +9,8 @@ namespace DB
 
 
 StorageSystemDatabases::StorageSystemDatabases(const std::string & name_)
-	: name(name_)
+	: name(name_), columns{{"name", new DataTypeString}}
 {
-	columns.emplace_back("name", new DataTypeString);
 }
 
 StoragePtr StorageSystemDatabases::create(const std::string & name_)
diff --git a/dbms/src/Storages/StorageSystemEvents.cpp b/dbms/src/Storages/StorageSystemEvents.cpp
index a2532340fe3..0f103e066f7 100644
--- a/dbms/src/Storages/StorageSystemEvents.cpp
+++ b/dbms/src/Storages/StorageSystemEvents.cpp
@@ -11,10 +11,13 @@ namespace DB
 
 
 StorageSystemEvents::StorageSystemEvents(const std::string & name_)
-	: name(name_)
+	: name(name_),
+	columns
+	{
+		{"event", 		new DataTypeString},
+		{"value",		new DataTypeUInt64},
+	}
 {
-	columns.emplace_back("event", 		new DataTypeString);
-	columns.emplace_back("value",		new DataTypeUInt64);
 }
 
 StoragePtr StorageSystemEvents::create(const std::string & name_)
diff --git a/dbms/src/Storages/StorageSystemNumbers.cpp b/dbms/src/Storages/StorageSystemNumbers.cpp
index 26ed58b9ba7..3a41abc363f 100644
--- a/dbms/src/Storages/StorageSystemNumbers.cpp
+++ b/dbms/src/Storages/StorageSystemNumbers.cpp
@@ -54,9 +54,8 @@ private:
 
 
 StorageSystemNumbers::StorageSystemNumbers(const std::string & name_, bool multithreaded_)
-	: name(name_), multithreaded(multithreaded_)
+	: name(name_), columns{{"number", new DataTypeUInt64}}, multithreaded(multithreaded_)
 {
-	columns.emplace_back("number", new DataTypeUInt64);
 }
 
 StoragePtr StorageSystemNumbers::create(const std::string & name_, bool multithreaded_)
diff --git a/dbms/src/Storages/StorageSystemOne.cpp b/dbms/src/Storages/StorageSystemOne.cpp
index 27052985ba4..6ddcce51843 100644
--- a/dbms/src/Storages/StorageSystemOne.cpp
+++ b/dbms/src/Storages/StorageSystemOne.cpp
@@ -12,9 +12,8 @@ namespace DB
 
 
 StorageSystemOne::StorageSystemOne(const std::string & name_)
-	: name(name_)
+	: name(name_), columns{{"dummy", new DataTypeUInt8}}
 {
-	columns.emplace_back("dummy", new DataTypeUInt8);
 }
 
 StoragePtr StorageSystemOne::create(const std::string & name_)
diff --git a/dbms/src/Storages/StorageSystemParts.cpp b/dbms/src/Storages/StorageSystemParts.cpp
index 5606dfbeff6..cfee368a9ab 100644
--- a/dbms/src/Storages/StorageSystemParts.cpp
+++ b/dbms/src/Storages/StorageSystemParts.cpp
@@ -14,21 +14,24 @@ namespace DB
 
 
 StorageSystemParts::StorageSystemParts(const std::string & name_)
-	: name(name_)
-{
-	columns.emplace_back("partition", 			new DataTypeString);
-	columns.emplace_back("name", 				new DataTypeString);
-	columns.emplace_back("replicated",			new DataTypeUInt8);
-	columns.emplace_back("active",				new DataTypeUInt8);
-	columns.emplace_back("marks",				new DataTypeUInt64);
-	columns.emplace_back("bytes",				new DataTypeUInt64);
-	columns.emplace_back("modification_time",	new DataTypeDateTime);
-	columns.emplace_back("remove_time",			new DataTypeDateTime);
-	columns.emplace_back("refcount",			new DataTypeUInt32);
+	: name(name_),
+	columns
+	{
+		{"partition", 			new DataTypeString},
+		{"name", 				new DataTypeString},
+		{"replicated",			new DataTypeUInt8},
+		{"active",				new DataTypeUInt8},
+		{"marks",				new DataTypeUInt64},
+		{"bytes",				new DataTypeUInt64},
+		{"modification_time",	new DataTypeDateTime},
+		{"remove_time",			new DataTypeDateTime},
+		{"refcount",			new DataTypeUInt32},
 
-	columns.emplace_back("database", 			new DataTypeString);
-	columns.emplace_back("table", 				new DataTypeString);
-	columns.emplace_back("engine", 				new DataTypeString);
+		{"database", 			new DataTypeString},
+		{"table", 				new DataTypeString},
+		{"engine", 				new DataTypeString},
+	}
+{
 }
 
 StoragePtr StorageSystemParts::create(const std::string & name_)
diff --git a/dbms/src/Storages/StorageSystemTables.cpp b/dbms/src/Storages/StorageSystemTables.cpp
index f0b2bc65407..dc5c78d2558 100644
--- a/dbms/src/Storages/StorageSystemTables.cpp
+++ b/dbms/src/Storages/StorageSystemTables.cpp
@@ -10,11 +10,14 @@ namespace DB
 
 
 StorageSystemTables::StorageSystemTables(const std::string & name_)
-	: name(name_)
+	: name(name_),
+	columns
+	{
+		{"database", 	new DataTypeString},
+		{"name", 		new DataTypeString},
+		{"engine", 		new DataTypeString},
+	}
 {
-	columns.emplace_back("database", 	new DataTypeString);
-	columns.emplace_back("name", 		new DataTypeString);
-	columns.emplace_back("engine", 		new DataTypeString);
 }
 
 StoragePtr StorageSystemTables::create(const std::string & name_)
@@ -32,10 +35,9 @@ static ColumnWithNameAndType getFilteredDatabases(ASTPtr query, const Context &
 
 	Block block;
 	block.insert(column);
-	for (auto database_it = context.getDatabases().begin(); database_it != context.getDatabases().end(); ++database_it)
-	{
-		column.column->insert(database_it->first);
-	}
+	for (const auto & db : context.getDatabases())	/// By reference: only the key (database name) is read, so do not copy each element.
+		column.column->insert(db.first);
+
 	VirtualColumnUtils::filterBlockWithQuery(query, block, context);
 
 	return block.getByPosition(0);

From 75dc9db388ab64f4deb73475b5e074c6898c10f6 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Fri, 27 Mar 2015 02:48:50 +0300
Subject: [PATCH 33/55] dbms: ALTER: writing to log just before waiting for
 lock [#METR-2944].

---
 dbms/src/Storages/StorageReplicatedMergeTree.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp
index f99732892d9..9024008b801 100644
--- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp
@@ -1467,6 +1467,8 @@ void StorageReplicatedMergeTree::alterThread()
 			/// Если описание столбцов изменилось, обновим структуру таблицы локально.
 			if (changed_version)
 			{
+				LOG_INFO(log, "Changed version of 'columns' node in ZooKeeper. Waiting for structure write lock.");
+
 				auto table_lock = lockStructureForAlter();
 
 				const auto columns_changed = columns != data.getColumnsListNonMaterialized();

From 74dc65197a72b66a5b1442fd9c721be5dfadb450 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Fri, 27 Mar 2015 02:51:48 +0300
Subject: [PATCH 34/55] dbms: tiny modification [#METR-2944].

---
 dbms/src/Storages/tests/pk_condition.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/dbms/src/Storages/tests/pk_condition.cpp b/dbms/src/Storages/tests/pk_condition.cpp
index b67b41d2484..b698d1470c6 100644
--- a/dbms/src/Storages/tests/pk_condition.cpp
+++ b/dbms/src/Storages/tests/pk_condition.cpp
@@ -31,8 +31,7 @@ int main(int argc, const char ** argv)
 		return 1;
 	}
 	Context context;
-	NamesAndTypesList columns;
-	columns.emplace_back("key", new DataTypeUInt64);
+	NamesAndTypesList columns{{"key", new DataTypeUInt64}};
 	SortDescription sort_descr;
 	sort_descr.push_back(SortColumnDescription("key", 1));
 

From 048eed942434159827d8b475c5cd05feaf8ff2e6 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Fri, 27 Mar 2015 05:12:45 +0300
Subject: [PATCH 35/55] dbms: improved performance of LIKE and regexp search
 [#METR-15690].

---
 .../DB/Functions/FunctionsStringSearch.h      | 92 ++++++++++++++++++-
 .../queries/0_stateless/00139_like.reference  |  3 +
 dbms/tests/queries/0_stateless/00139_like.sql |  5 +
 3 files changed, 95 insertions(+), 5 deletions(-)
 create mode 100644 dbms/tests/queries/0_stateless/00139_like.reference
 create mode 100644 dbms/tests/queries/0_stateless/00139_like.sql

diff --git a/dbms/include/DB/Functions/FunctionsStringSearch.h b/dbms/include/DB/Functions/FunctionsStringSearch.h
index ffc95a8928d..50c65394f47 100644
--- a/dbms/include/DB/Functions/FunctionsStringSearch.h
+++ b/dbms/include/DB/Functions/FunctionsStringSearch.h
@@ -160,11 +160,16 @@ struct PositionUTF8Impl
 /// Переводит выражение LIKE в regexp re2. Например, abc%def -> ^abc.*def$
 inline String likePatternToRegexp(const String & pattern)
 {
-	String res = "^";
+	String res;
 	res.reserve(pattern.size() * 2);
 	const char * pos = pattern.data();
 	const char * end = pos + pattern.size();
 
+	if (pos < end && *pos == '%')
+		++pos;
+	else
+		res = "^";
+
 	while (pos < end)
 	{
 		switch (*pos)
@@ -174,7 +179,10 @@ inline String likePatternToRegexp(const String & pattern)
 				res += *pos;
 				break;
 			case '%':
-				res += ".*";
+				if (pos + 1 != end)
+					res += ".*";
+				else
+					return res;
 				break;
 			case '_':
 				res += ".";
@@ -347,6 +355,7 @@ struct MatchImpl
 			/// Текущий индекс в массиве строк.
 			size_t i = 0;
 
+			/// TODO Надо сделать так, чтобы searcher был общим на все вызовы функции.
 			Volnitsky searcher(strstr_pattern.data(), strstr_pattern.size(), end - pos);
 
 			/// Искать будем следующее вхождение сразу во всех строках.
@@ -369,14 +378,87 @@ struct MatchImpl
 				++i;
 			}
 
+			/// Хвостик, в котором не может быть подстрок.
 			memset(&res[i], revert, (res.size() - i) * sizeof(res[0]));
 		}
 		else
 		{
-			const auto & regexp = Regexps::get<like, true>(pattern);
 			size_t size = offsets.size();
-			for (size_t i = 0; i < size; ++i)
-				res[i] = revert ^ regexp->match(reinterpret_cast<const char *>(&data[i != 0 ? offsets[i - 1] : 0]), (i != 0 ? offsets[i] - offsets[i - 1] : offsets[0]) - 1);
+
+			const auto & regexp = Regexps::get<like, true>(pattern);
+
+			std::string required_substring;
+			bool is_trivial;
+			bool required_substring_is_prefix;	/// для anchored выполнения регекспа.
+
+			regexp->getAnalyzeResult(required_substring, is_trivial, required_substring_is_prefix);
+
+			if (required_substring.empty())
+			{
+				size_t prev_offset = 0;
+				for (size_t i = 0; i < size; ++i)
+				{
+					res[i] = revert ^ regexp->getRE2()->Match(
+						re2_st::StringPiece(reinterpret_cast<const char *>(&data[prev_offset]), offsets[i] - prev_offset - 1),
+						0, offsets[i] - prev_offset - 1, re2_st::RE2::UNANCHORED, nullptr, 0);
+
+					prev_offset = offsets[i];
+				}
+			}
+			else
+			{
+				/// NOTE This is almost the same as the likePatternIsStrstr case.
+
+				const UInt8 * begin = &data[0];
+				const UInt8 * pos = begin;
+				const UInt8 * end = pos + data.size();
+
+				/// Current index in the array of strings.
+				size_t i = 0;
+
+				Volnitsky searcher(required_substring.data(), required_substring.size(), end - pos);
+
+				/// We will search for the next occurrence in all strings at once.
+				while (pos < end && end != (pos = searcher.search(pos, end - pos)))
+				{
+					/// Determine which index it belongs to.
+					while (begin + offsets[i] < pos)
+					{
+						res[i] = revert;
+						++i;
+					}
+
+					/// Check that the occurrence of required_substring (the string searched for here) does not cross string boundaries.
+					if (pos + required_substring.size() < begin + offsets[i])
+					{
+						/// And if it does not - check with the regexp, if necessary.
+
+						if (is_trivial)
+							res[i] = !revert;
+						else
+						{
+							const char * str_data = reinterpret_cast<const char *>(&data[i != 0 ? offsets[i - 1] : 0]);
+							size_t str_size = (i != 0 ? offsets[i] - offsets[i - 1] : offsets[0]) - 1;
+
+							if (required_substring_is_prefix)
+								res[i] = revert ^ regexp->getRE2()->Match(
+									re2_st::StringPiece(str_data, str_size),
+									reinterpret_cast<const char *>(pos) - str_data, str_size, re2_st::RE2::ANCHOR_START, nullptr, 0);
+							else
+								res[i] = revert ^ regexp->getRE2()->Match(
+									re2_st::StringPiece(str_data, str_size),
+									0, str_size, re2_st::RE2::UNANCHORED, nullptr, 0);
+						}
+					}
+					else
+						res[i] = revert;
+
+					pos = begin + offsets[i];
+					++i;
+				}
+
+				memset(&res[i], revert, (res.size() - i) * sizeof(res[0]));
+			}
 		}
 	}
 
diff --git a/dbms/tests/queries/0_stateless/00139_like.reference b/dbms/tests/queries/0_stateless/00139_like.reference
new file mode 100644
index 00000000000..4f8632b42a7
--- /dev/null
+++ b/dbms/tests/queries/0_stateless/00139_like.reference
@@ -0,0 +1,3 @@
+79628
+79628
+102851
diff --git a/dbms/tests/queries/0_stateless/00139_like.sql b/dbms/tests/queries/0_stateless/00139_like.sql
new file mode 100644
index 00000000000..ccc195bc81d
--- /dev/null
+++ b/dbms/tests/queries/0_stateless/00139_like.sql
@@ -0,0 +1,5 @@
+/* Заметим, что запросы написаны так, как будто пользователь не понимает смысл символа _ в LIKE выражении. */
+SELECT count() FROM test.hits WHERE URL LIKE '%/avtomobili_s_probegom/_%__%__%__%';
+SELECT count() FROM test.hits WHERE URL LIKE '/avtomobili_s_probegom/_%__%__%__%';
+SELECT count() FROM test.hits WHERE URL LIKE '%_/avtomobili_s_probegom/_%__%__%__%';
+SELECT count() FROM test.hits WHERE URL LIKE '%avtomobili%';

From cfc5b313cc046b72b28c7e4d688fdeb6e978c1fe Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Fri, 27 Mar 2015 06:06:06 +0300
Subject: [PATCH 36/55] dbms: fixed 'force_index_by_date' setting
 [#METR-15484].

---
 .../DB/Storages/MergeTree/PKCondition.h       | 80 ++++++++++---------
 .../MergeTree/MergeTreeDataSelectExecutor.cpp |  4 +-
 dbms/src/Storages/MergeTree/PKCondition.cpp   | 60 ++++++++++++--
 3 files changed, 99 insertions(+), 45 deletions(-)

diff --git a/dbms/include/DB/Storages/MergeTree/PKCondition.h b/dbms/include/DB/Storages/MergeTree/PKCondition.h
index 7f1bdc7599f..4baa9cd76f0 100644
--- a/dbms/include/DB/Storages/MergeTree/PKCondition.h
+++ b/dbms/include/DB/Storages/MergeTree/PKCondition.h
@@ -124,7 +124,7 @@ struct Range
 private:
 	static bool equals(const Field & lhs, const Field & rhs) { return apply_visitor(FieldVisitorAccurateEquals(), lhs, rhs); }
 	static bool less(const Field & lhs, const Field & rhs) { return apply_visitor(FieldVisitorAccurateLess(), lhs, rhs); }
-	
+
 public:
 	Field left;				/// левая граница, если есть
 	Field right;			/// правая граница, если есть
@@ -132,17 +132,17 @@ public:
 	bool right_bounded; 	/// ограничен ли справа
 	bool left_included; 	/// включает левую границу, если есть
 	bool right_included;	/// включает правую границу, если есть
-	
+
 	/// Всё множество.
 	Range() : left(), right(), left_bounded(false), right_bounded(false), left_included(false), right_included(false) {}
-	
+
 	/// Одна точка.
 	Range(const Field & point) : left(point), right(point), left_bounded(true), right_bounded(true), left_included(true), right_included(true) {}
-	
+
 	/// Ограниченный с двух сторон диапазон.
 	Range(const Field & left_, bool left_included_, const Field & right_, bool right_included_)
 		: left(left_), right(right_), left_bounded(true), right_bounded(true), left_included(left_included_), right_included(right_included_) {}
-	
+
 	static Range createRightBounded(const Field & right_point, bool right_included)
 	{
 		Range r;
@@ -151,7 +151,7 @@ public:
 		r.right_included = right_included;
 		return r;
 	}
-	
+
 	static Range createLeftBounded(const Field & left_point, bool left_included)
 	{
 		Range r;
@@ -160,7 +160,7 @@ public:
 		r.left_included = left_included;
 		return r;
 	}
-	
+
 	/// Установить левую границу.
 	void setLeft(const Field & point, bool included)
 	{
@@ -168,7 +168,7 @@ public:
 		left_bounded = true;
 		left_included = included;
 	}
-	
+
 	/// Установить правую границу.
 	void setRight(const Field & point, bool included)
 	{
@@ -176,13 +176,13 @@ public:
 		right_bounded = true;
 		right_included = included;
 	}
-	
+
 	/// x входит в range
 	bool contains(const Field & x) const
 	{
 		return !leftThan(x) && !rightThan(x);
 	}
-	
+
 	/// x находится левее
 	bool rightThan(const Field & x) const
 	{
@@ -190,7 +190,7 @@ public:
 			? !(less(left, x) || (left_included && equals(x, left)))
 			: false);
 	}
-	
+
 	/// x находится правее
 	bool leftThan(const Field & x) const
 	{
@@ -198,7 +198,7 @@ public:
 			? !(less(x, right) || (right_included && equals(x, right)))
 			: false);
 	}
-	
+
 	bool intersectsRange(const Range & r) const
 	{
 		/// r левее меня.
@@ -219,7 +219,7 @@ public:
 
 		return true;
 	}
-	
+
 	bool containsRange(const Range & r) const
 	{
 		/// r начинается левее меня.
@@ -242,49 +242,57 @@ public:
 
 		return true;
 	}
-	
+
 	String toString() const
 	{
 		std::stringstream str;
-		
+
 		if (!left_bounded)
 			str << "(-inf, ";
 		else
 			str << (left_included ? '[' : '(') << apply_visitor(FieldVisitorToString(), left) << ", ";
-		
+
 		if (!right_bounded)
 			str << "+inf)";
 		else
 			str << apply_visitor(FieldVisitorToString(), right) << (right_included ? ']' : ')');
-		
+
 		return str.str();
 	}
 };
 
 class ASTSet;
+
+
+/** Condition on the index.
+  *
+  * Consists of conditions for the key belonging to all possible ranges or sets,
+  *  as well as logical connectives AND/OR/NOT over these conditions.
+  *
+  * Builds a reverse polish notation from these conditions
+  *  and can evaluate (interpret) its satisfiability over ranges of the key.
+  */
 class PKCondition
 {
 public:
 	/// Не учитывает секцию SAMPLE. all_columns - набор всех столбцов таблицы.
 	PKCondition(ASTPtr query, const Context & context, const NamesAndTypesList & all_columns, const SortDescription & sort_descr);
-	
+
 	/// Выполнимо ли условие в диапазоне ключей.
 	/// left_pk и right_pk должны содержать все поля из sort_descr в соответствующем порядке.
 	bool mayBeTrueInRange(const Field * left_pk, const Field * right_pk);
-	
+
 	/// Выполнимо ли условие в полубесконечном (не ограниченном справа) диапазоне ключей.
 	/// left_pk должен содержать все поля из sort_descr в соответствующем порядке.
 	bool mayBeTrueAfter(const Field * left_pk);
-	
-	bool alwaysTrue()
-	{
-		return rpn.size() == 1 && rpn[0].function == RPNElement::FUNCTION_UNKNOWN;
-	}
-	
+
+	/// Checks that the index cannot be used.
+	bool alwaysUnknown();
+
 	/// Наложить дополнительное условие: значение в столбце column должно быть в диапазоне range.
 	/// Возвращает, есть ли такой столбец в первичном ключе.
 	bool addCondition(const String & column, const Range & range);
-	
+
 	String toString();
 private:
 	/// Выражение хранится в виде обратной польской строки (Reverse Polish Notation).
@@ -303,37 +311,37 @@ private:
 			FUNCTION_AND,
 			FUNCTION_OR,
 		};
-		
+
 		RPNElement() {}
 		RPNElement(Function function_) : function(function_) {}
 		RPNElement(Function function_, size_t key_column_) : function(function_), key_column(key_column_) {}
 		RPNElement(Function function_, size_t key_column_, const Range & range_)
 			: function(function_), range(range_), key_column(key_column_) {}
-		
+
 		String toString();
-		
+
 		Function function;
-		
+
 		/// Для FUNCTION_IN_RANGE и FUNCTION_NOT_IN_RANGE.
 		Range range;
 		size_t key_column;
 		/// Для FUNCTION_IN_SET
 		ASTPtr in_function;
-		
+
 		ASTSet * inFunctionToSet();
 	};
-	
+
 	typedef std::vector<RPNElement> RPN;
 	typedef std::map<String, size_t> ColumnIndices;
-	
+
 	bool mayBeTrueInRange(const Field * left_pk, const Field * right_pk, bool right_bounded);
-	
+
 	void traverseAST(ASTPtr & node, Block & block_with_constants);
 	bool atomFromAST(ASTPtr & node, Block & block_with_constants, RPNElement & out);
 	bool operatorFromAST(ASTFunction * func, RPNElement & out);
-	
+
 	RPN rpn;
-	
+
 	SortDescription sort_descr;
 	ColumnIndices pk_columns;
 };
diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/dbms/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
index 1ddd78bb324..d9c873354ae 100644
--- a/dbms/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
+++ b/dbms/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
@@ -83,7 +83,7 @@ BlockInputStreams MergeTreeDataSelectExecutor::read(
 	PKCondition key_condition(query, context, data.getColumnsList(), data.getSortDescription());
 	PKCondition date_condition(query, context, data.getColumnsList(), SortDescription(1, SortColumnDescription(data.date_column_name, 1)));
 
-	if (settings.force_index_by_date && date_condition.alwaysTrue())
+	if (settings.force_index_by_date && date_condition.alwaysUnknown())
 		throw Exception("Index by date is not used and setting 'force_index_by_date' is set.", ErrorCodes::INDEX_NOT_USED);
 
 	/// Выберем куски, в которых могут быть данные, удовлетворяющие date_condition, и которые подходят под условие на _part.
@@ -556,7 +556,7 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPkRange(
 	size_t marks_count = index.size() / key_size;
 
 	/// Если индекс не используется.
-	if (key_condition.alwaysTrue())
+	if (key_condition.alwaysUnknown())
 	{
 		res.push_back(MarkRange(0, marks_count));
 	}
diff --git a/dbms/src/Storages/MergeTree/PKCondition.cpp b/dbms/src/Storages/MergeTree/PKCondition.cpp
index d3108426ad5..c46df1f1223 100644
--- a/dbms/src/Storages/MergeTree/PKCondition.cpp
+++ b/dbms/src/Storages/MergeTree/PKCondition.cpp
@@ -267,7 +267,7 @@ bool PKCondition::mayBeTrueInRange(const Field * left_pk, const Field * right_pk
 		RPNElement & element = rpn[i];
 		if (element.function == RPNElement::FUNCTION_UNKNOWN)
 		{
-			rpn_stack.push_back(BoolMask(true, true));
+			rpn_stack.emplace_back(true, true);
 		}
 		else if (element.function == RPNElement::FUNCTION_NOT_IN_RANGE || element.function == RPNElement::FUNCTION_IN_RANGE)
 		{
@@ -275,7 +275,7 @@ bool PKCondition::mayBeTrueInRange(const Field * left_pk, const Field * right_pk
 			bool intersects = element.range.intersectsRange(key_range);
 			bool contains = element.range.containsRange(key_range);
 
-			rpn_stack.push_back(BoolMask(intersects, !contains));
+			rpn_stack.emplace_back(intersects, !contains);
 			if (element.function == RPNElement::FUNCTION_NOT_IN_RANGE)
 				rpn_stack.back() = !rpn_stack.back();
 		}
@@ -294,7 +294,7 @@ bool PKCondition::mayBeTrueInRange(const Field * left_pk, const Field * right_pk
 			}
 			else
 			{
-				throw DB::Exception("Set for IN is not created yet!");
+				throw DB::Exception("Set for IN is not created yet!", ErrorCodes::LOGICAL_ERROR);
 			}
 		}
 		else if (element.function == RPNElement::FUNCTION_NOT)
@@ -303,16 +303,16 @@ bool PKCondition::mayBeTrueInRange(const Field * left_pk, const Field * right_pk
 		}
 		else if (element.function == RPNElement::FUNCTION_AND)
 		{
-			BoolMask arg1 = rpn_stack.back();
+			auto arg1 = rpn_stack.back();
 			rpn_stack.pop_back();
-			BoolMask arg2 = rpn_stack.back();
+			auto arg2 = rpn_stack.back();
 			rpn_stack.back() = arg1 & arg2;
 		}
 		else if (element.function == RPNElement::FUNCTION_OR)
 		{
-			BoolMask arg1 = rpn_stack.back();
+			auto arg1 = rpn_stack.back();
 			rpn_stack.pop_back();
-			BoolMask arg2 = rpn_stack.back();
+			auto arg2 = rpn_stack.back();
 			rpn_stack.back() = arg1 | arg2;
 		}
 		else
@@ -374,4 +374,50 @@ String PKCondition::RPNElement::toString()
 			return "ERROR";
 	}
 }
+
+
+bool PKCondition::alwaysUnknown()
+{
+	std::vector<UInt8> rpn_stack;
+
+	for (size_t i = 0; i < rpn.size(); ++i)
+	{
+		RPNElement & element = rpn[i];
+
+		if (element.function == RPNElement::FUNCTION_UNKNOWN)
+		{
+			rpn_stack.push_back(true);
+		}
+		else if (element.function == RPNElement::FUNCTION_NOT_IN_RANGE || element.function == RPNElement::FUNCTION_IN_RANGE
+			|| element.function == RPNElement::FUNCTION_IN_SET || element.function == RPNElement::FUNCTION_NOT_IN_SET)
+		{
+			rpn_stack.push_back(false);
+		}
+		else if (element.function == RPNElement::FUNCTION_NOT)
+		{
+			/// Negation does not change whether the operand is unknown, so the top of the stack stays as is.
+		}
+		else if (element.function == RPNElement::FUNCTION_AND)
+		{
+			/// AND is unknown only if both operands are unknown; OR (below) is unknown if at least one is.
+			const auto arg = rpn_stack.back();
+			rpn_stack.pop_back();
+			rpn_stack.back() &= arg;
+		}
+		else if (element.function == RPNElement::FUNCTION_OR)
+		{
+			const auto arg = rpn_stack.back();
+			rpn_stack.pop_back();
+			rpn_stack.back() |= arg;
+		}
+		else
+			throw Exception("Unexpected function type in PKCondition::RPNElement", ErrorCodes::LOGICAL_ERROR);
+	}
+
+	if (rpn_stack.size() != 1)
+		throw Exception("Unexpected stack size in PKCondition::alwaysUnknown", ErrorCodes::LOGICAL_ERROR);
+	return rpn_stack[0];
+}
+
+
 }

From 06c35e0faa9a4e3bc246204ddcff238b3bbcfe1c Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Fri, 27 Mar 2015 06:37:46 +0300
Subject: [PATCH 37/55] dbms: improvement [#METR-15484].

---
 dbms/include/DB/Interpreters/Set.h            |  4 +-
 .../DB/Storages/MergeTree/PKCondition.h       | 14 +++----
 dbms/src/Interpreters/Set.cpp                 | 16 ++++----
 dbms/src/Storages/MergeTree/PKCondition.cpp   | 38 +++++++++----------
 4 files changed, 37 insertions(+), 35 deletions(-)

diff --git a/dbms/include/DB/Interpreters/Set.h b/dbms/include/DB/Interpreters/Set.h
index 4b45f73ae96..4b6bfce1d75 100644
--- a/dbms/include/DB/Interpreters/Set.h
+++ b/dbms/include/DB/Interpreters/Set.h
@@ -293,7 +293,7 @@ public:
 	  */
 	void execute(Block & block, const ColumnNumbers & arguments, size_t result, bool negative) const;
 
-	std::string describe()
+	std::string describe() const
 	{
 		if (!ordered_set_elements)
 			return "{}";
@@ -312,7 +312,7 @@ public:
 	}
 
 	/// проверяет есть ли в Set элементы для заданного диапазона индекса
-	BoolMask mayBeTrueInRange(const Range & range);
+	BoolMask mayBeTrueInRange(const Range & range) const;
 
 	size_t getTotalRowCount() const { return data.getTotalRowCount(); }
 	size_t getTotalByteCount() const { return data.getTotalByteCount(); }
diff --git a/dbms/include/DB/Storages/MergeTree/PKCondition.h b/dbms/include/DB/Storages/MergeTree/PKCondition.h
index 4baa9cd76f0..838d762cc96 100644
--- a/dbms/include/DB/Storages/MergeTree/PKCondition.h
+++ b/dbms/include/DB/Storages/MergeTree/PKCondition.h
@@ -280,20 +280,20 @@ public:
 
 	/// Выполнимо ли условие в диапазоне ключей.
 	/// left_pk и right_pk должны содержать все поля из sort_descr в соответствующем порядке.
-	bool mayBeTrueInRange(const Field * left_pk, const Field * right_pk);
+	bool mayBeTrueInRange(const Field * left_pk, const Field * right_pk) const;
 
 	/// Выполнимо ли условие в полубесконечном (не ограниченном справа) диапазоне ключей.
 	/// left_pk должен содержать все поля из sort_descr в соответствующем порядке.
-	bool mayBeTrueAfter(const Field * left_pk);
+	bool mayBeTrueAfter(const Field * left_pk) const;
 
 	/// Проверяет, что индекс не может быть использован.
-	bool alwaysUnknown();
+	bool alwaysUnknown() const;
 
 	/// Наложить дополнительное условие: значение в столбце column должно быть в диапазоне range.
 	/// Возвращает, есть ли такой столбец в первичном ключе.
 	bool addCondition(const String & column, const Range & range);
 
-	String toString();
+	String toString() const;
 private:
 	/// Выражение хранится в виде обратной польской строки (Reverse Polish Notation).
 	struct RPNElement
@@ -318,7 +318,7 @@ private:
 		RPNElement(Function function_, size_t key_column_, const Range & range_)
 			: function(function_), range(range_), key_column(key_column_) {}
 
-		String toString();
+		String toString() const;
 
 		Function function;
 
@@ -328,13 +328,13 @@ private:
 		/// Для FUNCTION_IN_SET
 		ASTPtr in_function;
 
-		ASTSet * inFunctionToSet();
+		const ASTSet * inFunctionToSet() const;
 	};
 
 	typedef std::vector<RPNElement> RPN;
 	typedef std::map<String, size_t> ColumnIndices;
 
-	bool mayBeTrueInRange(const Field * left_pk, const Field * right_pk, bool right_bounded);
+	bool mayBeTrueInRange(const Field * left_pk, const Field * right_pk, bool right_bounded) const;
 
 	void traverseAST(ASTPtr & node, Block & block_with_constants);
 	bool atomFromAST(ASTPtr & node, Block & block_with_constants, RPNElement & out);
diff --git a/dbms/src/Interpreters/Set.cpp b/dbms/src/Interpreters/Set.cpp
index 768a0581ea5..8c4a9dc10f6 100644
--- a/dbms/src/Interpreters/Set.cpp
+++ b/dbms/src/Interpreters/Set.cpp
@@ -558,7 +558,7 @@ void Set::executeArray(const ColumnArray * key_column, ColumnUInt8::Container_t
 }
 
 
-BoolMask Set::mayBeTrueInRange(const Range & range)
+BoolMask Set::mayBeTrueInRange(const Range & range) const
 {
 	if (!ordered_set_elements)
 		throw DB::Exception("Ordered set in not created.");
@@ -588,7 +588,10 @@ BoolMask Set::mayBeTrueInRange(const Range & range)
 	}
 	else
 	{
-		auto left_it = range.left_bounded ? std::lower_bound(ordered_set_elements->begin(), ordered_set_elements->end(), left) : ordered_set_elements->begin();
+		auto left_it = range.left_bounded
+			? std::lower_bound(ordered_set_elements->begin(), ordered_set_elements->end(), left)
+			: ordered_set_elements->begin();
+
 		if (range.left_bounded && !range.left_included && left_it != ordered_set_elements->end() && *left_it == left)
 			++left_it;
 
@@ -599,7 +602,10 @@ BoolMask Set::mayBeTrueInRange(const Range & range)
 		}
 		else
 		{
-			auto right_it = range.right_bounded ? std::upper_bound(ordered_set_elements->begin(), ordered_set_elements->end(), right) : ordered_set_elements->end();
+			auto right_it = range.right_bounded
+				? std::upper_bound(ordered_set_elements->begin(), ordered_set_elements->end(), right)
+				: ordered_set_elements->end();
+
 			if (range.right_bounded && !range.right_included && right_it != ordered_set_elements->begin() && *(right_it--) == right)
 				--right_it;
 
@@ -613,13 +619,9 @@ BoolMask Set::mayBeTrueInRange(const Range & range)
 				--right_it;
 				/// в диапазон не попадает ни одного ключа из in
 				if (*right_it < *left_it)
-				{
 					can_be_true = false;
-				}
 				else
-				{
 					can_be_true = true;
-				}
 			}
 		}
 	}
diff --git a/dbms/src/Storages/MergeTree/PKCondition.cpp b/dbms/src/Storages/MergeTree/PKCondition.cpp
index c46df1f1223..ab680230d44 100644
--- a/dbms/src/Storages/MergeTree/PKCondition.cpp
+++ b/dbms/src/Storages/MergeTree/PKCondition.cpp
@@ -42,7 +42,7 @@ PKCondition::PKCondition(ASTPtr query, const Context & context_, const NamesAndT
 		if (select.prewhere_expression)
 		{
 			traverseAST(select.prewhere_expression, block_with_constants);
-			rpn.push_back(RPNElement(RPNElement::FUNCTION_AND));
+			rpn.emplace_back(RPNElement::FUNCTION_AND);
 		}
 	}
 	else if (select.prewhere_expression)
@@ -51,7 +51,7 @@ PKCondition::PKCondition(ASTPtr query, const Context & context_, const NamesAndT
 	}
 	else
 	{
-		rpn.push_back(RPNElement(RPNElement::FUNCTION_UNKNOWN));
+		rpn.emplace_back(RPNElement::FUNCTION_UNKNOWN);
 	}
 }
 
@@ -59,8 +59,8 @@ bool PKCondition::addCondition(const String & column, const Range & range)
 {
 	if (!pk_columns.count(column))
 		return false;
-	rpn.push_back(RPNElement(RPNElement::FUNCTION_IN_RANGE, pk_columns[column], range));
-	rpn.push_back(RPNElement(RPNElement::FUNCTION_AND));
+	rpn.emplace_back(RPNElement::FUNCTION_IN_RANGE, pk_columns[column], range);
+	rpn.emplace_back(RPNElement::FUNCTION_AND);
 	return true;
 }
 
@@ -224,7 +224,7 @@ bool PKCondition::operatorFromAST(ASTFunction * func, RPNElement & out)
 	return true;
 }
 
-String PKCondition::toString()
+String PKCondition::toString() const
 {
 	String res;
 	for (size_t i = 0; i < rpn.size(); ++i)
@@ -236,7 +236,7 @@ String PKCondition::toString()
 	return res;
 }
 
-bool PKCondition::mayBeTrueInRange(const Field * left_pk, const Field * right_pk, bool right_bounded)
+bool PKCondition::mayBeTrueInRange(const Field * left_pk, const Field * right_pk, bool right_bounded) const
 {
 	/// Найдем диапазоны элементов ключа.
 	std::vector<Range> key_ranges(sort_descr.size(), Range());
@@ -264,7 +264,7 @@ bool PKCondition::mayBeTrueInRange(const Field * left_pk, const Field * right_pk
 	std::vector<BoolMask> rpn_stack;
 	for (size_t i = 0; i < rpn.size(); ++i)
 	{
-		RPNElement & element = rpn[i];
+		const auto & element = rpn[i];
 		if (element.function == RPNElement::FUNCTION_UNKNOWN)
 		{
 			rpn_stack.emplace_back(true, true);
@@ -281,9 +281,9 @@ bool PKCondition::mayBeTrueInRange(const Field * left_pk, const Field * right_pk
 		}
 		else if (element.function == RPNElement::FUNCTION_IN_SET || element.function == RPNElement::FUNCTION_NOT_IN_SET)
 		{
-			ASTFunction * in_func = typeid_cast<ASTFunction *>(element.in_function.get());
-			ASTs & args = typeid_cast<ASTExpressionList &>(*in_func->arguments).children;
-			ASTSet * ast_set = typeid_cast<ASTSet *>(args[1].get());
+			auto in_func = typeid_cast<const ASTFunction *>(element.in_function.get());
+			const ASTs & args = typeid_cast<const ASTExpressionList &>(*in_func->arguments).children;
+			auto ast_set = typeid_cast<const ASTSet *>(args[1].get());
 			if (in_func && ast_set)
 			{
 				const Range & key_range = key_ranges[element.key_column];
@@ -325,27 +325,27 @@ bool PKCondition::mayBeTrueInRange(const Field * left_pk, const Field * right_pk
 	return rpn_stack[0].can_be_true;
 }
 
-bool PKCondition::mayBeTrueInRange(const Field * left_pk, const Field * right_pk)
+bool PKCondition::mayBeTrueInRange(const Field * left_pk, const Field * right_pk) const
 {
 	return mayBeTrueInRange(left_pk, right_pk, true);
 }
 
-bool PKCondition::mayBeTrueAfter(const Field * left_pk)
+bool PKCondition::mayBeTrueAfter(const Field * left_pk) const
 {
 	return mayBeTrueInRange(left_pk, nullptr, false);
 }
 
-ASTSet * PKCondition::RPNElement::inFunctionToSet()
+const ASTSet * PKCondition::RPNElement::inFunctionToSet() const
 {
-	ASTFunction * in_func = typeid_cast<ASTFunction *>(in_function.get());
+	auto in_func = typeid_cast<const ASTFunction *>(in_function.get());
 	if (!in_func)
 		return nullptr;
-	ASTs & args = typeid_cast<ASTExpressionList &>(*in_func->arguments).children;
-	ASTSet * ast_set = typeid_cast<ASTSet *>(args[1].get());
+	const ASTs & args = typeid_cast<const ASTExpressionList &>(*in_func->arguments).children;
+	auto ast_set = typeid_cast<const ASTSet *>(args[1].get());
 	return ast_set;
 }
 
-String PKCondition::RPNElement::toString()
+String PKCondition::RPNElement::toString() const
 {
 	std::ostringstream ss;
 	switch (function)
@@ -376,13 +376,13 @@ String PKCondition::RPNElement::toString()
 }
 
 
-bool PKCondition::alwaysUnknown()
+bool PKCondition::alwaysUnknown() const
 {
 	std::vector<UInt8> rpn_stack;
 
 	for (size_t i = 0; i < rpn.size(); ++i)
 	{
-		RPNElement & element = rpn[i];
+		const auto & element = rpn[i];
 
 		if (element.function == RPNElement::FUNCTION_UNKNOWN)
 		{

From fed50c4702eb65fb49ea2ce15edf50111b644671 Mon Sep 17 00:00:00 2001
From: Andrey Mironov <hertz@yandex-team.ru>
Date: Fri, 27 Mar 2015 16:10:35 +0300
Subject: [PATCH 38/55] dbms: add tryLogCurrentException overload taking a
 pointer to Poco::Logger

---
 dbms/include/DB/Core/Exception.h |  2 ++
 dbms/src/Core/Exception.cpp      | 13 +++++++++----
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/dbms/include/DB/Core/Exception.h b/dbms/include/DB/Core/Exception.h
index 047c156e491..afc56e6571f 100644
--- a/dbms/include/DB/Core/Exception.h
+++ b/dbms/include/DB/Core/Exception.h
@@ -6,6 +6,7 @@
 #include <statdaemons/Exception.h>
 #include <Poco/SharedPtr.h>
 
+namespace Poco { class Logger; }
 
 namespace DB
 {
@@ -28,6 +29,7 @@ ExceptionPtr cloneCurrentException();
   * Можно использовать в деструкторах в блоке catch (...).
   */
 void tryLogCurrentException(const char * log_name);
+void tryLogCurrentException(Poco::Logger * logger);
 
 
 void rethrowFirstException(Exceptions & exceptions);
diff --git a/dbms/src/Core/Exception.cpp b/dbms/src/Core/Exception.cpp
index 2571762489e..f5864a4f210 100644
--- a/dbms/src/Core/Exception.cpp
+++ b/dbms/src/Core/Exception.cpp
@@ -53,6 +53,11 @@ inline std::string demangle(const char * const mangled, int & status)
 }
 
 void tryLogCurrentException(const char * log_name)
+{
+	tryLogCurrentException(&Logger::get(log_name));
+}
+
+void tryLogCurrentException(Poco::Logger * logger)
 {
 	try
 	{
@@ -62,7 +67,7 @@ void tryLogCurrentException(const char * log_name)
 	{
 		try
 		{
-			LOG_ERROR(&Logger::get(log_name), "Code: " << e.code() << ", e.displayText() = " << e.displayText() << ", e.what() = " << e.what()
+			LOG_ERROR(logger, "Code: " << e.code() << ", e.displayText() = " << e.displayText() << ", e.what() = " << e.what()
 				<< ", Stack trace:\n\n" << e.getStackTrace().toString());
 		}
 		catch (...) {}
@@ -71,7 +76,7 @@ void tryLogCurrentException(const char * log_name)
 	{
 		try
 		{
-			LOG_ERROR(&Logger::get(log_name), "Poco::Exception. Code: " << ErrorCodes::POCO_EXCEPTION << ", e.code() = " << e.code()
+			LOG_ERROR(logger, "Poco::Exception. Code: " << ErrorCodes::POCO_EXCEPTION << ", e.code() = " << e.code()
 				<< ", e.displayText() = " << e.displayText() << ", e.what() = " << e.what());
 		}
 		catch (...) {}
@@ -86,7 +91,7 @@ void tryLogCurrentException(const char * log_name)
 			if (status)
 				name += " (demangling status: " + toString(status) + ")";
 
-			LOG_ERROR(&Logger::get(log_name), "std::exception. Code: " << ErrorCodes::STD_EXCEPTION << ", type: " << name << ", e.what() = " << e.what());
+			LOG_ERROR(logger, "std::exception. Code: " << ErrorCodes::STD_EXCEPTION << ", type: " << name << ", e.what() = " << e.what());
 		}
 		catch (...) {}
 	}
@@ -100,7 +105,7 @@ void tryLogCurrentException(const char * log_name)
 			if (status)
 				name += " (demangling status: " + toString(status) + ")";
 
-			LOG_ERROR(&Logger::get(log_name), "Unknown exception. Code: " << ErrorCodes::UNKNOWN_EXCEPTION << ", type: " << name);
+			LOG_ERROR(logger, "Unknown exception. Code: " << ErrorCodes::UNKNOWN_EXCEPTION << ", type: " << name);
 		}
 		catch (...) {}
 	}

From edbe23ac1fb31621233698adf28c171820bebec0 Mon Sep 17 00:00:00 2001
From: Andrey Mironov <hertz@yandex-team.ru>
Date: Fri, 27 Mar 2015 16:11:22 +0300
Subject: [PATCH 39/55] dbms: add dictionaries_lazy_load config parameter,
 defaults to true. [#METR-15691]

---
 dbms/include/DB/Interpreters/Context.h        |  3 +++
 dbms/include/DB/Interpreters/Dictionaries.h   | 21 +++++++++++--------
 .../DB/Interpreters/ExternalDictionaries.h    |  8 +++----
 dbms/src/Interpreters/Context.cpp             | 20 +++++++++++++-----
 .../src/Interpreters/ExternalDictionaries.cpp | 13 ++++++++----
 dbms/src/Server/Server.cpp                    | 19 +++++++++++++++--
 6 files changed, 60 insertions(+), 24 deletions(-)

diff --git a/dbms/include/DB/Interpreters/Context.h b/dbms/include/DB/Interpreters/Context.h
index 18a228bef83..e8da76406f6 100644
--- a/dbms/include/DB/Interpreters/Context.h
+++ b/dbms/include/DB/Interpreters/Context.h
@@ -263,6 +263,9 @@ public:
 	const Dictionaries & getDictionaries() const;
 	const ExternalDictionaries & getExternalDictionaries() const;
 
+	void tryCreateDictionaries(bool throw_on_error = false) const;
+	void tryCreateExternalDictionaries(bool throw_on_error = false) const;
+
 	InterserverIOHandler & getInterserverIOHandler()						{ return shared->interserver_io_handler; }
 
 	/// Как другие серверы могут обратиться к этому для скачивания реплицируемых данных.
diff --git a/dbms/include/DB/Interpreters/Dictionaries.h b/dbms/include/DB/Interpreters/Dictionaries.h
index 94e12484daa..3ee8f5c389b 100644
--- a/dbms/include/DB/Interpreters/Dictionaries.h
+++ b/dbms/include/DB/Interpreters/Dictionaries.h
@@ -31,8 +31,10 @@ private:
 
 
 
-	void handleException() const
+	void handleException(const bool throw_on_error) const
 	{
+		const auto exception_ptr = std::current_exception();
+
 		try
 		{
 			throw;
@@ -40,18 +42,19 @@ private:
 		catch (const Poco::Exception & e)
 		{
 			LOG_ERROR(log, "Cannot load dictionary! You must resolve this manually. " << e.displayText());
-			return;
 		}
 		catch (...)
 		{
 			LOG_ERROR(log, "Cannot load dictionary! You must resolve this manually.");
-			return;
 		}
+
+		if (throw_on_error)
+			std::rethrow_exception(exception_ptr);
 	}
 
 
 	/// Обновляет справочники.
-	void reloadImpl()
+	void reloadImpl(const bool throw_on_error = false)
 	{
 		/** Если не удаётся обновить справочники, то несмотря на это, не кидаем исключение (используем старые справочники).
 		  * Если старых корректных справочников нет, то при использовании функций, которые от них зависят,
@@ -70,7 +73,7 @@ private:
 		}
 		catch (...)
 		{
-			handleException();
+			handleException(throw_on_error);
 			was_exception = true;
 		}
 
@@ -83,7 +86,7 @@ private:
 		}
 		catch (...)
 		{
-			handleException();
+			handleException(throw_on_error);
 			was_exception = true;
 		}
 
@@ -95,7 +98,7 @@ private:
 		}
 		catch (...)
 		{
-			handleException();
+			handleException(throw_on_error);
 			was_exception = true;
 		}
 
@@ -119,10 +122,10 @@ private:
 
 public:
 	/// Справочники будут обновляться в отдельном потоке, каждые reload_period секунд.
-	Dictionaries(int reload_period_ = 3600)
+	Dictionaries(const bool throw_on_error, const int reload_period_ = 3600)
 		: reload_period(reload_period_), log(&Logger::get("Dictionaries"))
 	{
-		reloadImpl();
+		reloadImpl(throw_on_error);
 		reloading_thread = std::thread([this] { reloadPeriodically(); });
 	}
 
diff --git a/dbms/include/DB/Interpreters/ExternalDictionaries.h b/dbms/include/DB/Interpreters/ExternalDictionaries.h
index ccb35d321db..2ad0b9975ec 100644
--- a/dbms/include/DB/Interpreters/ExternalDictionaries.h
+++ b/dbms/include/DB/Interpreters/ExternalDictionaries.h
@@ -59,8 +59,8 @@ private:
 
 	std::unordered_map<std::string, Poco::Timestamp> last_modification_times;
 
-	void reloadImpl();
-	void reloadFromFile(const std::string & config_path);
+	void reloadImpl(bool throw_on_error = false);
+	void reloadFromFile(const std::string & config_path, bool throw_on_error);
 
 	void reloadPeriodically()
 	{
@@ -82,10 +82,10 @@ private:
 
 public:
 	/// Справочники будут обновляться в отдельном потоке, каждые reload_period секунд.
-	ExternalDictionaries(Context & context)
+	ExternalDictionaries(Context & context, const bool throw_on_error)
 		: context(context), log(&Logger::get("ExternalDictionaries"))
 	{
-		reloadImpl();
+		reloadImpl(throw_on_error);
 		reloading_thread = std::thread{&ExternalDictionaries::reloadPeriodically, this};
 	}
 
diff --git a/dbms/src/Interpreters/Context.cpp b/dbms/src/Interpreters/Context.cpp
index 3e123550a44..41c5faecdbb 100644
--- a/dbms/src/Interpreters/Context.cpp
+++ b/dbms/src/Interpreters/Context.cpp
@@ -494,8 +494,7 @@ const Dictionaries & Context::getDictionaries() const
 {
 	Poco::ScopedLock<Poco::Mutex> lock(shared->mutex);
 
-	if (!shared->dictionaries)
-		shared->dictionaries = new Dictionaries;
+	tryCreateDictionaries();
 
 	return *shared->dictionaries;
 }
@@ -505,14 +504,25 @@ const ExternalDictionaries & Context::getExternalDictionaries() const
 {
 	Poco::ScopedLock<Poco::Mutex> lock(shared->mutex);
 
+	tryCreateExternalDictionaries();
+
+	return *shared->external_dictionaries;
+}
+
+void Context::tryCreateDictionaries(const bool throw_on_error) const
+{
+	if (!shared->dictionaries)
+		shared->dictionaries = new Dictionaries{throw_on_error};
+}
+
+void Context::tryCreateExternalDictionaries(const bool throw_on_error) const
+{
 	if (!shared->external_dictionaries)
 	{
 		if (!this->global_context)
 			throw Exception("Logical error: there is no global context", ErrorCodes::LOGICAL_ERROR);
-		shared->external_dictionaries = new ExternalDictionaries{*this->global_context};
+		shared->external_dictionaries = new ExternalDictionaries{*this->global_context, throw_on_error};
 	}
-
-	return *shared->external_dictionaries;
 }
 
 
diff --git a/dbms/src/Interpreters/ExternalDictionaries.cpp b/dbms/src/Interpreters/ExternalDictionaries.cpp
index 600780a3e0c..3c3f099bcc2 100644
--- a/dbms/src/Interpreters/ExternalDictionaries.cpp
+++ b/dbms/src/Interpreters/ExternalDictionaries.cpp
@@ -34,12 +34,12 @@ namespace
 	}
 }
 
-void ExternalDictionaries::reloadImpl()
+void ExternalDictionaries::reloadImpl(const bool throw_on_error)
 {
 	const auto config_paths = getDictionariesConfigPaths(Poco::Util::Application::instance().config());
 
 	for (const auto & config_path : config_paths)
-		reloadFromFile(config_path);
+		reloadFromFile(config_path, throw_on_error);
 
 	/// periodic update
 	for (auto & dictionary : dictionaries)
@@ -109,7 +109,7 @@ void ExternalDictionaries::reloadImpl()
 	}
 }
 
-void ExternalDictionaries::reloadFromFile(const std::string & config_path)
+void ExternalDictionaries::reloadFromFile(const std::string & config_path, const bool throw_on_error)
 {
 	const Poco::File config_file{config_path};
 
@@ -197,8 +197,9 @@ void ExternalDictionaries::reloadFromFile(const std::string & config_path)
 				}
 				catch (...)
 				{
+					const auto exception_ptr = std::current_exception();
 					if (!name.empty())
-						stored_exceptions.emplace(name, std::current_exception());
+						stored_exceptions.emplace(name, exception_ptr);
 
 					try
 					{
@@ -219,6 +220,10 @@ void ExternalDictionaries::reloadFromFile(const std::string & config_path)
 						LOG_ERROR(log, config_path << ": cannot create external dictionary '" << name
 							<< "'! You must resolve this manually.");
 					}
+
+					/// propagate exception
+					if (throw_on_error)
+						std::rethrow_exception(exception_ptr);
 				}
 			}
 		}
diff --git a/dbms/src/Server/Server.cpp b/dbms/src/Server/Server.cpp
index ef3bb3bb1df..a6671e7e26d 100644
--- a/dbms/src/Server/Server.cpp
+++ b/dbms/src/Server/Server.cpp
@@ -609,9 +609,24 @@ int Server::main(const std::vector<std::string> & args)
 		if (olap_http_server)
 			olap_http_server->start();
 
-		LOG_INFO(log, "Ready for connections.");
+		/// try to load dictionaries immediately, throw on error and die
+		try
+		{
+			if (!config().getBool("dictionaries_lazy_load", true))
+			{
+				global_context->tryCreateDictionaries(true);
+				global_context->tryCreateExternalDictionaries(true);
+			}
 
-		waitForTerminationRequest();
+			LOG_INFO(log, "Ready for connections.");
+
+			waitForTerminationRequest();
+		}
+		catch (...)
+		{
+			LOG_ERROR(log, "Caught exception while loading dictionaries.");
+			tryLogCurrentException(log);
+		}
 
 		LOG_DEBUG(log, "Received termination signal. Waiting for current connections to close.");
 

From 082620d05422511ac3195c8e1fae3b1f3b25148a Mon Sep 17 00:00:00 2001
From: Andrey Mironov <hertz@yandex-team.ru>
Date: Fri, 27 Mar 2015 18:44:32 +0300
Subject: [PATCH 40/55] dbms: do not attempt loading builtin dictionaries if
 config has no corresponding keys. [#METR-15691]

---
 dbms/include/DB/Interpreters/Dictionaries.h | 63 ++++++++++++---------
 1 file changed, 37 insertions(+), 26 deletions(-)

diff --git a/dbms/include/DB/Interpreters/Dictionaries.h b/dbms/include/DB/Interpreters/Dictionaries.h
index 3ee8f5c389b..24f9030e808 100644
--- a/dbms/include/DB/Interpreters/Dictionaries.h
+++ b/dbms/include/DB/Interpreters/Dictionaries.h
@@ -64,42 +64,53 @@ private:
 
 		LOG_INFO(log, "Loading dictionaries.");
 
+		auto & config = Poco::Util::Application::instance().config();
+
 		bool was_exception = false;
 
-		try
+		if (config.has(TechDataHierarchy::required_key))
 		{
-			MultiVersion<TechDataHierarchy>::Version new_tech_data_hierarchy = new TechDataHierarchy;
-			tech_data_hierarchy.set(new_tech_data_hierarchy);
-		}
-		catch (...)
-		{
-			handleException(throw_on_error);
-			was_exception = true;
+			try
+			{
+				auto new_tech_data_hierarchy = std::make_unique<TechDataHierarchy>();
+				tech_data_hierarchy.set(new_tech_data_hierarchy.release());
+			}
+			catch (...)
+			{
+				handleException(throw_on_error);
+				was_exception = true;
+			}
 		}
 
-		try
-		{
-			MultiVersion<RegionsHierarchies>::Version new_regions_hierarchies = new RegionsHierarchies;
-			new_regions_hierarchies->reload();
-			regions_hierarchies.set(new_regions_hierarchies);
 
-		}
-		catch (...)
+		if (config.has(RegionsHierarchies::required_key))
 		{
-			handleException(throw_on_error);
-			was_exception = true;
+			try
+			{
+				auto new_regions_hierarchies = std::make_unique<RegionsHierarchies>();
+				new_regions_hierarchies->reload();
+				regions_hierarchies.set(new_regions_hierarchies.release());
+			}
+			catch (...)
+			{
+				handleException(throw_on_error);
+				was_exception = true;
+			}
 		}
 
-		try
+		if (config.has(RegionsNames::required_key))
 		{
-			MultiVersion<RegionsNames>::Version new_regions_names = new RegionsNames;
-			new_regions_names->reload();
-			regions_names.set(new_regions_names);
-		}
-		catch (...)
-		{
-			handleException(throw_on_error);
-			was_exception = true;
+			try
+			{
+				auto new_regions_names = std::make_unique<RegionsNames>();
+				new_regions_names->reload();
+				regions_names.set(new_regions_names.release());
+			}
+			catch (...)
+			{
+				handleException(throw_on_error);
+				was_exception = true;
+			}
 		}
 
 		if (!was_exception)

From b05b41a12c23acc7f7b180620035248597add599 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Fri, 27 Mar 2015 21:43:09 +0300
Subject: [PATCH 41/55] dbms: Client: added --time option for benchmark
 [#METR-15716].

---
 dbms/src/Client/Client.cpp | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/dbms/src/Client/Client.cpp b/dbms/src/Client/Client.cpp
index 84263b56404..c38bb15fae3 100644
--- a/dbms/src/Client/Client.cpp
+++ b/dbms/src/Client/Client.cpp
@@ -93,6 +93,7 @@ private:
 	};
 
 	bool is_interactive = true;			/// Использовать readline интерфейс или batch режим.
+	bool print_time_to_stderr = false;	/// В неинтерактивном режиме, выводить время выполнения в stderr.
 	bool stdin_is_not_tty = false;		/// stdin - не терминал.
 
 	winsize terminal_size {};			/// Размер терминала - для вывода прогресс-бара.
@@ -257,6 +258,9 @@ private:
 
 		if (is_interactive)
 		{
+			if (print_time_to_stderr)
+				throw Exception("time option could be specified only in non-interactive mode", ErrorCodes::BAD_ARGUMENTS);
+
 			/// Отключаем tab completion.
 			rl_bind_key('\t', rl_insert);
 
@@ -557,6 +561,10 @@ private:
 
 			std::cout << std::endl << std::endl;
 		}
+		else if (print_time_to_stderr)
+		{
+			std::cerr << watch.elapsedSeconds() << "\n";
+		}
 
 		return true;
 	}
@@ -1023,6 +1031,7 @@ public:
 			("multiline,m",														"multiline")
 			("multiquery,n",													"multiquery")
 			("vertical,E",                                                      "vertical")
+			("time,t",			"print query execution time to stderr in non-interactive mode (for benchmarks)")
 			APPLY_FOR_SETTINGS(DECLARE_SETTING)
 			APPLY_FOR_LIMITS(DECLARE_LIMIT)
 		;
@@ -1137,6 +1146,8 @@ public:
 			config().setBool("multiquery", true);
 		if (options.count("vertical"))
 			config().setBool("vertical", true);
+		if (options.count("time"))
+			print_time_to_stderr = true;
 	}
 };
 

From 369a9441ee1f54c98a730781df7b9ede6e77e538 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Fri, 27 Mar 2015 22:58:35 +0300
Subject: [PATCH 42/55] dbms: Removed old scripts [#MTRSADMIN-1121].

---
 dbms/scripts/README                          | 11 +++++++
 dbms/scripts/geobase_to_regions_hierarchy.pl | 31 --------------------
 dbms/scripts/geobase_to_regions_names.pl     | 25 ----------------
 3 files changed, 11 insertions(+), 56 deletions(-)
 create mode 100644 dbms/scripts/README
 delete mode 100755 dbms/scripts/geobase_to_regions_hierarchy.pl
 delete mode 100755 dbms/scripts/geobase_to_regions_names.pl

diff --git a/dbms/scripts/README b/dbms/scripts/README
new file mode 100644
index 00000000000..e736c4942ce
--- /dev/null
+++ b/dbms/scripts/README
@@ -0,0 +1,11 @@
+# How to create dictionaries for region* functions:
+# 1. You need access to host ███████████.yandex-team.ru.
+# 2. Run the following commands:
+
+curl 'http://███████████.yandex-team.ru/?fields=id,parent_id,type,population' | tail -n+2 > regions_hierarchy.txt
+curl 'http://███████████.yandex-team.ru/?fields=id,parent_id,type,population&new_parents=977:187' | tail -n+2 > regions_hierarchy_ua.txt
+curl 'http://███████████.yandex-team.ru/?fields=id,ru_name' | tail -n+2 > regions_names_ru.txt
+curl 'http://███████████.yandex-team.ru/?fields=id,uk_name' | tail -n+2 > regions_names_ua.txt
+curl 'http://███████████.yandex-team.ru/?fields=id,by_name' | tail -n+2 > regions_names_by.txt
+curl 'http://███████████.yandex-team.ru/?fields=id,kz_name' | tail -n+2 > regions_names_kz.txt
+curl 'http://███████████.yandex-team.ru/?fields=id,tr_name' | tail -n+2 > regions_names_tr.txt
diff --git a/dbms/scripts/geobase_to_regions_hierarchy.pl b/dbms/scripts/geobase_to_regions_hierarchy.pl
deleted file mode 100755
index 8d7d23dd148..00000000000
--- a/dbms/scripts/geobase_to_regions_hierarchy.pl
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-use warnings;
-use geobase;
-
-
-sub get_population {
-	my $key = shift;
-	my $depth = shift || 0;
-
-	return 0 if ($depth > 100);
-
-	my $current = int($Region{$key}->{zip_old} || 0); # zip_old, не смотря на название, содержит население региона.
-	return $current if ($current);
-
-	my $sum_of_children = 0;
-	for my $child (@{$Region{$key}->{chld}}) {
-		$sum_of_children += get_population($child, $depth + 1);
-	}
-
-	return $sum_of_children;
-}
-
-
-foreach my $key (keys %Region) {
-	print $key . "\t"
-		. ($Region{$key}->{parents}[-1] || 0) . "\t"
-		. ($Region{$key}->{type} || 0) . "\t"
-		. get_population($key) . "\n";
-}
diff --git a/dbms/scripts/geobase_to_regions_names.pl b/dbms/scripts/geobase_to_regions_names.pl
deleted file mode 100755
index ad280ac27e6..00000000000
--- a/dbms/scripts/geobase_to_regions_names.pl
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-use warnings;
-use geobase;
-
-my @languages = ('ru', 'en', 'ua', 'by', 'kz', 'tr');
-my @output_files = map { open(my $output, ">:encoding(UTF-8)", "regions_names_" . $_ . ".txt") || die $!; $output } @languages;
-my %outputs;
-@outputs{@languages} = @output_files;
-
-foreach my $key (keys %Region) {
-	foreach my $lang (@languages) {
-		my $field = ( $lang eq 'ru' ? 'name' : $lang . '_name' );
-		my $name = $Region{$key}->{$field};
-		if ($name) {
-			$name =~ s/^\s+//;
-			$name =~ s/\s+$//;
-			$name =~ s/(\t|\n)/ /g;
-			if ($name ne '') {
-				print { $outputs{$lang} } $key . "\t" . $name . "\n";
-			}
-		}
-	}	
-}

From e72052a57dc1abfb583bfccce0201bf8258fc953 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Sat, 28 Mar 2015 22:53:34 +0300
Subject: [PATCH 43/55] dbms: Client: added --format command line option
 [#METR-15716].

---
 dbms/src/Client/Client.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/dbms/src/Client/Client.cpp b/dbms/src/Client/Client.cpp
index c38bb15fae3..7172a0bfc20 100644
--- a/dbms/src/Client/Client.cpp
+++ b/dbms/src/Client/Client.cpp
@@ -1030,7 +1030,8 @@ public:
 			("database,d", 		boost::program_options::value<std::string>(), 	"database")
 			("multiline,m",														"multiline")
 			("multiquery,n",													"multiquery")
-			("vertical,E",                                                      "vertical")
+			("format,f",        boost::program_options::value<std::string>(), 	"default output format")
+			("vertical,E",      "vertical output format, same as --format=Vertical or FORMAT Vertical or \\G at end of command")
 			("time,t",			"print query execution time to stderr in non-interactive mode (for benchmarks)")
 			APPLY_FOR_SETTINGS(DECLARE_SETTING)
 			APPLY_FOR_LIMITS(DECLARE_LIMIT)
@@ -1144,6 +1145,8 @@ public:
 			config().setBool("multiline", true);
 		if (options.count("multiquery"))
 			config().setBool("multiquery", true);
+		if (options.count("format"))
+			config().setString("format", options["format"].as<std::string>());
 		if (options.count("vertical"))
 			config().setBool("vertical", true);
 		if (options.count("time"))

From dcf5ed4f54b309789cf24766555c5ceb156a0fad Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Sat, 28 Mar 2015 22:58:07 +0300
Subject: [PATCH 44/55] dbms: added new benchmark script [#METR-15716].

---
 dbms/benchmark/clickhouse/benchmark-new.sh | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100755 dbms/benchmark/clickhouse/benchmark-new.sh

diff --git a/dbms/benchmark/clickhouse/benchmark-new.sh b/dbms/benchmark/clickhouse/benchmark-new.sh
new file mode 100755
index 00000000000..2692615bb91
--- /dev/null
+++ b/dbms/benchmark/clickhouse/benchmark-new.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+QUERIES_FILE="queries.sql"
+TABLE=$1
+TRIES=3
+
+cat "$QUERIES_FILE" | sed "s/{table}/${TABLE}/g" | while read query; do
+	sync
+	echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null
+
+	echo -n "["
+	for i in $(seq 1 $TRIES); do
+		RES=$(clickhouse-client --time --format=Null --query="$query" 2>&1)
+		[[ "$?" == "0" ]] && echo -n "${RES}" || echo "null"
+		[[ "$i" != $TRIES ]] && echo -n ", "
+	done
+	echo "],"
+done

From 1ae580dcd9361879005c1bccff0c67cafb6b035d Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Sun, 29 Mar 2015 00:45:27 +0300
Subject: [PATCH 45/55] dbms: modified queries file [#METR-15716].

---
 dbms/benchmark/clickhouse/queries.sql | 152 ++++++++------------------
 1 file changed, 43 insertions(+), 109 deletions(-)

diff --git a/dbms/benchmark/clickhouse/queries.sql b/dbms/benchmark/clickhouse/queries.sql
index 2d3e80dd657..05d4e00fb19 100644
--- a/dbms/benchmark/clickhouse/queries.sql
+++ b/dbms/benchmark/clickhouse/queries.sql
@@ -1,109 +1,43 @@
-SELECT count() FROM hits_10m;
-SELECT count() FROM hits_10m  WHERE AdvEngineID != 0;
-SELECT sum(AdvEngineID), count(), avg(ResolutionWidth) FROM hits_10m ;
-SELECT sum(UserID) FROM hits_10m ;
-SELECT uniq(UserID) FROM hits_10m ;
-SELECT uniq(SearchPhrase) FROM hits_10m ;
-SELECT min(EventDate), max(EventDate) FROM hits_10m ;
-
-SELECT AdvEngineID, count() FROM hits_10m  WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY count() DESC;
--- мощная фильтрация. После фильтрации почти ничего не остаётся, но делаем ещё агрегацию.;
-
-SELECT RegionID, uniq(UserID) AS u FROM hits_10m  GROUP BY RegionID ORDER BY u DESC LIMIT 10;
--- агрегация, среднее количество ключей.;
-
-SELECT RegionID, sum(AdvEngineID), count() AS c, avg(ResolutionWidth), uniq(UserID) FROM hits_10m  GROUP BY RegionID ORDER BY c DESC LIMIT 10;
--- агрегация, среднее количество ключей, несколько агрегатных функций.;
-
-SELECT MobilePhoneModel, uniq(UserID) AS u FROM hits_10m  WHERE MobilePhoneModel != '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
--- мощная фильтрация по строкам, затем агрегация по строкам.;
-
-SELECT MobilePhone, MobilePhoneModel, uniq(UserID) AS u FROM hits_10m  WHERE MobilePhoneModel != '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
--- мощная фильтрация по строкам, затем агрегация по паре из числа и строки.;
-
-SELECT SearchPhrase, count() AS c FROM hits_10m  WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
--- средняя фильтрация по строкам, затем агрегация по строкам, большое количество ключей.;
-
-SELECT SearchPhrase, uniq(UserID) AS u FROM hits_10m  WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
--- агрегация чуть сложнее.;
-
-SELECT SearchEngineID, SearchPhrase, count() AS c FROM hits_10m  WHERE SearchPhrase != '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;
--- агрегация по числу и строке, большое количество ключей.;
-
-SELECT UserID, count() FROM hits_10m  GROUP BY UserID ORDER BY count() DESC LIMIT 10;
--- агрегация по очень большому количеству ключей, может не хватить оперативки.;
-
-SELECT UserID, SearchPhrase, count() FROM hits_10m  GROUP BY UserID, SearchPhrase ORDER BY count() DESC LIMIT 10;
--- ещё более сложная агрегация.;
-
-SELECT UserID, SearchPhrase, count() FROM hits_10m  GROUP BY UserID, SearchPhrase LIMIT 10;
--- то же самое, но без сортировки.;
-
-SELECT UserID, toMinute(EventTime) AS m, SearchPhrase, count() FROM hits_10m  GROUP BY UserID, m, SearchPhrase ORDER BY count() DESC LIMIT 10;
--- ещё более сложная агрегация, не стоит выполнять на больших таблицах.;
-
-SELECT UserID FROM hits_10m  WHERE UserID = 12345678901234567890;
--- мощная фильтрация по столбцу типа UInt64.;
-
-SELECT count() FROM hits_10m  WHERE URL LIKE '%metrika%';
--- фильтрация по поиску подстроки в строке.;
-
-SELECT SearchPhrase, any(URL), count() AS c FROM hits_10m  WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
--- вынимаем большие столбцы, фильтрация по строке.;
-
-SELECT SearchPhrase, any(URL), any(Title), count() AS c, uniq(UserID) FROM hits_10m  WHERE Title LIKE '%Яндекс%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
--- чуть больше столбцы.;
-
-SELECT * FROM hits_10m  WHERE URL LIKE '%metrika%' ORDER BY EventTime LIMIT 10;
--- плохой запрос - вынимаем все столбцы.;
-
-SELECT SearchPhrase FROM hits_10m  WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10;
--- большая сортировка.;
-
-SELECT SearchPhrase FROM hits_10m  WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10;
--- большая сортировка по строкам.;
-
-SELECT SearchPhrase FROM hits_10m  WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10;
--- большая сортировка по кортежу.;
-
-SELECT CounterID, avg(length(URL)) AS l, count() AS c FROM hits_10m  WHERE URL != '' GROUP BY CounterID HAVING c > 100000 ORDER BY l DESC LIMIT 25;
--- считаем средние длины URL для крупных счётчиков.;
-
-SELECT domainWithoutWWW(Referer) AS key, avg(length(Referer)) AS l, count() AS c, any(Referer) FROM hits_10m  WHERE Referer != '' GROUP BY key HAVING c > 100000 ORDER BY l DESC LIMIT 25;
--- то же самое, но с разбивкой по доменам.;
-
-SELECT sum(ResolutionWidth), sum(ResolutionWidth + 1), sum(ResolutionWidth + 2), sum(ResolutionWidth + 3), sum(ResolutionWidth + 4), sum(ResolutionWidth + 5), sum(ResolutionWidth + 6), sum(ResolutionWidth + 7), sum(ResolutionWidth + 8), sum(ResolutionWidth + 9), sum(ResolutionWidth + 10), sum(ResolutionWidth + 11), sum(ResolutionWidth + 12), sum(ResolutionWidth + 13), sum(ResolutionWidth + 14), sum(ResolutionWidth + 15), sum(ResolutionWidth + 16), sum(ResolutionWidth + 17), sum(ResolutionWidth + 18), sum(ResolutionWidth + 19), sum(ResolutionWidth + 20), sum(ResolutionWidth + 21), sum(ResolutionWidth + 22), sum(ResolutionWidth + 23), sum(ResolutionWidth + 24), sum(ResolutionWidth + 25), sum(ResolutionWidth + 26), sum(ResolutionWidth + 27), sum(ResolutionWidth + 28), sum(ResolutionWidth + 29), sum(ResolutionWidth + 30), sum(ResolutionWidth + 31), sum(ResolutionWidth + 32), sum(ResolutionWidth + 33), sum(ResolutionWidth + 34), sum(ResolutionWidth + 35), sum(ResolutionWidth + 36), sum(ResolutionWidth + 37), sum(ResolutionWidth + 38), sum(ResolutionWidth + 39), sum(ResolutionWidth + 40), sum(ResolutionWidth + 41), sum(ResolutionWidth + 42), sum(ResolutionWidth + 43), sum(ResolutionWidth + 44), sum(ResolutionWidth + 45), sum(ResolutionWidth + 46), sum(ResolutionWidth + 47), sum(ResolutionWidth + 48), sum(ResolutionWidth + 49), sum(ResolutionWidth + 50), sum(ResolutionWidth + 51), sum(ResolutionWidth + 52), sum(ResolutionWidth + 53), sum(ResolutionWidth + 54), sum(ResolutionWidth + 55), sum(ResolutionWidth + 56), sum(ResolutionWidth + 57), sum(ResolutionWidth + 58), sum(ResolutionWidth + 59), sum(ResolutionWidth + 60), sum(ResolutionWidth + 61), sum(ResolutionWidth + 62), sum(ResolutionWidth + 63), sum(ResolutionWidth + 64), sum(ResolutionWidth + 65), sum(ResolutionWidth + 66), sum(ResolutionWidth + 67), sum(ResolutionWidth + 68), sum(ResolutionWidth + 69), sum(ResolutionWidth + 70), sum(ResolutionWidth + 71), sum(ResolutionWidth + 72), sum(ResolutionWidth + 73), 
sum(ResolutionWidth + 74), sum(ResolutionWidth + 75), sum(ResolutionWidth + 76), sum(ResolutionWidth + 77), sum(ResolutionWidth + 78), sum(ResolutionWidth + 79), sum(ResolutionWidth + 80), sum(ResolutionWidth + 81), sum(ResolutionWidth + 82), sum(ResolutionWidth + 83), sum(ResolutionWidth + 84), sum(ResolutionWidth + 85), sum(ResolutionWidth + 86), sum(ResolutionWidth + 87), sum(ResolutionWidth + 88), sum(ResolutionWidth + 89) FROM hits_10m ;
--- много тупых агрегатных функций.;
-
-SELECT SearchEngineID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_10m  WHERE SearchPhrase != '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;
--- сложная агрегация, для больших таблиц может не хватить оперативки.;
-
-SELECT WatchID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_10m  WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
--- агрегация по двум полям, которая ничего не агрегирует. Для больших таблиц выполнить не получится.;
-
-SELECT WatchID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_10m  GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
--- то же самое, но ещё и без фильтрации.;
-
-SELECT URL, count() AS c FROM hits_10m  GROUP BY URL ORDER BY c DESC LIMIT 10;
--- агрегация по URL.;
-
-SELECT 1, URL, count() AS c FROM hits_10m  GROUP BY 1, URL ORDER BY c DESC LIMIT 10;
--- агрегация по URL и числу.;
-
-SELECT ClientIP AS x, x - 1, x - 2, x - 3, count() AS c FROM hits_10m  GROUP BY x, x - 1, x - 2, x - 3 ORDER BY c DESC LIMIT 10;
- 
-SELECT    URL,    count() AS PageViews FROM hits_10m WHERE    CounterID = 34    AND EventDate >= toDate('2013-07-01')    AND EventDate <= toDate('2013-07-31')    AND NOT DontCountHits    AND NOT Refresh    AND notEmpty(URL) GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
-
-
-SELECT    Title,    count() AS PageViews FROM hits_10m WHERE    CounterID = 34    AND EventDate >= toDate('2013-07-01')    AND EventDate <= toDate('2013-07-31')    AND NOT DontCountHits    AND NOT Refresh    AND notEmpty(Title) GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
-
-SELECT    URL,    count() AS PageViews FROM hits_10m WHERE    CounterID = 34    AND EventDate >= toDate('2013-07-01')    AND EventDate <= toDate('2013-07-31')    AND NOT Refresh    AND IsLink    AND NOT IsDownload GROUP BY URL ORDER BY PageViews DESC LIMIT 1000;
-
-SELECT    TraficSourceID,    SearchEngineID,    AdvEngineID,    ((SearchEngineID = 0 AND AdvEngineID = 0) ? Referer : '') AS Src,    URL AS Dst,    count() AS PageViews FROM hits_10m WHERE    CounterID = 34    AND EventDate >= toDate('2013-07-01')    AND EventDate <= toDate('2013-07-31')    AND NOT Refresh GROUP BY    TraficSourceID,    SearchEngineID,    AdvEngineID,    Src,    Dst ORDER BY PageViews DESC LIMIT 1000;
-
-SELECT    URLHash,    EventDate,    count() AS PageViews FROM hits_10m WHERE    CounterID = 34    AND EventDate >= toDate('2013-07-01')    AND EventDate <= toDate('2013-07-31')    AND NOT Refresh    AND TraficSourceID IN (-1, 6)    AND RefererHash = halfMD5('http://example.ru/') GROUP BY    URLHash,    EventDate ORDER BY PageViews DESC LIMIT 100;
-
-
-SELECT    WindowClientWidth,    WindowClientHeight,    count() AS PageViews FROM hits_10m WHERE    CounterID = 34    AND EventDate >= toDate('2013-07-01')    AND EventDate <= toDate('2013-07-31')    AND NOT Refresh    AND NOT DontCountHits    AND URLHash = halfMD5('http://example.ru/') GROUP BY    WindowClientWidth,    WindowClientHeight ORDER BY PageViews DESC LIMIT 10000;
-
-SELECT    toStartOfMinute(EventTime) AS Minute,    count() AS PageViews FROM hits_10m WHERE    CounterID = 34    AND EventDate >= toDate('2013-07-01')    AND EventDate <= toDate('2013-07-02')    AND NOT Refresh    AND NOT DontCountHits GROUP BY    Minute ORDER BY Minute;
\ No newline at end of file
+SELECT count() FROM {table};
+SELECT count() FROM {table} WHERE AdvEngineID != 0;
+SELECT sum(AdvEngineID), count(), avg(ResolutionWidth) FROM {table} ;
+SELECT sum(UserID) FROM {table} ;
+SELECT uniq(UserID) FROM {table} ;
+SELECT uniq(SearchPhrase) FROM {table} ;
+SELECT min(EventDate), max(EventDate) FROM {table} ;
+SELECT AdvEngineID, count() FROM {table} WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY count() DESC;
+SELECT RegionID, uniq(UserID) AS u FROM {table} GROUP BY RegionID ORDER BY u DESC LIMIT 10;
+SELECT RegionID, sum(AdvEngineID), count() AS c, avg(ResolutionWidth), uniq(UserID) FROM {table} GROUP BY RegionID ORDER BY c DESC LIMIT 10;
+SELECT MobilePhoneModel, uniq(UserID) AS u FROM {table} WHERE MobilePhoneModel != '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
+SELECT MobilePhone, MobilePhoneModel, uniq(UserID) AS u FROM {table} WHERE MobilePhoneModel != '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
+SELECT SearchPhrase, count() AS c FROM {table} WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
+SELECT SearchPhrase, uniq(UserID) AS u FROM {table} WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
+SELECT SearchEngineID, SearchPhrase, count() AS c FROM {table} WHERE SearchPhrase != '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;
+SELECT UserID, count() FROM {table} GROUP BY UserID ORDER BY count() DESC LIMIT 10;
+SELECT UserID, SearchPhrase, count() FROM {table} GROUP BY UserID, SearchPhrase ORDER BY count() DESC LIMIT 10;
+SELECT UserID, SearchPhrase, count() FROM {table} GROUP BY UserID, SearchPhrase LIMIT 10;
+SELECT UserID, toMinute(EventTime) AS m, SearchPhrase, count() FROM {table} GROUP BY UserID, m, SearchPhrase ORDER BY count() DESC LIMIT 10;
+SELECT UserID FROM {table} WHERE UserID = 12345678901234567890;
+SELECT count() FROM {table} WHERE URL LIKE '%metrika%';
+SELECT SearchPhrase, any(URL), count() AS c FROM {table} WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
+SELECT SearchPhrase, any(URL), any(Title), count() AS c, uniq(UserID) FROM {table} WHERE Title LIKE '%Яндекс%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
+SELECT * FROM {table} WHERE URL LIKE '%metrika%' ORDER BY EventTime LIMIT 10;
+SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10;
+SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10;
+SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10;
+SELECT CounterID, avg(length(URL)) AS l, count() AS c FROM {table} WHERE URL != '' GROUP BY CounterID HAVING c > 100000 ORDER BY l DESC LIMIT 25;
+SELECT domainWithoutWWW(Referer) AS key, avg(length(Referer)) AS l, count() AS c, any(Referer) FROM {table} WHERE Referer != '' GROUP BY key HAVING c > 100000 ORDER BY l DESC LIMIT 25;
+SELECT sum(ResolutionWidth), sum(ResolutionWidth + 1), sum(ResolutionWidth + 2), sum(ResolutionWidth + 3), sum(ResolutionWidth + 4), sum(ResolutionWidth + 5), sum(ResolutionWidth + 6), sum(ResolutionWidth + 7), sum(ResolutionWidth + 8), sum(ResolutionWidth + 9), sum(ResolutionWidth + 10), sum(ResolutionWidth + 11), sum(ResolutionWidth + 12), sum(ResolutionWidth + 13), sum(ResolutionWidth + 14), sum(ResolutionWidth + 15), sum(ResolutionWidth + 16), sum(ResolutionWidth + 17), sum(ResolutionWidth + 18), sum(ResolutionWidth + 19), sum(ResolutionWidth + 20), sum(ResolutionWidth + 21), sum(ResolutionWidth + 22), sum(ResolutionWidth + 23), sum(ResolutionWidth + 24), sum(ResolutionWidth + 25), sum(ResolutionWidth + 26), sum(ResolutionWidth + 27), sum(ResolutionWidth + 28), sum(ResolutionWidth + 29), sum(ResolutionWidth + 30), sum(ResolutionWidth + 31), sum(ResolutionWidth + 32), sum(ResolutionWidth + 33), sum(ResolutionWidth + 34), sum(ResolutionWidth + 35), sum(ResolutionWidth + 36), sum(ResolutionWidth + 37), sum(ResolutionWidth + 38), sum(ResolutionWidth + 39), sum(ResolutionWidth + 40), sum(ResolutionWidth + 41), sum(ResolutionWidth + 42), sum(ResolutionWidth + 43), sum(ResolutionWidth + 44), sum(ResolutionWidth + 45), sum(ResolutionWidth + 46), sum(ResolutionWidth + 47), sum(ResolutionWidth + 48), sum(ResolutionWidth + 49), sum(ResolutionWidth + 50), sum(ResolutionWidth + 51), sum(ResolutionWidth + 52), sum(ResolutionWidth + 53), sum(ResolutionWidth + 54), sum(ResolutionWidth + 55), sum(ResolutionWidth + 56), sum(ResolutionWidth + 57), sum(ResolutionWidth + 58), sum(ResolutionWidth + 59), sum(ResolutionWidth + 60), sum(ResolutionWidth + 61), sum(ResolutionWidth + 62), sum(ResolutionWidth + 63), sum(ResolutionWidth + 64), sum(ResolutionWidth + 65), sum(ResolutionWidth + 66), sum(ResolutionWidth + 67), sum(ResolutionWidth + 68), sum(ResolutionWidth + 69), sum(ResolutionWidth + 70), sum(ResolutionWidth + 71), sum(ResolutionWidth + 72), sum(ResolutionWidth + 73), 
sum(ResolutionWidth + 74), sum(ResolutionWidth + 75), sum(ResolutionWidth + 76), sum(ResolutionWidth + 77), sum(ResolutionWidth + 78), sum(ResolutionWidth + 79), sum(ResolutionWidth + 80), sum(ResolutionWidth + 81), sum(ResolutionWidth + 82), sum(ResolutionWidth + 83), sum(ResolutionWidth + 84), sum(ResolutionWidth + 85), sum(ResolutionWidth + 86), sum(ResolutionWidth + 87), sum(ResolutionWidth + 88), sum(ResolutionWidth + 89) FROM {table};
+SELECT SearchEngineID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} WHERE SearchPhrase != '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;
+SELECT WatchID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
+SELECT WatchID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
+SELECT URL, count() AS c FROM {table} GROUP BY URL ORDER BY c DESC LIMIT 10;
+SELECT 1, URL, count() AS c FROM {table} GROUP BY 1, URL ORDER BY c DESC LIMIT 10;
+SELECT ClientIP AS x, x - 1, x - 2, x - 3, count() AS c FROM {table} GROUP BY x, x - 1, x - 2, x - 3 ORDER BY c DESC LIMIT 10;
+SELECT URL, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= toDate('2013-07-01') AND EventDate <= toDate('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND notEmpty(URL) GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
+SELECT Title, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= toDate('2013-07-01') AND EventDate <= toDate('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND notEmpty(Title) GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
+SELECT URL, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= toDate('2013-07-01') AND EventDate <= toDate('2013-07-31') AND NOT Refresh AND IsLink AND NOT IsDownload GROUP BY URL ORDER BY PageViews DESC LIMIT 1000;
+SELECT TraficSourceID, SearchEngineID, AdvEngineID, ((SearchEngineID = 0 AND AdvEngineID = 0) ? Referer : '') AS Src, URL AS Dst, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= toDate('2013-07-01') AND EventDate <= toDate('2013-07-31') AND NOT Refresh GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 1000;
+SELECT URLHash, EventDate, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= toDate('2013-07-01') AND EventDate <= toDate('2013-07-31') AND NOT Refresh AND TraficSourceID IN (-1, 6) AND RefererHash = halfMD5('http://example.ru/') GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 100;
+SELECT WindowClientWidth, WindowClientHeight, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= toDate('2013-07-01') AND EventDate <= toDate('2013-07-31') AND NOT Refresh AND NOT DontCountHits AND URLHash = halfMD5('http://example.ru/') GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10000;
+SELECT toStartOfMinute(EventTime) AS Minute, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= toDate('2013-07-01') AND EventDate <= toDate('2013-07-02') AND NOT Refresh AND NOT DontCountHits GROUP BY Minute ORDER BY Minute;

From 17e285892df69e07dca0adebda4ad3d36976bb0b Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Sun, 29 Mar 2015 01:23:57 +0300
Subject: [PATCH 46/55] dbms: updated benchmark script [#METR-15716].

---
 dbms/benchmark/clickhouse/benchmark-new.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbms/benchmark/clickhouse/benchmark-new.sh b/dbms/benchmark/clickhouse/benchmark-new.sh
index 2692615bb91..b09ed1c01f6 100755
--- a/dbms/benchmark/clickhouse/benchmark-new.sh
+++ b/dbms/benchmark/clickhouse/benchmark-new.sh
@@ -11,7 +11,7 @@ cat "$QUERIES_FILE" | sed "s/{table}/${TABLE}/g" | while read query; do
 	echo -n "["
 	for i in $(seq 1 $TRIES); do
 		RES=$(clickhouse-client --time --format=Null --query="$query" 2>&1)
-		[[ "$?" == "0" ]] && echo -n "${RES}" || echo "null"
+		[[ "$?" == "0" ]] && echo -n "${RES}" || echo -n "null"
 		[[ "$i" != $TRIES ]] && echo -n ", "
 	done
 	echo "],"

From 79c6b9a204ccb652d48ebe7b2d32019ee1161623 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Sun, 29 Mar 2015 04:51:01 +0300
Subject: [PATCH 47/55] dbms: better diagnostics [#METR-15574].

---
 dbms/src/Parsers/ParserInsertQuery.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/dbms/src/Parsers/ParserInsertQuery.cpp b/dbms/src/Parsers/ParserInsertQuery.cpp
index a1741f62502..7013f8bc05a 100644
--- a/dbms/src/Parsers/ParserInsertQuery.cpp
+++ b/dbms/src/Parsers/ParserInsertQuery.cpp
@@ -111,6 +111,15 @@ bool ParserInsertQuery::parseImpl(Pos & pos, Pos end, ASTPtr & node, Expected &
 		ParserWhiteSpaceOrComments ws_without_nl(false);
 
 		ws_without_nl.ignore(pos, end);
+		if (pos != end && *pos == ';')
+			throw Exception("You have excessive ';' symbol before data for INSERT.\n"
+				"Example:\n\n"
+				"INSERT INTO t (x, y) FORMAT TabSeparated\n"
+				"1\tHello\n"
+				"2\tWorld\n"
+				"\n"
+				"Note that there is no ';' in first line.", ErrorCodes::SYNTAX_ERROR);
+
 		if (pos != end && *pos == '\n')
 			++pos;
 

From 093ba78f800f44afd49ccc2f087e10e030962deb Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Sun, 29 Mar 2015 06:43:17 +0300
Subject: [PATCH 48/55] dbms: fixed Compiler [#METR-2944].

---
 dbms/src/Interpreters/Compiler.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dbms/src/Interpreters/Compiler.cpp b/dbms/src/Interpreters/Compiler.cpp
index bc26fa1339a..995015227ea 100644
--- a/dbms/src/Interpreters/Compiler.cpp
+++ b/dbms/src/Interpreters/Compiler.cpp
@@ -218,6 +218,7 @@ void Compiler::compile(
 		" -I /usr/share/clickhouse/headers/libs/libcityhash/"
 		" -I /usr/share/clickhouse/headers/libs/libcommon/include/"
 		" -I /usr/share/clickhouse/headers/libs/libdouble-conversion/"
+		" -I /usr/share/clickhouse/headers/libs/libcpuid/include/"
 		" -I /usr/share/clickhouse/headers/libs/libmysqlxx/include/"
 		" -I /usr/share/clickhouse/headers/libs/libstatdaemons/include/"
 		" -I /usr/share/clickhouse/headers/libs/libstats/include/"

From 92c2a9ceaaf5257c9abe9fc90bed8c6e4abf58f6 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Sun, 29 Mar 2015 10:13:38 +0300
Subject: [PATCH 49/55] dbms: better diagnostics of errors in TabSeparated
 format (development) [#METR-15574].

---
 .../BlockInputStreamFromRowInputStream.h      |   2 +
 .../DataStreams/TabSeparatedRowInputStream.h  |  29 ++
 dbms/src/Client/Client.cpp                    |  47 +++-
 .../TabSeparatedRowInputStream.cpp            | 248 +++++++++++++++++-
 4 files changed, 314 insertions(+), 12 deletions(-)

diff --git a/dbms/include/DB/DataStreams/BlockInputStreamFromRowInputStream.h b/dbms/include/DB/DataStreams/BlockInputStreamFromRowInputStream.h
index 7385ef2b0dc..a7423ec3470 100644
--- a/dbms/include/DB/DataStreams/BlockInputStreamFromRowInputStream.h
+++ b/dbms/include/DB/DataStreams/BlockInputStreamFromRowInputStream.h
@@ -32,6 +32,8 @@ public:
 		return res.str();
 	}
 
+	RowInputStreamPtr & getRowInput() { return row_input; }	/// Access to the wrapped row stream, e.g. so a caller can downcast it to a format-specific type for diagnostics.
+
 protected:
 	Block readImpl() override;
 
diff --git a/dbms/include/DB/DataStreams/TabSeparatedRowInputStream.h b/dbms/include/DB/DataStreams/TabSeparatedRowInputStream.h
index 41a613800b4..e3ac34a7d64 100644
--- a/dbms/include/DB/DataStreams/TabSeparatedRowInputStream.h
+++ b/dbms/include/DB/DataStreams/TabSeparatedRowInputStream.h
@@ -23,12 +23,41 @@ public:
 	bool read(Row & row) override;
 	void readPrefix() override;
 
+	/** If an exception is thrown while parsing, you can call this function.
+	  * It re-parses the last two rows and prints detailed information about what is happening.
+	  */
+	void printDiagnosticInfo(WriteBuffer & out);
+
 private:
 	ReadBuffer & istr;
 	const Block sample;
 	bool with_names;
 	bool with_types;
 	DataTypes data_types;
+
+	/// For convenient diagnostics in case of an error.
+
+	size_t row_num = 0;
+
+	/// How many bytes have been read, not counting those still in the buffer.
+	size_t bytes_read_at_start_of_buffer_on_current_row = 0;
+	size_t bytes_read_at_start_of_buffer_on_prev_row = 0;
+
+	BufferBase::Position pos_of_current_row = nullptr;
+	BufferBase::Position pos_of_prev_row = nullptr;
+
+	void updateDiagnosticInfo()	/// Called at the start of read(): shifts the "current row" bookkeeping into "prev row" and records where the new row begins.
+	{
+		++row_num;
+
+		bytes_read_at_start_of_buffer_on_prev_row = bytes_read_at_start_of_buffer_on_current_row;
+		bytes_read_at_start_of_buffer_on_current_row = istr.count() - istr.offset();	/// Bytes read so far minus the offset inside the current buffer.
+
+		pos_of_prev_row = pos_of_current_row;
+		pos_of_current_row = istr.position();	/// Raw pointer into the buffer; printDiagnosticInfo checks the byte counters before reusing it.
+	}
+
+	bool parseRowAndPrintDiagnosticInfo(WriteBuffer & out);
 };
 
 }
diff --git a/dbms/src/Client/Client.cpp b/dbms/src/Client/Client.cpp
index 7172a0bfc20..622ee490fb8 100644
--- a/dbms/src/Client/Client.cpp
+++ b/dbms/src/Client/Client.cpp
@@ -38,6 +38,8 @@
 #include <DB/IO/ReadBufferFromIStream.h>
 
 #include <DB/DataStreams/AsynchronousBlockInputStream.h>
+#include <DB/DataStreams/BlockInputStreamFromRowInputStream.h>
+#include <DB/DataStreams/TabSeparatedRowInputStream.h>
 
 #include <DB/Parsers/ParserQuery.h>
 #include <DB/Parsers/ASTSetQuery.h>
@@ -687,21 +689,46 @@ private:
 			if (!insert->format.empty())
 				current_format = insert->format;
 
-		BlockInputStreamPtr block_std_in = new AsynchronousBlockInputStream(context.getFormatFactory().getInput(
-			current_format, buf, sample, insert_format_max_block_size, context.getDataTypeFactory()));
-		block_std_in->readPrefix();
+		BlockInputStreamPtr block_input = context.getFormatFactory().getInput(
+			current_format, buf, sample, insert_format_max_block_size, context.getDataTypeFactory());
 
-		while (true)
+		BlockInputStreamPtr async_block_input = new AsynchronousBlockInputStream(block_input);
+
+		try
 		{
-			Block block = block_std_in->read();
-			connection->sendData(block);
-			processed_rows += block.rows();
+			async_block_input->readPrefix();
 
-			if (!block)
-				break;
+			while (true)
+			{
+				Block block = async_block_input->read();
+				connection->sendData(block);
+				processed_rows += block.rows();
+
+				if (!block)
+					break;
+			}
+
+			async_block_input->readSuffix();
 		}
+		catch (...)		/// TODO Более точно
+		{
+			/** В частном случае - при использовании формата TabSeparated, мы можем вывести более подробную диагностику.
+			  */
 
-		block_std_in->readSuffix();
+			BlockInputStreamFromRowInputStream * concrete_block_input = dynamic_cast<BlockInputStreamFromRowInputStream *>(block_input.get());
+			if (!concrete_block_input)
+				throw;
+
+			RowInputStreamPtr & row_input = concrete_block_input->getRowInput();
+			TabSeparatedRowInputStream * concrete_row_input = dynamic_cast<TabSeparatedRowInputStream *>(row_input.get());
+			if (!concrete_row_input)
+				throw;
+
+			WriteBufferFromFileDescriptor stderr_out(STDERR_FILENO);
+			concrete_row_input->printDiagnosticInfo(stderr_out);
+
+			throw Exception("Cannot parse data in tab separated format.", ErrorCodes::SYNTAX_ERROR);
+		}
 	}
 
 
diff --git a/dbms/src/DataStreams/TabSeparatedRowInputStream.cpp b/dbms/src/DataStreams/TabSeparatedRowInputStream.cpp
index 65e7d2ef00b..bf5f37f0dcc 100644
--- a/dbms/src/DataStreams/TabSeparatedRowInputStream.cpp
+++ b/dbms/src/DataStreams/TabSeparatedRowInputStream.cpp
@@ -1,6 +1,8 @@
 #include <DB/IO/ReadHelpers.h>
+#include <DB/IO/Operators.h>
 
 #include <DB/DataStreams/TabSeparatedRowInputStream.h>
+#include <DB/DataTypes/DataTypesNumberFixed.h>
 
 
 namespace DB
@@ -45,9 +47,11 @@ void TabSeparatedRowInputStream::readPrefix()
 
 bool TabSeparatedRowInputStream::read(Row & row)
 {
+	updateDiagnosticInfo();
+
 	size_t size = data_types.size();
 	row.resize(size);
-	
+
 	for (size_t i = 0; i < size; ++i)
 	{
 		if (i == 0 && istr.eof())
@@ -55,7 +59,7 @@ bool TabSeparatedRowInputStream::read(Row & row)
 			row.clear();
 			return false;
 		}
-		
+
 		data_types[i]->deserializeTextEscaped(row[i], istr);
 
 		/// пропускаем разделители
@@ -71,4 +75,244 @@ bool TabSeparatedRowInputStream::read(Row & row)
 	return true;
 }
 
+/** Rewind the read buffer to the previous row (or to the current row if there is no previous one), re-parse and write a detailed report to out. */
+void TabSeparatedRowInputStream::printDiagnosticInfo(WriteBuffer & out)
+{
+	/// Detailed diagnostics can be printed only if both the last and the next-to-last row are still in the read buffer.
+	size_t bytes_read_at_start_of_buffer = istr.count() - istr.offset();
+	if (bytes_read_at_start_of_buffer != bytes_read_at_start_of_buffer_on_prev_row)
+	{
+		out << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n";
+		return;
+	}
+
+	/// Откатываем курсор для чтения на начало предыдущей или текущей строки и парсим всё заново. Но теперь выводим подробную информацию.
+
+	if (pos_of_prev_row)
+	{
+		istr.position() = pos_of_prev_row;
+
+		out << "\nRow " << (row_num - 1) << ":\n";
+		if (!parseRowAndPrintDiagnosticInfo(out))
+			return;
+	}
+	else
+	{
+		if (!pos_of_current_row)
+		{
+			out << "Could not print diagnostic info because parsing of data hasn't started.\n";
+			return;
+		}
+
+		istr.position() = pos_of_current_row;
+	}
+
+	out << "\nRow " << row_num << ":\n";
+	parseRowAndPrintDiagnosticInfo(out);
+	out << "\n";
+}
+
+/** Write the bytes of [begin, end) to out in double quotes, spelling out control characters, backslash and quotes as tags like <TAB>; an empty range is written as <EMPTY>. */
+static void verbosePrintString(BufferBase::Position begin, BufferBase::Position end, WriteBuffer & out)
+{
+	if (end == begin)
+	{
+		out << "<EMPTY>";
+		return;
+	}
+
+	out << "\"";
+
+	for (auto pos = begin; pos < end; ++pos)
+	{
+		switch (*pos)
+		{
+			case '\0':
+				out << "<ASCII NUL>";
+				break;
+			case '\b':
+				out << "<BACKSPACE>";
+				break;
+			case '\f':
+				out << "<FORM FEED>";
+				break;
+			case '\n':
+				out << "<LINE FEED>";
+				break;
+			case '\r':
+				out << "<CARRIAGE RETURN>";
+				break;
+			case '\t':
+				out << "<TAB>";
+				break;
+			case '\\':
+				out << "<BACKSLASH>";
+				break;
+			case '"':
+				out << "<DOUBLE QUOTE>";
+				break;
+			case '\'':
+				out << "<SINGLE QUOTE>";
+				break;
+			/// Remaining characters with codes 0..31 are written as <0xNN> in hex; everything else is copied unchanged.
+			default:
+			{
+				if (*pos >= 0 && *pos < 32)
+				{
+					static const char * hex = "0123456789ABCDEF";
+					out << "<0x" << hex[*pos / 16] << hex[*pos % 16] << ">";
+				}
+				else
+					out << *pos;
+			}
+		}
+	}
+
+	out << "\"";
+}
+
+
+bool TabSeparatedRowInputStream::parseRowAndPrintDiagnosticInfo(WriteBuffer & out)
+{
+	size_t size = data_types.size();
+	for (size_t i = 0; i < size; ++i)
+	{
+		if (i == 0 && istr.eof())
+		{
+			out << "<End of stream>\n";
+			return false;
+		}
+
+		out << "Column " << i << ", name: " << sample.getByPosition(i).name << ", type: " << data_types[i]->getName();
+
+		auto prev_position = istr.position();
+		std::exception_ptr exception;
+
+		Field field;
+		try
+		{
+			data_types[i]->deserializeTextEscaped(field, istr);
+		}
+		catch (...)
+		{
+			exception = std::current_exception();
+		}
+
+		auto curr_position = istr.position();
+
+		if (curr_position < prev_position)
+			throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);
+
+		if (data_types[i]->isNumeric())
+		{
+			/// Пустая строка вместо числа.
+			if (curr_position == prev_position)
+			{
+				out << ", ERROR: text ";
+				verbosePrintString(prev_position, std::min(prev_position + 10, istr.buffer().end()), out);
+				out << " is not like number\n";
+				return false;
+			}
+		}
+
+		out << ", parsed text: ";
+		verbosePrintString(prev_position, curr_position, out);
+
+		if (exception)
+		{
+			out << ", ERROR\n";
+			return false;
+		}
+
+		out << " as " << apply_visitor(FieldVisitorToString(), field) << "\n";
+
+		if (data_types[i]->isNumeric())
+		{
+			if (*curr_position != '\n' && *curr_position != '\t')
+			{
+				out << "ERROR: garbage after number: ";
+				verbosePrintString(curr_position, std::min(curr_position + 10, istr.buffer().end()), out);
+				out << "\n";
+				return false;
+			}
+		}
+
+		if (   (typeid_cast<const DataTypeUInt8  *>(data_types[i].get()) && field.get<UInt64>() > std::numeric_limits<UInt8>::max())
+			|| (typeid_cast<const DataTypeUInt16 *>(data_types[i].get()) && field.get<UInt64>() > std::numeric_limits<UInt16>::max())
+			|| (typeid_cast<const DataTypeUInt32 *>(data_types[i].get()) && field.get<UInt64>() > std::numeric_limits<UInt32>::max())
+			|| (typeid_cast<const DataTypeInt8 *>(data_types[i].get())
+				&& (field.get<Int64>() > std::numeric_limits<Int8>::max() || field.get<Int64>() < std::numeric_limits<Int8>::min()))
+			|| (typeid_cast<const DataTypeInt16 *>(data_types[i].get())
+				&& (field.get<Int64>() > std::numeric_limits<Int16>::max() || field.get<Int64>() < std::numeric_limits<Int16>::min()))
+			|| (typeid_cast<const DataTypeInt32 *>(data_types[i].get())
+				&& (field.get<Int64>() > std::numeric_limits<Int32>::max() || field.get<Int64>() < std::numeric_limits<Int32>::min())))
+		{
+			out << "ERROR: parsed number is out of range of data type.\n";
+			return false;
+		}
+
+		/// Delimiters: a line feed is expected after the last column, a tab after every other one.
+		if (i + 1 == size)
+		{
+			if (!istr.eof())
+			{
+				try
+				{
+					assertString("\n", istr);
+				}
+				catch (const DB::Exception &)
+				{
+					if (*istr.position() == '\t')
+					{
+						out << "ERROR: Tab found where line feed is expected."
+							" It's like your file has more columns than expected.\n"
+							"And if your file have right number of columns, maybe it have unescaped tab in value.\n";
+					}
+					else if (*istr.position() == '\r')
+					{
+						out << "ERROR: Carriage return found where line feed is expected."
+							" It's like your file has DOS/Windows style line separators, that is illegal in TabSeparated format.\n";
+					}
+					else
+					{
+						out << "ERROR: There is no line feed. ";
+						verbosePrintString(istr.position(), istr.position() + 1, out);
+						out << " found instead.\n";
+					}
+					return false;
+				}
+			}
+		}
+		else
+		{
+			try
+			{
+				assertString("\t", istr);
+			}
+			catch (const DB::Exception &)
+			{
+				if (*istr.position() == '\n')
+				{
+					out << "ERROR: Line feed found where tab is expected."
+						" It's like your file has less columns than expected.\n"
+						"And if your file have right number of columns, maybe it have unescaped backslash in value before tab, which cause tab has escaped.\n";
+				}
+				else if (*istr.position() == '\r')
+				{
+					out << "ERROR: Carriage return found where tab is expected.\n";
+				}
+				else
+				{
+					out << "ERROR: There is no tab. ";
+					verbosePrintString(istr.position(), istr.position() + 1, out);
+					out << " found instead.\n";
+				}
+				return false;
+			}
+		}
+	}
+
+	return true;
+}
+
 }

From 3d80f45b71791be948cddd48df48a3ca580d7b6a Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Sun, 29 Mar 2015 11:44:04 +0300
Subject: [PATCH 50/55] dbms: better diagnostics of errors in TabSeparated
 format (development) [#METR-15574].

---
 .../DataStreams/TabSeparatedRowInputStream.h  |  2 +-
 dbms/src/Client/Client.cpp                    |  3 +-
 .../TabSeparatedRowInputStream.cpp            | 62 ++++++++++++++++---
 3 files changed, 55 insertions(+), 12 deletions(-)

diff --git a/dbms/include/DB/DataStreams/TabSeparatedRowInputStream.h b/dbms/include/DB/DataStreams/TabSeparatedRowInputStream.h
index e3ac34a7d64..a69b4452d9a 100644
--- a/dbms/include/DB/DataStreams/TabSeparatedRowInputStream.h
+++ b/dbms/include/DB/DataStreams/TabSeparatedRowInputStream.h
@@ -57,7 +57,7 @@ private:
 		pos_of_current_row = istr.position();
 	}
 
-	bool parseRowAndPrintDiagnosticInfo(WriteBuffer & out);
+	bool parseRowAndPrintDiagnosticInfo(WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name);
 };
 
 }
diff --git a/dbms/src/Client/Client.cpp b/dbms/src/Client/Client.cpp
index 622ee490fb8..23ee24dfd68 100644
--- a/dbms/src/Client/Client.cpp
+++ b/dbms/src/Client/Client.cpp
@@ -59,6 +59,7 @@
 #include <DB/Common/ExternalTable.h>
 #include <DB/Common/UnicodeBar.h>
 #include <DB/Common/formatReadable.h>
+#include <DB/Columns/ColumnString.h>
 
 
 /// http://en.wikipedia.org/wiki/ANSI_escape_code
@@ -727,7 +728,7 @@ private:
 			WriteBufferFromFileDescriptor stderr_out(STDERR_FILENO);
 			concrete_row_input->printDiagnosticInfo(stderr_out);
 
-			throw Exception("Cannot parse data in tab separated format.", ErrorCodes::SYNTAX_ERROR);
+			throw;
 		}
 	}
 
diff --git a/dbms/src/DataStreams/TabSeparatedRowInputStream.cpp b/dbms/src/DataStreams/TabSeparatedRowInputStream.cpp
index bf5f37f0dcc..6fb8e0f3bd0 100644
--- a/dbms/src/DataStreams/TabSeparatedRowInputStream.cpp
+++ b/dbms/src/DataStreams/TabSeparatedRowInputStream.cpp
@@ -45,6 +45,19 @@ void TabSeparatedRowInputStream::readPrefix()
 }
 
 
+/** Check for a common mistake: input that uses Windows-style line endings.
+  * Throws Exception with code INCORRECT_DATA if '\r' is at the current read position or immediately before it. */
+static void checkForCarriageReturn(ReadBuffer & istr)
+{
+	if (istr.position()[0] == '\r' || (istr.position() != istr.buffer().begin() && istr.position()[-1] == '\r'))
+		throw Exception("You have carriage return (\\r, 0x0D, ASCII 13) at end of first row."
+			" It's like your input data have DOS/Windows style line separators, that are illegal in TabSeparated format."
+			" You must transform your file to Unix format."
+			" But if you really need carriage return at end of string value of last column, you need to escape it as \\r.",
+			ErrorCodes::INCORRECT_DATA);
+}
+
+
 bool TabSeparatedRowInputStream::read(Row & row)
 {
 	updateDiagnosticInfo();
@@ -66,7 +79,12 @@ bool TabSeparatedRowInputStream::read(Row & row)
 		if (i + 1 == size)
 		{
 			if (!istr.eof())
+			{
+				if (unlikely(row_num == 1))
+					checkForCarriageReturn(istr);
+
 				assertString("\n", istr);
+			}
 		}
 		else
 			assertString("\t", istr);
@@ -86,6 +104,16 @@ void TabSeparatedRowInputStream::printDiagnosticInfo(WriteBuffer & out)
 		return;
 	}
 
+	size_t max_length_of_column_name = 0;
+	for (size_t i = 0; i < sample.columns(); ++i)
+		if (sample.getByPosition(i).name.size() > max_length_of_column_name)
+			max_length_of_column_name = sample.getByPosition(i).name.size();
+
+	size_t max_length_of_data_type_name = 0;
+	for (size_t i = 0; i < sample.columns(); ++i)
+		if (sample.getByPosition(i).type->getName().size() > max_length_of_data_type_name)
+			max_length_of_data_type_name = sample.getByPosition(i).type->getName().size();
+
 	/// Откатываем курсор для чтения на начало предыдущей или текущей строки и парсим всё заново. Но теперь выводим подробную информацию.
 
 	if (pos_of_prev_row)
@@ -93,7 +121,7 @@ void TabSeparatedRowInputStream::printDiagnosticInfo(WriteBuffer & out)
 		istr.position() = pos_of_prev_row;
 
 		out << "\nRow " << (row_num - 1) << ":\n";
-		if (!parseRowAndPrintDiagnosticInfo(out))
+		if (!parseRowAndPrintDiagnosticInfo(out, max_length_of_column_name, max_length_of_data_type_name))
 			return;
 	}
 	else
@@ -108,7 +136,7 @@ void TabSeparatedRowInputStream::printDiagnosticInfo(WriteBuffer & out)
 	}
 
 	out << "\nRow " << row_num << ":\n";
-	parseRowAndPrintDiagnosticInfo(out);
+	parseRowAndPrintDiagnosticInfo(out, max_length_of_column_name, max_length_of_data_type_name);
 	out << "\n";
 }
 
@@ -172,7 +200,8 @@ static void verbosePrintString(BufferBase::Position begin, BufferBase::Position
 }
 
 
-bool TabSeparatedRowInputStream::parseRowAndPrintDiagnosticInfo(WriteBuffer & out)
+bool TabSeparatedRowInputStream::parseRowAndPrintDiagnosticInfo(
+	WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
 {
 	size_t size = data_types.size();
 	for (size_t i = 0; i < size; ++i)
@@ -183,7 +212,9 @@ bool TabSeparatedRowInputStream::parseRowAndPrintDiagnosticInfo(WriteBuffer & ou
 			return false;
 		}
 
-		out << "Column " << i << ", name: " << sample.getByPosition(i).name << ", type: " << data_types[i]->getName();
+		out << "Column " << i << ", " << std::string((i < 10 ? 2 : i < 100 ? 1 : 0), ' ')
+			<< "name: " << sample.getByPosition(i).name << ", " << std::string(max_length_of_column_name - sample.getByPosition(i).name.size(), ' ')
+			<< "type: " << data_types[i]->getName() << ", " << std::string(max_length_of_data_type_name - data_types[i]->getName().size(), ' ');
 
 		auto prev_position = istr.position();
 		std::exception_ptr exception;
@@ -208,31 +239,42 @@ bool TabSeparatedRowInputStream::parseRowAndPrintDiagnosticInfo(WriteBuffer & ou
 			/// Пустая строка вместо числа.
 			if (curr_position == prev_position)
 			{
-				out << ", ERROR: text ";
+				out << "ERROR: text ";
 				verbosePrintString(prev_position, std::min(prev_position + 10, istr.buffer().end()), out);
-				out << " is not like number\n";
+				out << " is not like " << data_types[i]->getName() << "\n";
 				return false;
 			}
 		}
 
-		out << ", parsed text: ";
+		out << "parsed text: ";
 		verbosePrintString(prev_position, curr_position, out);
 
 		if (exception)
 		{
-			out << ", ERROR\n";
+			if (data_types[i]->getName() == "DateTime")
+				out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss format.\n";
+			else if (data_types[i]->getName() == "Date")
+				out << "ERROR: Date must be in YYYY-MM-DD format.\n";
+			else
+				out << "ERROR\n";
 			return false;
 		}
 
-		out << " as " << apply_visitor(FieldVisitorToString(), field) << "\n";
+		out << "\n";
 
 		if (data_types[i]->isNumeric())
 		{
 			if (*curr_position != '\n' && *curr_position != '\t')
 			{
-				out << "ERROR: garbage after number: ";
+				out << "ERROR: garbage after " << data_types[i]->getName() << ": ";
 				verbosePrintString(curr_position, std::min(curr_position + 10, istr.buffer().end()), out);
 				out << "\n";
+
+				if (data_types[i]->getName() == "DateTime")
+					out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss format.\n";
+				else if (data_types[i]->getName() == "Date")
+					out << "ERROR: Date must be in YYYY-MM-DD format.\n";
+
 				return false;
 			}
 		}

From be630f9faebe1b9f0c2052ae49d66744022ebd8f Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Sun, 29 Mar 2015 12:02:24 +0300
Subject: [PATCH 51/55] dbms: better diagnostics of errors in TabSeparated
 format [#METR-15574].

---
 .../DataStreams/TabSeparatedRowInputStream.h  | 10 ++--
 dbms/src/Client/Client.cpp                    | 40 +++----------
 .../TabSeparatedRowInputStream.cpp            | 56 ++++++++++++-------
 3 files changed, 49 insertions(+), 57 deletions(-)

diff --git a/dbms/include/DB/DataStreams/TabSeparatedRowInputStream.h b/dbms/include/DB/DataStreams/TabSeparatedRowInputStream.h
index a69b4452d9a..1b894948411 100644
--- a/dbms/include/DB/DataStreams/TabSeparatedRowInputStream.h
+++ b/dbms/include/DB/DataStreams/TabSeparatedRowInputStream.h
@@ -23,11 +23,6 @@ public:
 	bool read(Row & row) override;
 	void readPrefix() override;
 
-	/** В случае исключения при парсинге, вы можете вызвать эту функцию.
-	  * Она выполняет заново парсинг последних двух строк и выводит подробную информацию о том, что происходит.
-	  */
-	void printDiagnosticInfo(WriteBuffer & out);
-
 private:
 	ReadBuffer & istr;
 	const Block sample;
@@ -46,6 +41,11 @@ private:
 	BufferBase::Position pos_of_current_row = nullptr;
 	BufferBase::Position pos_of_prev_row = nullptr;
 
+	/** This function is called when an exception is thrown during parsing.
+	  * It re-parses the last two rows and prints detailed information about what is happening.
+	  */
+	void printDiagnosticInfo(WriteBuffer & out);
+
 	void updateDiagnosticInfo()
 	{
 		++row_num;
diff --git a/dbms/src/Client/Client.cpp b/dbms/src/Client/Client.cpp
index 23ee24dfd68..e86326bc70c 100644
--- a/dbms/src/Client/Client.cpp
+++ b/dbms/src/Client/Client.cpp
@@ -695,41 +695,19 @@ private:
 
 		BlockInputStreamPtr async_block_input = new AsynchronousBlockInputStream(block_input);
 
-		try
+		async_block_input->readPrefix();
+
+		while (true)
 		{
-			async_block_input->readPrefix();
+			Block block = async_block_input->read();
+			connection->sendData(block);
+			processed_rows += block.rows();
 
-			while (true)
-			{
-				Block block = async_block_input->read();
-				connection->sendData(block);
-				processed_rows += block.rows();
-
-				if (!block)
-					break;
-			}
-
-			async_block_input->readSuffix();
+			if (!block)
+				break;
 		}
-		catch (...)		/// TODO Более точно
-		{
-			/** В частном случае - при использовании формата TabSeparated, мы можем вывести более подробную диагностику.
-			  */
 
-			BlockInputStreamFromRowInputStream * concrete_block_input = dynamic_cast<BlockInputStreamFromRowInputStream *>(block_input.get());
-			if (!concrete_block_input)
-				throw;
-
-			RowInputStreamPtr & row_input = concrete_block_input->getRowInput();
-			TabSeparatedRowInputStream * concrete_row_input = dynamic_cast<TabSeparatedRowInputStream *>(row_input.get());
-			if (!concrete_row_input)
-				throw;
-
-			WriteBufferFromFileDescriptor stderr_out(STDERR_FILENO);
-			concrete_row_input->printDiagnosticInfo(stderr_out);
-
-			throw;
-		}
+		async_block_input->readSuffix();
 	}
 
 
diff --git a/dbms/src/DataStreams/TabSeparatedRowInputStream.cpp b/dbms/src/DataStreams/TabSeparatedRowInputStream.cpp
index 6fb8e0f3bd0..93f917e4144 100644
--- a/dbms/src/DataStreams/TabSeparatedRowInputStream.cpp
+++ b/dbms/src/DataStreams/TabSeparatedRowInputStream.cpp
@@ -50,10 +50,10 @@ void TabSeparatedRowInputStream::readPrefix()
 static void checkForCarriageReturn(ReadBuffer & istr)
 {
 	if (istr.position()[0] == '\r' || (istr.position() != istr.buffer().begin() && istr.position()[-1] == '\r'))
-		throw Exception("You have carriage return (\\r, 0x0D, ASCII 13) at end of first row."
-			" It's like your input data have DOS/Windows style line separators, that are illegal in TabSeparated format."
+		throw Exception("\nYou have carriage return (\\r, 0x0D, ASCII 13) at end of first row."
+			"\nIt's like your input data has DOS/Windows style line separators, that are illegal in TabSeparated format."
 			" You must transform your file to Unix format."
-			" But if you really need carriage return at end of string value of last column, you need to escape it as \\r.",
+			"\nBut if you really need carriage return at end of string value of last column, you need to escape it as \\r.",
 			ErrorCodes::INCORRECT_DATA);
 }
 
@@ -65,29 +65,43 @@ bool TabSeparatedRowInputStream::read(Row & row)
 	size_t size = data_types.size();
 	row.resize(size);
 
-	for (size_t i = 0; i < size; ++i)
+	try
 	{
-		if (i == 0 && istr.eof())
+		for (size_t i = 0; i < size; ++i)
 		{
-			row.clear();
-			return false;
-		}
-
-		data_types[i]->deserializeTextEscaped(row[i], istr);
-
-		/// пропускаем разделители
-		if (i + 1 == size)
-		{
-			if (!istr.eof())
+			if (i == 0 && istr.eof())
 			{
-				if (unlikely(row_num == 1))
-					checkForCarriageReturn(istr);
-
-				assertString("\n", istr);
+				row.clear();
+				return false;
 			}
+
+			data_types[i]->deserializeTextEscaped(row[i], istr);
+
+			/// skip the delimiters
+			if (i + 1 == size)
+			{
+				if (!istr.eof())
+				{
+					if (unlikely(row_num == 1))
+						checkForCarriageReturn(istr);
+
+					assertString("\n", istr);
+				}
+			}
+			else
+				assertString("\t", istr);
 		}
-		else
-			assertString("\t", istr);
+	}
+	catch (Exception & e)
+	{
+		String verbose_diagnostic;
+		{
+			WriteBufferFromString diagnostic_out(verbose_diagnostic);
+			printDiagnosticInfo(diagnostic_out);
+		}
+
+		e.addMessage("\n" + verbose_diagnostic);
+		throw;
 	}
 
 	return true;

From d3d3329d794da5fb58c5260434325bf5dd193522 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Sun, 29 Mar 2015 12:20:28 +0300
Subject: [PATCH 52/55] dbms: updated default setting for compiler
 [#METR-2944].

---
 dbms/include/DB/Interpreters/Settings.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbms/include/DB/Interpreters/Settings.h b/dbms/include/DB/Interpreters/Settings.h
index f14bf61d7dc..0ade61bdd1e 100644
--- a/dbms/include/DB/Interpreters/Settings.h
+++ b/dbms/include/DB/Interpreters/Settings.h
@@ -86,7 +86,7 @@ struct Settings
 	/** Включена ли компиляция запросов. */ \
 	M(SettingBool, compile, false) \
 	/** Количество одинаковых по структуре запросов перед тем, как инициируется их компиляция. */ \
-	M(SettingUInt64, min_count_to_compile, 0) \
+	M(SettingUInt64, min_count_to_compile, 3) \
 	/** При каком количестве ключей, начинает использоваться двухуровневая агрегация. 0 - никогда не использовать. */ \
 	M(SettingUInt64, group_by_two_level_threshold, 100000) \
 	\

From 05c426aa6ba182be42ca21f9d83b78f6b38c3f52 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Mon, 30 Mar 2015 08:50:13 +0300
Subject: [PATCH 53/55] dbms: updated benchmark instructions for Vertica
 [#METR-15716].

---
 dbms/benchmark/create_dump.sh                 | 19 +----
 dbms/benchmark/vertica/README                 | 40 +++++++++++
 dbms/benchmark/vertica/expect.tcl             | 21 ------
 dbms/benchmark/vertica/hits_define_schema.sql | 40 +++++------
 dbms/benchmark/vertica/queries_for_script.sql | 70 +------------------
 5 files changed, 59 insertions(+), 131 deletions(-)
 create mode 100644 dbms/benchmark/vertica/README
 delete mode 100644 dbms/benchmark/vertica/expect.tcl

diff --git a/dbms/benchmark/create_dump.sh b/dbms/benchmark/create_dump.sh
index 3b42fb5716d..287ecb1bf46 100755
--- a/dbms/benchmark/create_dump.sh
+++ b/dbms/benchmark/create_dump.sh
@@ -1,18 +1,3 @@
-path=/opt/dump/dump_0.3
-db_name=hits_1b
-num=1000000000
+#!/bin/bash
 
-dump_replaced=$path/dump_"$db_name"_replaced.tsv
-dump_meshed=$path/dump_"$db_name"_meshed.tsv
-dump_meshed_utf8=$path/dump_"$db_name"_meshed_utf8.tsv
-
-clickhouse-client --query="SET GLOBAL max_block_size=100000"
-clickhouse-client --query="SET GLOBAL max_threads=1"
-
-clickhouse-client --query="SELECT toInt64(WatchID), JavaEnable, Title, GoodEvent, (EventTime < toDateTime('1971-01-01 00:00:00') ? toDateTime('1971-01-01 00:00:01') : EventTime), (EventDate < toDate('1971-01-01') ? toDate('1971-01-01') : EventDate), CounterID, ClientIP, RegionID, toInt64(UserID), CounterClass, OS, UserAgent, URL, Referer, Refresh, RefererCategoryID, RefererRegionID, URLCategoryID, URLRegionID, ResolutionWidth, ResolutionHeight, ResolutionDepth, FlashMajor, FlashMinor, FlashMinor2, NetMajor, NetMinor, UserAgentMajor, UserAgentMinor, CookieEnable, JavascriptEnable, IsMobile, MobilePhone, MobilePhoneModel, Params, IPNetworkID, TraficSourceID, SearchEngineID, SearchPhrase, AdvEngineID, IsArtifical, WindowClientWidth, WindowClientHeight, ClientTimeZone, (ClientEventTime < toDateTime('1971-01-01 00:00:01') ? toDateTime('1971-01-01 00:00:01') : ClientEventTime), SilverlightVersion1, SilverlightVersion2, SilverlightVersion3, SilverlightVersion4, PageCharset, CodeVersion, IsLink, IsDownload, IsNotBounce, toInt64(FUniqID), OriginalURL, HID, IsOldCounter, IsEvent, IsParameter, DontCountHits, WithHash, HitColor, (LocalEventTime < toDateTime('1971-01-01 00:00:01') ? toDateTime('1971-01-01 00:00:01') : LocalEventTime), Age, Sex, Income, Interests, Robotness, RemoteIP, WindowName, OpenerName, HistoryLength, BrowserLanguage, BrowserCountry, SocialNetwork, SocialAction, HTTPError, SendTiming, DNSTiming, ConnectTiming, ResponseStartTiming, ResponseEndTiming, FetchTiming, SocialSourceNetworkID, SocialSourcePage, ParamPrice, ParamOrderID, ParamCurrency, ParamCurrencyID, OpenstatServiceName, OpenstatCampaignID, OpenstatAdID, OpenstatSourceID, UTMSource, UTMMedium, UTMCampaign, UTMContent, UTMTerm, FromTag, HasGCLID, toInt64(RefererHash), toInt64(URLHash), CLID, toInt64(intHash32(UserID)) FROM hits_mt_test_1b LIMIT $num FORMAT TabSeparated" > $dump_replaced
-
-/etc/init.d/clickhouse-server-metrika-yandex-ulimit restart
-
-sudo nsort -format=maximum_size:65535 -k1 -T /opt -o $dump_meshed $dump_replaced
-
-cat $dump_meshed | iconv -futf8 -tutf8//IGNORE 2>/dev/null 1> $dump_meshed_utf8
\ No newline at end of file
+table=hits_10m; time clickhouse-client --max_bytes_before_external_sort=30000000000 --query="SELECT toInt64(WatchID), JavaEnable, Title, GoodEvent, (EventTime < toDateTime('1971-01-01 00:00:00') ? toDateTime('1971-01-01 00:00:01') : EventTime), (EventDate < toDate('1971-01-01') ? toDate('1971-01-01') : EventDate), CounterID, ClientIP, RegionID, toInt64(UserID), CounterClass, OS, UserAgent, URL, Referer, Refresh, RefererCategoryID, RefererRegionID, URLCategoryID, URLRegionID, ResolutionWidth, ResolutionHeight, ResolutionDepth, FlashMajor, FlashMinor, FlashMinor2, NetMajor, NetMinor, UserAgentMajor, UserAgentMinor, CookieEnable, JavascriptEnable, IsMobile, MobilePhone, MobilePhoneModel, Params, IPNetworkID, TraficSourceID, SearchEngineID, SearchPhrase, AdvEngineID, IsArtifical, WindowClientWidth, WindowClientHeight, ClientTimeZone, (ClientEventTime < toDateTime('1971-01-01 00:00:01') ? toDateTime('1971-01-01 00:00:01') : ClientEventTime), SilverlightVersion1, SilverlightVersion2, SilverlightVersion3, SilverlightVersion4, PageCharset, CodeVersion, IsLink, IsDownload, IsNotBounce, toInt64(FUniqID), OriginalURL, HID, IsOldCounter, IsEvent, IsParameter, DontCountHits, WithHash, HitColor, (LocalEventTime < toDateTime('1971-01-01 00:00:01') ? toDateTime('1971-01-01 00:00:01') : LocalEventTime), Age, Sex, Income, Interests, Robotness, RemoteIP, WindowName, OpenerName, HistoryLength, BrowserLanguage, BrowserCountry, SocialNetwork, SocialAction, HTTPError, SendTiming, DNSTiming, ConnectTiming, ResponseStartTiming, ResponseEndTiming, FetchTiming, SocialSourceNetworkID, SocialSourcePage, ParamPrice, ParamOrderID, ParamCurrency, ParamCurrencyID, OpenstatServiceName, OpenstatCampaignID, OpenstatAdID, OpenstatSourceID, UTMSource, UTMMedium, UTMCampaign, UTMContent, UTMTerm, FromTag, HasGCLID, toInt64(RefererHash), toInt64(URLHash), CLID FROM $table ORDER BY rand()" | corrector_utf8 > /opt/dumps/${table}_corrected.tsv
diff --git a/dbms/benchmark/vertica/README b/dbms/benchmark/vertica/README
new file mode 100644
index 00000000000..a6753f903b2
--- /dev/null
+++ b/dbms/benchmark/vertica/README
@@ -0,0 +1,40 @@
+Quick installation instructions
+-------------------------------
+
+Register on my.vertica.com
+https://my.vertica.com/download-community-edition/
+Download HP Vertica 7.1.1 Analytic Database Server, Debian or Ubuntu 14.04 version.
+
+sudo apt-get install sysstat pstack mcelog
+sudo dpkg -i vertica_7.1.1-0_amd64.deb
+sudo sh -c "echo 'export TZ=Europe/Moscow' >> /home/dbadmin/.bash_profile"
+sudo /opt/vertica/sbin/install_vertica --hosts=127.0.0.1 --failure-threshold=NONE
+sudo mkdir /opt/vertica-data/
+sudo chown dbadmin /opt/vertica-data/
+
+sudo su dbadmin
+/opt/vertica/bin/adminTools
+
+configuration menu
+create database
+name: default
+empty password
+both directories: /opt/vertica-data/
+main menu
+exit
+
+PS. Note that Vertica doesn't support IPv6.
+
+How to prepare data
+-------------------
+
+Prepare dumps with script create_dump.sh for tables hits_10m, hits_100m, hits_1000m. It takes about 5 hours (1m41.882s, 25m11.103s, ).
+Start vsql command line client.
+Create tables with queries from hits_define_schema.sql.
+
+Time to insert data:
+hits_10m: 91 sec.
+hits_100m: 774 sec.
+hits_1000m: 
+
+You need to validate number of rows with SELECT count(*).
diff --git a/dbms/benchmark/vertica/expect.tcl b/dbms/benchmark/vertica/expect.tcl
deleted file mode 100644
index 2b9b0a14011..00000000000
--- a/dbms/benchmark/vertica/expect.tcl
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-#!/bin/expect
-
-# Set timeout
-set timeout 600
-
-# Get arguments
-set query [lindex $argv 0]
-
-spawn vsql -eU dbadmin
-
-expect "dbadmin=>"
-send "\\timing\r"
-
-expect "dbadmin=>"
-send "$query\r"
-
-expect "dbadmin=>"
-send "\\q\r"
-
-expect eof
\ No newline at end of file
diff --git a/dbms/benchmark/vertica/hits_define_schema.sql b/dbms/benchmark/vertica/hits_define_schema.sql
index e230ce43e31..37c9d45ffca 100644
--- a/dbms/benchmark/vertica/hits_define_schema.sql
+++ b/dbms/benchmark/vertica/hits_define_schema.sql
@@ -1,6 +1,6 @@
 \timing
 
-create table hits_10m_meshed
+create table hits_10m
 (
     WatchID INTEGER,
     JavaEnable INTEGER,
@@ -106,17 +106,14 @@ create table hits_10m_meshed
     HasGCLID INTEGER,
     RefererHash INTEGER,
     URLHash INTEGER,
-    CLID INTEGER, 
-    UserIDHash INTEGER
-) ORDER BY CounterID, EventDate, UserIDHash, EventTime;
+    CLID INTEGER
+) ORDER BY CounterID, EventDate, UserID, EventTime;
+
+\set input_file '''/opt/dumps/hits_10m_corrected.tsv'''
+COPY hits_10m FROM :input_file DELIMITER E'\t' DIRECT;
 
 
-\set input_file '''/opt/dump/dump_0.3/dump_hits_10m_meshed.tsv''' 
-COPY hits_10m_meshed FROM :input_file DELIMITER E'\t' DIRECT;
-
-
-
-create table hits_100m_meshed
+create table hits_100m
 (
     WatchID INTEGER,
     JavaEnable INTEGER,
@@ -222,17 +219,14 @@ create table hits_100m_meshed
     HasGCLID INTEGER,
     RefererHash INTEGER,
     URLHash INTEGER,
-    CLID INTEGER, 
-    UserIDHash INTEGER
-) ORDER BY CounterID, EventDate, UserIDHash, EventTime;;
+    CLID INTEGER
+) ORDER BY CounterID, EventDate, UserID, EventTime;
 
-\set input_file '''/opt/dump/dump_0.3/dump_hits_100m_meshed.tsv''' 
-COPY hits_100m_meshed FROM :input_file DELIMITER E'\t' DIRECT;
+\set input_file '''/opt/dumps/hits_100m_corrected.tsv'''
+COPY hits_100m FROM :input_file DELIMITER E'\t' DIRECT;
 
 
-
-
-create table hits_1b_meshed
+create table hits_1000m
 (
     WatchID INTEGER,
     JavaEnable INTEGER,
@@ -338,10 +332,8 @@ create table hits_1b_meshed
     HasGCLID INTEGER,
     RefererHash INTEGER,
     URLHash INTEGER,
-    CLID INTEGER, 
-    UserIDHash INTEGER
-) ORDER BY CounterID, EventDate, UserIDHash, EventTime;
+    CLID INTEGER
+) ORDER BY CounterID, EventDate, UserID, EventTime;
 
-
-\set input_file '''/opt/dump/dump_0.3/dump_hits_1b_meshed.tsv''' 
-COPY hits_1b_meshed FROM :input_file DELIMITER E'\t' DIRECT;
+\set input_file '''/opt/dumps/hits_1000m_corrected.tsv''' 
+COPY hits_1000m FROM :input_file DELIMITER E'\t' DIRECT;
diff --git a/dbms/benchmark/vertica/queries_for_script.sql b/dbms/benchmark/vertica/queries_for_script.sql
index 654faaeef58..7eb238f9e4f 100644
--- a/dbms/benchmark/vertica/queries_for_script.sql
+++ b/dbms/benchmark/vertica/queries_for_script.sql
@@ -5,107 +5,39 @@ SELECT sum_float(UserID) FROM hits_100m_meshed;
 SELECT count(DISTINCT UserID) FROM hits_100m_meshed;
 SELECT count(DISTINCT SearchPhrase) FROM hits_100m_meshed;
 SELECT min(EventDate), max(EventDate) FROM hits_100m_meshed;
-
 SELECT AdvEngineID, count(*) FROM hits_100m_meshed WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY count(*) DESC;
--- мощная фильтрация. После фильтрации почти ничего не остаётся, но делаем ещё агрегацию.;
-
 SELECT RegionID, count(DISTINCT UserID) AS u FROM hits_100m_meshed GROUP BY RegionID ORDER BY u DESC LIMIT 10;
--- агрегация, среднее количество ключей.;
-
 SELECT RegionID, sum(AdvEngineID), count(*) AS c, avg(ResolutionWidth), count(DISTINCT UserID) FROM hits_100m_meshed GROUP BY RegionID ORDER BY count(*) DESC LIMIT 10;
--- агрегация, среднее количество ключей, несколько агрегатных функций.;
-
 SELECT MobilePhoneModel, count(DISTINCT UserID) AS u FROM hits_100m_meshed WHERE MobilePhoneModel != '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
--- мощная фильтрация по строкам, затем агрегация по строкам.;
-
 SELECT MobilePhone, MobilePhoneModel, count(DISTINCT UserID) AS u FROM hits_100m_meshed WHERE MobilePhoneModel != '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
--- мощная фильтрация по строкам, затем агрегация по паре из числа и строки.;
-
 SELECT SearchPhrase, count(*) FROM hits_100m_meshed WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
--- средняя фильтрация по строкам, затем агрегация по строкам, большое количество ключей.;
-
 SELECT SearchPhrase, count(DISTINCT UserID) AS u FROM hits_100m_meshed WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
--- агрегация чуть сложнее.;
-
 SELECT SearchEngineID, SearchPhrase, count(*) FROM hits_100m_meshed WHERE SearchPhrase != '' GROUP BY SearchEngineID, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
--- агрегация по числу и строке, большое количество ключей.;
-
 SELECT UserID, count(*) FROM hits_100m_meshed GROUP BY UserID ORDER BY count(*) DESC LIMIT 10;
--- агрегация по очень большому количеству ключей, может не хватить оперативки.;
-
 SELECT UserID, SearchPhrase, count(*) FROM hits_100m_meshed GROUP BY UserID, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
--- ещё более сложная агрегация.;
-
 SELECT UserID, SearchPhrase, count(*) FROM hits_100m_meshed GROUP BY UserID, SearchPhrase LIMIT 10;
--- то же самое, но без сортировки.;
-
 SELECT UserID, Minute(EventTime) AS m, SearchPhrase, count(*) FROM hits_100m_meshed GROUP BY UserID, m, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
--- ещё более сложная агрегация, не стоит выполнять на больших таблицах.;
-
 SELECT UserID FROM hits_100m_meshed WHERE UserID = 12345678901234567890;
--- мощная фильтрация по столбцу типа UInt64.;
-
 SELECT count(*) FROM hits_100m_meshed WHERE URL LIKE '%metrika%';
--- фильтрация по поиску подстроки в строке.;
-
 SELECT SearchPhrase, MAX(URL), count(*) FROM hits_100m_meshed WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
--- вынимаем большие столбцы, фильтрация по строке.;
-
 SELECT SearchPhrase, MAX(URL), MAX(Title), count(*) AS c, count(DISTINCT UserID) FROM hits_100m_meshed WHERE Title LIKE '%Яндекс%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
--- чуть больше столбцы.;
-
 SELECT * FROM hits_100m_meshed WHERE URL LIKE '%metrika%' ORDER BY EventTime LIMIT 10;
--- плохой запрос - вынимаем все столбцы.;
-
 SELECT SearchPhrase FROM hits_100m_meshed WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10;
--- большая сортировка.;
-
 SELECT SearchPhrase FROM hits_100m_meshed WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10;
--- большая сортировка по строкам.;
-
 SELECT SearchPhrase FROM hits_100m_meshed WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10;
--- большая сортировка по кортежу.;
-
 SELECT CounterID, avg(length(URL)) AS l, count(*) FROM hits_100m_meshed WHERE URL != '' GROUP BY CounterID HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
--- считаем средние длины URL для крупных счётчиков.;
-
 SELECT SUBSTRING(SUBSTRING(Referer, POSITION('//' IN Referer) + 2), 1, GREATEST(0, POSITION('/' IN SUBSTRING(Referer, POSITION('//' IN Referer) + 2)) - 1)) AS key, avg(length(Referer)) AS l, count(*) AS c, MAX(Referer) FROM hits_10m_meshed WHERE Referer != '' GROUP BY key HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
--- то же самое, но с разбивкой по доменам.;
-
 SELECT sum(ResolutionWidth), sum(ResolutionWidth + 1), sum(ResolutionWidth + 2), sum(ResolutionWidth + 3), sum(ResolutionWidth + 4), sum(ResolutionWidth + 5), sum(ResolutionWidth + 6), sum(ResolutionWidth + 7), sum(ResolutionWidth + 8), sum(ResolutionWidth + 9), sum(ResolutionWidth + 10), sum(ResolutionWidth + 11), sum(ResolutionWidth + 12), sum(ResolutionWidth + 13), sum(ResolutionWidth + 14), sum(ResolutionWidth + 15), sum(ResolutionWidth + 16), sum(ResolutionWidth + 17), sum(ResolutionWidth + 18), sum(ResolutionWidth + 19), sum(ResolutionWidth + 20), sum(ResolutionWidth + 21), sum(ResolutionWidth + 22), sum(ResolutionWidth + 23), sum(ResolutionWidth + 24), sum(ResolutionWidth + 25), sum(ResolutionWidth + 26), sum(ResolutionWidth + 27), sum(ResolutionWidth + 28), sum(ResolutionWidth + 29), sum(ResolutionWidth + 30), sum(ResolutionWidth + 31), sum(ResolutionWidth + 32), sum(ResolutionWidth + 33), sum(ResolutionWidth + 34), sum(ResolutionWidth + 35), sum(ResolutionWidth + 36), sum(ResolutionWidth + 37), sum(ResolutionWidth + 38), sum(ResolutionWidth + 39), sum(ResolutionWidth + 40), sum(ResolutionWidth + 41), sum(ResolutionWidth + 42), sum(ResolutionWidth + 43), sum(ResolutionWidth + 44), sum(ResolutionWidth + 45), sum(ResolutionWidth + 46), sum(ResolutionWidth + 47), sum(ResolutionWidth + 48), sum(ResolutionWidth + 49), sum(ResolutionWidth + 50), sum(ResolutionWidth + 51), sum(ResolutionWidth + 52), sum(ResolutionWidth + 53), sum(ResolutionWidth + 54), sum(ResolutionWidth + 55), sum(ResolutionWidth + 56), sum(ResolutionWidth + 57), sum(ResolutionWidth + 58), sum(ResolutionWidth + 59), sum(ResolutionWidth + 60), sum(ResolutionWidth + 61), sum(ResolutionWidth + 62), sum(ResolutionWidth + 63), sum(ResolutionWidth + 64), sum(ResolutionWidth + 65), sum(ResolutionWidth + 66), sum(ResolutionWidth + 67), sum(ResolutionWidth + 68), sum(ResolutionWidth + 69), sum(ResolutionWidth + 70), sum(ResolutionWidth + 71), sum(ResolutionWidth + 72), sum(ResolutionWidth + 73), 
sum(ResolutionWidth + 74), sum(ResolutionWidth + 75), sum(ResolutionWidth + 76), sum(ResolutionWidth + 77), sum(ResolutionWidth + 78), sum(ResolutionWidth + 79), sum(ResolutionWidth + 80), sum(ResolutionWidth + 81), sum(ResolutionWidth + 82), sum(ResolutionWidth + 83), sum(ResolutionWidth + 84), sum(ResolutionWidth + 85), sum(ResolutionWidth + 86), sum(ResolutionWidth + 87), sum(ResolutionWidth + 88), sum(ResolutionWidth + 89) FROM hits_100m_meshed;
--- много тупых агрегатных функций.;
-
 SELECT SearchEngineID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_100m_meshed WHERE SearchPhrase != '' GROUP BY SearchEngineID, ClientIP ORDER BY count(*) DESC LIMIT 10;
--- сложная агрегация, для больших таблиц может не хватить оперативки.;
-
 SELECT WatchID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_100m_meshed WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY count(*) DESC LIMIT 10;
--- агрегация по двум полям, которая ничего не агрегирует. Для больших таблиц выполнить не получится.;
-
 SELECT WatchID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_100m_meshed GROUP BY WatchID, ClientIP ORDER BY count(*) DESC LIMIT 10;
--- то же самое, но ещё и без фильтрации.;
-
 SELECT URL, count(*) FROM hits_100m_meshed GROUP BY URL ORDER BY count(*) DESC LIMIT 10;
--- агрегация по URL.;
-
 SELECT 1, URL, count(*) FROM hits_100m_meshed GROUP BY 1, URL ORDER BY count(*) DESC LIMIT 10;
--- агрегация по URL и числу.;
-
 SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, count(*) FROM hits_100m_meshed GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY count(*) DESC LIMIT 10;
- 
 SELECT URL,     count(*) AS PageViews FROM hits_100m_meshed WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND URL != '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
-
-
 SELECT Title, count(*) AS PageViews FROM hits_100m_meshed WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND Title != '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
-
-
 SELECT URL, count(*) AS PageViews FROM hits_100m_meshed WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND IsLink AND NOT IsDownload GROUP BY URL ORDER BY PageViews DESC LIMIT 1000;
-
-
 SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN SearchEngineID = 0 AND AdvEngineID = 0 THEN  Referer ELSE '' END AS Src, URL AS Dst, count(*) AS PageViews FROM hits_100m_meshed WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 1000; 
-
-
 SELECT URLHash, EventDate, count(*) AS PageViews FROM hits_100m_meshed WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND TraficSourceID IN (-1, 6) AND RefererHash =  6202628419148573758  GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 100000; 
-
 SELECT WindowClientWidth, WindowClientHeight, count(*) AS PageViews FROM hits_100m_meshed WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND NOT DontCountHits AND URLHash =  6202628419148573758 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10000; 
-
-SELECT TIME_SLICE(EventTime, 1, 'MINUTE') AS Minute, count(*) AS PageViews FROM hits_100m_meshed WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-02') AND NOT Refresh AND NOT DontCountHits GROUP BY Minute ORDER BY Minute; 
\ No newline at end of file
+SELECT TIME_SLICE(EventTime, 1, 'MINUTE') AS Minute, count(*) AS PageViews FROM hits_100m_meshed WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-02') AND NOT Refresh AND NOT DontCountHits GROUP BY Minute ORDER BY Minute;

From 03d86006b981f491ff825a503b07f8b250b4cd6a Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Mon, 30 Mar 2015 14:27:22 +0300
Subject: [PATCH 54/55] dbms: added new benchmark results (incomplete)
 [#METR-15716].

---
 dbms/benchmark/vertica/README                 |  2 +-
 dbms/benchmark/vertica/benchmark.sh           | 24 ++++++
 dbms/benchmark/vertica/queries_for_script.sql | 86 +++++++++----------
 3 files changed, 68 insertions(+), 44 deletions(-)
 create mode 100644 dbms/benchmark/vertica/benchmark.sh

diff --git a/dbms/benchmark/vertica/README b/dbms/benchmark/vertica/README
index a6753f903b2..7573fc25101 100644
--- a/dbms/benchmark/vertica/README
+++ b/dbms/benchmark/vertica/README
@@ -28,7 +28,7 @@ PS. Note that Vertica doesn't support IPv6.
 How to prepare data
 -------------------
 
-Prepare dumps with script create_dump.sh for tables hits_10m, hits_100m, hits_1000m. It takes about 5 hours (1m41.882s, 25m11.103s, ).
+Prepare dumps with the create_dump.sh script for the tables hits_10m, hits_100m and hits_1000m. It takes about 5 hours in total (1m41.882s, 25m11.103s and 276m36.388s respectively).
 Start vsql command line client.
 Create tables with queries from hits_define_schema.sql.
 
diff --git a/dbms/benchmark/vertica/benchmark.sh b/dbms/benchmark/vertica/benchmark.sh
new file mode 100644
index 00000000000..70f96ed3fbc
--- /dev/null
+++ b/dbms/benchmark/vertica/benchmark.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Usage: [QUERIES_FILE=file] benchmark.sh TABLE -- runs each query TRIES times
+# against TABLE and prints one "[t1, t2, t3]," row of timings (seconds) per query.
+QUERIES_FILE="${QUERIES_FILE:-queries.sql}"
+TABLE=${1:?usage: $0 TABLE}
+TRIES=3
+
+sed "s/{table}/${TABLE}/g" "$QUERIES_FILE" | while IFS= read -r query || [[ -n "$query" ]]; do
+	# Flush dirty pages and drop the page cache once per query, before the first try.
+	sync
+	echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null
+	echo -n "["
+	for i in $(seq 1 $TRIES); do
+		# Keep only the number from "All rows formatted: N ms"; a decimal comma becomes a dot.
+		RES=$( (echo '\timing'; echo "$query") |
+			/opt/vertica/bin/vsql -U dbadmin |
+			grep -oP 'All rows formatted: \K[\d,.]+(?= ms)' |
+			tr ',' '.')
+		# $? would only reflect `tr`, so a failed query is detected by an empty result.
+		[[ -n "$RES" ]] && echo -n "$(perl -e 'print $ARGV[0] / 1000' "$RES")" || echo -n "null"
+		[[ "$i" != $TRIES ]] && echo -n ", "
+	done
+	echo "],"
+done
diff --git a/dbms/benchmark/vertica/queries_for_script.sql b/dbms/benchmark/vertica/queries_for_script.sql
index 7eb238f9e4f..5f677b3c775 100644
--- a/dbms/benchmark/vertica/queries_for_script.sql
+++ b/dbms/benchmark/vertica/queries_for_script.sql
@@ -1,43 +1,43 @@
-SELECT count(*) FROM hits_100m_meshed;
-SELECT count(*) FROM hits_100m_meshed WHERE AdvEngineID != 0;
-SELECT sum(AdvEngineID), count(*), avg(ResolutionWidth) FROM hits_100m_meshed;
-SELECT sum_float(UserID) FROM hits_100m_meshed;
-SELECT count(DISTINCT UserID) FROM hits_100m_meshed;
-SELECT count(DISTINCT SearchPhrase) FROM hits_100m_meshed;
-SELECT min(EventDate), max(EventDate) FROM hits_100m_meshed;
-SELECT AdvEngineID, count(*) FROM hits_100m_meshed WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY count(*) DESC;
-SELECT RegionID, count(DISTINCT UserID) AS u FROM hits_100m_meshed GROUP BY RegionID ORDER BY u DESC LIMIT 10;
-SELECT RegionID, sum(AdvEngineID), count(*) AS c, avg(ResolutionWidth), count(DISTINCT UserID) FROM hits_100m_meshed GROUP BY RegionID ORDER BY count(*) DESC LIMIT 10;
-SELECT MobilePhoneModel, count(DISTINCT UserID) AS u FROM hits_100m_meshed WHERE MobilePhoneModel != '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
-SELECT MobilePhone, MobilePhoneModel, count(DISTINCT UserID) AS u FROM hits_100m_meshed WHERE MobilePhoneModel != '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
-SELECT SearchPhrase, count(*) FROM hits_100m_meshed WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
-SELECT SearchPhrase, count(DISTINCT UserID) AS u FROM hits_100m_meshed WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
-SELECT SearchEngineID, SearchPhrase, count(*) FROM hits_100m_meshed WHERE SearchPhrase != '' GROUP BY SearchEngineID, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
-SELECT UserID, count(*) FROM hits_100m_meshed GROUP BY UserID ORDER BY count(*) DESC LIMIT 10;
-SELECT UserID, SearchPhrase, count(*) FROM hits_100m_meshed GROUP BY UserID, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
-SELECT UserID, SearchPhrase, count(*) FROM hits_100m_meshed GROUP BY UserID, SearchPhrase LIMIT 10;
-SELECT UserID, Minute(EventTime) AS m, SearchPhrase, count(*) FROM hits_100m_meshed GROUP BY UserID, m, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
-SELECT UserID FROM hits_100m_meshed WHERE UserID = 12345678901234567890;
-SELECT count(*) FROM hits_100m_meshed WHERE URL LIKE '%metrika%';
-SELECT SearchPhrase, MAX(URL), count(*) FROM hits_100m_meshed WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
-SELECT SearchPhrase, MAX(URL), MAX(Title), count(*) AS c, count(DISTINCT UserID) FROM hits_100m_meshed WHERE Title LIKE '%Яндекс%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
-SELECT * FROM hits_100m_meshed WHERE URL LIKE '%metrika%' ORDER BY EventTime LIMIT 10;
-SELECT SearchPhrase FROM hits_100m_meshed WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10;
-SELECT SearchPhrase FROM hits_100m_meshed WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10;
-SELECT SearchPhrase FROM hits_100m_meshed WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10;
-SELECT CounterID, avg(length(URL)) AS l, count(*) FROM hits_100m_meshed WHERE URL != '' GROUP BY CounterID HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
-SELECT SUBSTRING(SUBSTRING(Referer, POSITION('//' IN Referer) + 2), 1, GREATEST(0, POSITION('/' IN SUBSTRING(Referer, POSITION('//' IN Referer) + 2)) - 1)) AS key, avg(length(Referer)) AS l, count(*) AS c, MAX(Referer) FROM hits_10m_meshed WHERE Referer != '' GROUP BY key HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
-SELECT sum(ResolutionWidth), sum(ResolutionWidth + 1), sum(ResolutionWidth + 2), sum(ResolutionWidth + 3), sum(ResolutionWidth + 4), sum(ResolutionWidth + 5), sum(ResolutionWidth + 6), sum(ResolutionWidth + 7), sum(ResolutionWidth + 8), sum(ResolutionWidth + 9), sum(ResolutionWidth + 10), sum(ResolutionWidth + 11), sum(ResolutionWidth + 12), sum(ResolutionWidth + 13), sum(ResolutionWidth + 14), sum(ResolutionWidth + 15), sum(ResolutionWidth + 16), sum(ResolutionWidth + 17), sum(ResolutionWidth + 18), sum(ResolutionWidth + 19), sum(ResolutionWidth + 20), sum(ResolutionWidth + 21), sum(ResolutionWidth + 22), sum(ResolutionWidth + 23), sum(ResolutionWidth + 24), sum(ResolutionWidth + 25), sum(ResolutionWidth + 26), sum(ResolutionWidth + 27), sum(ResolutionWidth + 28), sum(ResolutionWidth + 29), sum(ResolutionWidth + 30), sum(ResolutionWidth + 31), sum(ResolutionWidth + 32), sum(ResolutionWidth + 33), sum(ResolutionWidth + 34), sum(ResolutionWidth + 35), sum(ResolutionWidth + 36), sum(ResolutionWidth + 37), sum(ResolutionWidth + 38), sum(ResolutionWidth + 39), sum(ResolutionWidth + 40), sum(ResolutionWidth + 41), sum(ResolutionWidth + 42), sum(ResolutionWidth + 43), sum(ResolutionWidth + 44), sum(ResolutionWidth + 45), sum(ResolutionWidth + 46), sum(ResolutionWidth + 47), sum(ResolutionWidth + 48), sum(ResolutionWidth + 49), sum(ResolutionWidth + 50), sum(ResolutionWidth + 51), sum(ResolutionWidth + 52), sum(ResolutionWidth + 53), sum(ResolutionWidth + 54), sum(ResolutionWidth + 55), sum(ResolutionWidth + 56), sum(ResolutionWidth + 57), sum(ResolutionWidth + 58), sum(ResolutionWidth + 59), sum(ResolutionWidth + 60), sum(ResolutionWidth + 61), sum(ResolutionWidth + 62), sum(ResolutionWidth + 63), sum(ResolutionWidth + 64), sum(ResolutionWidth + 65), sum(ResolutionWidth + 66), sum(ResolutionWidth + 67), sum(ResolutionWidth + 68), sum(ResolutionWidth + 69), sum(ResolutionWidth + 70), sum(ResolutionWidth + 71), sum(ResolutionWidth + 72), sum(ResolutionWidth + 73), 
sum(ResolutionWidth + 74), sum(ResolutionWidth + 75), sum(ResolutionWidth + 76), sum(ResolutionWidth + 77), sum(ResolutionWidth + 78), sum(ResolutionWidth + 79), sum(ResolutionWidth + 80), sum(ResolutionWidth + 81), sum(ResolutionWidth + 82), sum(ResolutionWidth + 83), sum(ResolutionWidth + 84), sum(ResolutionWidth + 85), sum(ResolutionWidth + 86), sum(ResolutionWidth + 87), sum(ResolutionWidth + 88), sum(ResolutionWidth + 89) FROM hits_100m_meshed;
-SELECT SearchEngineID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_100m_meshed WHERE SearchPhrase != '' GROUP BY SearchEngineID, ClientIP ORDER BY count(*) DESC LIMIT 10;
-SELECT WatchID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_100m_meshed WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY count(*) DESC LIMIT 10;
-SELECT WatchID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_100m_meshed GROUP BY WatchID, ClientIP ORDER BY count(*) DESC LIMIT 10;
-SELECT URL, count(*) FROM hits_100m_meshed GROUP BY URL ORDER BY count(*) DESC LIMIT 10;
-SELECT 1, URL, count(*) FROM hits_100m_meshed GROUP BY 1, URL ORDER BY count(*) DESC LIMIT 10;
-SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, count(*) FROM hits_100m_meshed GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY count(*) DESC LIMIT 10;
-SELECT URL,     count(*) AS PageViews FROM hits_100m_meshed WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND URL != '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
-SELECT Title, count(*) AS PageViews FROM hits_100m_meshed WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND Title != '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
-SELECT URL, count(*) AS PageViews FROM hits_100m_meshed WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND IsLink AND NOT IsDownload GROUP BY URL ORDER BY PageViews DESC LIMIT 1000;
-SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN SearchEngineID = 0 AND AdvEngineID = 0 THEN  Referer ELSE '' END AS Src, URL AS Dst, count(*) AS PageViews FROM hits_100m_meshed WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 1000; 
-SELECT URLHash, EventDate, count(*) AS PageViews FROM hits_100m_meshed WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND TraficSourceID IN (-1, 6) AND RefererHash =  6202628419148573758  GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 100000; 
-SELECT WindowClientWidth, WindowClientHeight, count(*) AS PageViews FROM hits_100m_meshed WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND NOT DontCountHits AND URLHash =  6202628419148573758 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10000; 
-SELECT TIME_SLICE(EventTime, 1, 'MINUTE') AS Minute, count(*) AS PageViews FROM hits_100m_meshed WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-02') AND NOT Refresh AND NOT DontCountHits GROUP BY Minute ORDER BY Minute;
+SELECT count(*) FROM {table};
+SELECT count(*) FROM {table} WHERE AdvEngineID != 0;
+SELECT sum(AdvEngineID), count(*), avg(ResolutionWidth) FROM {table};
+SELECT sum_float(UserID) FROM {table};
+SELECT count(DISTINCT UserID) FROM {table};
+SELECT count(DISTINCT SearchPhrase) FROM {table};
+SELECT min(EventDate), max(EventDate) FROM {table};
+SELECT AdvEngineID, count(*) FROM {table} WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY count(*) DESC;
+SELECT RegionID, count(DISTINCT UserID) AS u FROM {table} GROUP BY RegionID ORDER BY u DESC LIMIT 10;
+SELECT RegionID, sum(AdvEngineID), count(*) AS c, avg(ResolutionWidth), count(DISTINCT UserID) FROM {table} GROUP BY RegionID ORDER BY count(*) DESC LIMIT 10;
+SELECT MobilePhoneModel, count(DISTINCT UserID) AS u FROM {table} WHERE MobilePhoneModel != '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
+SELECT MobilePhone, MobilePhoneModel, count(DISTINCT UserID) AS u FROM {table} WHERE MobilePhoneModel != '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
+SELECT SearchPhrase, count(*) FROM {table} WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
+SELECT SearchPhrase, count(DISTINCT UserID) AS u FROM {table} WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
+SELECT SearchEngineID, SearchPhrase, count(*) FROM {table} WHERE SearchPhrase != '' GROUP BY SearchEngineID, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
+SELECT UserID, count(*) FROM {table} GROUP BY UserID ORDER BY count(*) DESC LIMIT 10;
+SELECT UserID, SearchPhrase, count(*) FROM {table} GROUP BY UserID, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
+SELECT UserID, SearchPhrase, count(*) FROM {table} GROUP BY UserID, SearchPhrase LIMIT 10;
+SELECT UserID, Minute(EventTime) AS m, SearchPhrase, count(*) FROM {table} GROUP BY UserID, m, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
+SELECT UserID FROM {table} WHERE UserID = 12345678901234567890;
+SELECT count(*) FROM {table} WHERE URL LIKE '%metrika%';
+SELECT SearchPhrase, MAX(URL), count(*) FROM {table} WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
+SELECT SearchPhrase, MAX(URL), MAX(Title), count(*) AS c, count(DISTINCT UserID) FROM {table} WHERE Title LIKE '%Яндекс%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
+SELECT * FROM {table} WHERE URL LIKE '%metrika%' ORDER BY EventTime LIMIT 10;
+SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10;
+SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10;
+SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10;
+SELECT CounterID, avg(length(URL)) AS l, count(*) FROM {table} WHERE URL != '' GROUP BY CounterID HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
+SELECT SUBSTRING(SUBSTRING(Referer, POSITION('//' IN Referer) + 2), 1, GREATEST(0, POSITION('/' IN SUBSTRING(Referer, POSITION('//' IN Referer) + 2)) - 1)) AS key, avg(length(Referer)) AS l, count(*) AS c, MAX(Referer) FROM {table} WHERE Referer != '' GROUP BY key HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
+SELECT sum(ResolutionWidth), sum(ResolutionWidth + 1), sum(ResolutionWidth + 2), sum(ResolutionWidth + 3), sum(ResolutionWidth + 4), sum(ResolutionWidth + 5), sum(ResolutionWidth + 6), sum(ResolutionWidth + 7), sum(ResolutionWidth + 8), sum(ResolutionWidth + 9), sum(ResolutionWidth + 10), sum(ResolutionWidth + 11), sum(ResolutionWidth + 12), sum(ResolutionWidth + 13), sum(ResolutionWidth + 14), sum(ResolutionWidth + 15), sum(ResolutionWidth + 16), sum(ResolutionWidth + 17), sum(ResolutionWidth + 18), sum(ResolutionWidth + 19), sum(ResolutionWidth + 20), sum(ResolutionWidth + 21), sum(ResolutionWidth + 22), sum(ResolutionWidth + 23), sum(ResolutionWidth + 24), sum(ResolutionWidth + 25), sum(ResolutionWidth + 26), sum(ResolutionWidth + 27), sum(ResolutionWidth + 28), sum(ResolutionWidth + 29), sum(ResolutionWidth + 30), sum(ResolutionWidth + 31), sum(ResolutionWidth + 32), sum(ResolutionWidth + 33), sum(ResolutionWidth + 34), sum(ResolutionWidth + 35), sum(ResolutionWidth + 36), sum(ResolutionWidth + 37), sum(ResolutionWidth + 38), sum(ResolutionWidth + 39), sum(ResolutionWidth + 40), sum(ResolutionWidth + 41), sum(ResolutionWidth + 42), sum(ResolutionWidth + 43), sum(ResolutionWidth + 44), sum(ResolutionWidth + 45), sum(ResolutionWidth + 46), sum(ResolutionWidth + 47), sum(ResolutionWidth + 48), sum(ResolutionWidth + 49), sum(ResolutionWidth + 50), sum(ResolutionWidth + 51), sum(ResolutionWidth + 52), sum(ResolutionWidth + 53), sum(ResolutionWidth + 54), sum(ResolutionWidth + 55), sum(ResolutionWidth + 56), sum(ResolutionWidth + 57), sum(ResolutionWidth + 58), sum(ResolutionWidth + 59), sum(ResolutionWidth + 60), sum(ResolutionWidth + 61), sum(ResolutionWidth + 62), sum(ResolutionWidth + 63), sum(ResolutionWidth + 64), sum(ResolutionWidth + 65), sum(ResolutionWidth + 66), sum(ResolutionWidth + 67), sum(ResolutionWidth + 68), sum(ResolutionWidth + 69), sum(ResolutionWidth + 70), sum(ResolutionWidth + 71), sum(ResolutionWidth + 72), sum(ResolutionWidth + 73), 
sum(ResolutionWidth + 74), sum(ResolutionWidth + 75), sum(ResolutionWidth + 76), sum(ResolutionWidth + 77), sum(ResolutionWidth + 78), sum(ResolutionWidth + 79), sum(ResolutionWidth + 80), sum(ResolutionWidth + 81), sum(ResolutionWidth + 82), sum(ResolutionWidth + 83), sum(ResolutionWidth + 84), sum(ResolutionWidth + 85), sum(ResolutionWidth + 86), sum(ResolutionWidth + 87), sum(ResolutionWidth + 88), sum(ResolutionWidth + 89) FROM {table};
+SELECT SearchEngineID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} WHERE SearchPhrase != '' GROUP BY SearchEngineID, ClientIP ORDER BY count(*) DESC LIMIT 10;
+SELECT WatchID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY count(*) DESC LIMIT 10;
+SELECT WatchID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} GROUP BY WatchID, ClientIP ORDER BY count(*) DESC LIMIT 10;
+SELECT URL, count(*) FROM {table} GROUP BY URL ORDER BY count(*) DESC LIMIT 10;
+SELECT 1, URL, count(*) FROM {table} GROUP BY 1, URL ORDER BY count(*) DESC LIMIT 10;
+SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, count(*) FROM {table} GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY count(*) DESC LIMIT 10;
+SELECT URL,     count(*) AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND URL != '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
+SELECT Title, count(*) AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND Title != '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
+SELECT URL, count(*) AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND IsLink AND NOT IsDownload GROUP BY URL ORDER BY PageViews DESC LIMIT 1000;
+SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN SearchEngineID = 0 AND AdvEngineID = 0 THEN  Referer ELSE '' END AS Src, URL AS Dst, count(*) AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 1000; 
+SELECT URLHash, EventDate, count(*) AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND TraficSourceID IN (-1, 6) AND RefererHash =  6202628419148573758  GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 100000; 
+SELECT WindowClientWidth, WindowClientHeight, count(*) AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND NOT DontCountHits AND URLHash =  6202628419148573758 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10000; 
+SELECT TIME_SLICE(EventTime, 1, 'MINUTE') AS Minute, count(*) AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-02') AND NOT Refresh AND NOT DontCountHits GROUP BY Minute ORDER BY Minute;

From 31ea01d2abeb003f6da9d15d868bcd72e7a6be3e Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Mon, 30 Mar 2015 14:30:12 +0300
Subject: [PATCH 55/55] dbms: removed old files [#METR-15716].

---
 dbms/benchmark/clickhouse/benchmark.sh | 366 -------------------------
 dbms/benchmark/clickhouse/conf.sh      |   4 -
 dbms/benchmark/clickhouse/expect.tcl   |  13 -
 dbms/benchmark/killif.sh               |  26 --
 dbms/benchmark/process_log.py          | 114 --------
 5 files changed, 523 deletions(-)
 delete mode 100755 dbms/benchmark/clickhouse/benchmark.sh
 delete mode 100644 dbms/benchmark/clickhouse/conf.sh
 delete mode 100644 dbms/benchmark/clickhouse/expect.tcl
 delete mode 100755 dbms/benchmark/killif.sh
 delete mode 100755 dbms/benchmark/process_log.py

diff --git a/dbms/benchmark/clickhouse/benchmark.sh b/dbms/benchmark/clickhouse/benchmark.sh
deleted file mode 100755
index 390f5a22d0d..00000000000
--- a/dbms/benchmark/clickhouse/benchmark.sh
+++ /dev/null
@@ -1,366 +0,0 @@
-#!/bin/bash
-
-test_table="hits_100m"
-
-start_date="'2013-07-01'"
-early_stop_date="'2013-07-02'"
-stop_date="'2013-07-31'"
-counter_id=34
-
-function run_ck_server
-{
-    sudo sh -c " ulimit -v 54000000; /etc/init.d/clickhouse-server restart"
-}
-
-# execute queries
-function execute()
-{
-    queries=("${@}")
-    queries_count=${#queries[@]}
-    
-    if [ -z $TIMES ]; then
-	TIMES=1
-    fi
-    
-    index=0
-    comment_re='\#.*'
-    while [ "$index" -lt "$queries_count" ]; do
-	query=${queries[$index]}
-
-	if [[ $query =~ $comment_re ]]; then
-	    echo "$query"
-	    echo
-	else
-	    sync
-	    sudo sh -c "echo 3 > /proc/sys/vm/drop_caches"
-
-	    for i in $(seq $TIMES)
-	    do
-	
-		expect -f ./expect.tcl "$query"
-		if [ "$?" != "0" ]; then
-		    echo "Error: $?"
-		    #break
-		fi
-
-		# restart clickhouse if failed
-		ps aux | grep -P '\d+ clickhouse-server'
-		if [ "$?" != "0" ]; then
-		    run_ck_server
-		fi
-	    done
-	fi
-
-	let "index = $index + 1"
-	echo "Ran $index queries." >&2
-    done
-}
-
-init_queries=(
-# DB structure with array arguments
-#"CREATE TABLE $test_table ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, Refresh UInt8, RefererCategoryID UInt16, RefererRegionID UInt32, URLCategoryID UInt16, URLRegionID UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, OriginalURL String, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), LocalEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming UInt32, DNSTiming UInt32, ConnectTiming UInt32, ResponseStartTiming UInt32, ResponseEndTiming UInt32, FetchTiming UInt32, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, 
OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32 ) ENGINE = MergeTree(EventDate, intHash32(UserID), tuple(CounterID, EventDate, intHash32(UserID), EventTime), 8192);"
-
-#DB structure without array arguments
-#"CREATE TABLE $test_table ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, Refresh UInt8, RefererCategoryID UInt16, RefererRegionID UInt32, URLCategoryID UInt16, URLRegionID UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, OriginalURL String, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), LocalEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, RemoteIP UInt32, WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming UInt32, DNSTiming UInt32, ConnectTiming UInt32, ResponseStartTiming UInt32, ResponseEndTiming UInt32, FetchTiming UInt32, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, 
UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32 ) ENGINE = MergeTree(EventDate, intHash32(UserID), tuple(CounterID, EventDate, intHash32(UserID), EventTime), 8192);"
-
-#modified table without uint
-"CREATE TABLE $test_table ( WatchID Int64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, RegionID UInt32, UserID Int64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, Refresh UInt8, RefererCategoryID UInt16, RefererRegionID UInt32, URLCategoryID UInt16, URLRegionID UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID Int64, OriginalURL String, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), LocalEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, RemoteIP UInt32, WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming UInt32, DNSTiming UInt32, ConnectTiming UInt32, ResponseStartTiming UInt32, ResponseEndTiming UInt32, FetchTiming UInt32, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium 
String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash Int64, URLHash Int64, CLID UInt32, UserIDHash UInt64 ) ENGINE = MergeTree(EventDate, intHash32(UserID), tuple(CounterID, EventDate, intHash32(UserID), EventTime), 8192);"
-
-)
-
-test_queries=(
-"SELECT count() FROM $test_table;"
-"SELECT count() FROM $test_table WHERE AdvEngineID != 0;"
-"SELECT sum(AdvEngineID), count(), avg(ResolutionWidth) FROM $test_table;"
-"SELECT sum(UserID) FROM $test_table;"
-"SELECT uniq(UserID) FROM $test_table;"
-"SELECT uniq(SearchPhrase) FROM $test_table;"
-"SELECT min(EventDate), max(EventDate) FROM $test_table;"
-
-"SELECT AdvEngineID, count() FROM $test_table WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY count() DESC;"
-"#- мощная фильтрация. После фильтрации почти ничего не остаётся, но делаем ещё агрегацию.;"
-
-"SELECT RegionID, uniq(UserID) AS u FROM $test_table GROUP BY RegionID ORDER BY u DESC LIMIT 10;"
-"#- агрегация, среднее количество ключей.;"
-
-"SELECT RegionID, sum(AdvEngineID), count() AS c, avg(ResolutionWidth), uniq(UserID) FROM $test_table GROUP BY RegionID ORDER BY c DESC LIMIT 10;"
-"#- агрегация, среднее количество ключей, несколько агрегатных функций.;"
-
-"SELECT MobilePhoneModel, uniq(UserID) AS u FROM $test_table WHERE MobilePhoneModel != '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;"
-"#- мощная фильтрация по строкам, затем агрегация по строкам.;"
-
-"SELECT MobilePhone, MobilePhoneModel, uniq(UserID) AS u FROM $test_table WHERE MobilePhoneModel != '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;"
-"#- мощная фильтрация по строкам, затем агрегация по паре из числа и строки.;"
-
-"SELECT SearchPhrase, count() AS c FROM $test_table WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;"
-"#- средняя фильтрация по строкам, затем агрегация по строкам, большое количество ключей.;"
-
-"SELECT SearchPhrase, uniq(UserID) AS u FROM $test_table WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;"
-"#- агрегация чуть сложнее.;"
-
-"SELECT SearchEngineID, SearchPhrase, count() AS c FROM $test_table WHERE SearchPhrase != '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;"
-"#- агрегация по числу и строке, большое количество ключей.;"
-
-"SELECT UserID, count() FROM $test_table GROUP BY UserID ORDER BY count() DESC LIMIT 10;"
-"#- агрегация по очень большому количеству ключей, может не хватить оперативки.;"
-
-"SELECT UserID, SearchPhrase, count() FROM $test_table GROUP BY UserID, SearchPhrase ORDER BY count() DESC LIMIT 10;"
-"#- ещё более сложная агрегация.;"
-
-"SELECT UserID, SearchPhrase, count() FROM $test_table GROUP BY UserID, SearchPhrase LIMIT 10;"
-"#- то же самое, но без сортировки.;"
-
-"SELECT UserID, toMinute(EventTime) AS m, SearchPhrase, count() FROM $test_table GROUP BY UserID, m, SearchPhrase ORDER BY count() DESC LIMIT 10;"
-"#- ещё более сложная агрегация, не стоит выполнять на больших таблицах.;"
-
-"SELECT UserID FROM $test_table WHERE UserID = 12345678901234567890;"
-"#- мощная фильтрация по столбцу типа UInt64.;"
-
-"SELECT count() FROM $test_table WHERE URL LIKE '%metrika%';"
-"#- фильтрация по поиску подстроки в строке.;"
-
-"SELECT SearchPhrase, any(URL), count() AS c FROM $test_table WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;"
-"#- вынимаем большие столбцы, фильтрация по строке.;"
-
-"SELECT SearchPhrase, any(URL), any(Title), count() AS c, uniq(UserID) FROM $test_table WHERE Title LIKE '%Яндекс%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;"
-"#- чуть больше столбцы.;"
-
-"SELECT * FROM $test_table WHERE URL LIKE '%metrika%' ORDER BY EventTime LIMIT 10;"
-"#- плохой запрос - вынимаем все столбцы.;"
-
-"SELECT SearchPhrase FROM $test_table WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10;"
-"#- большая сортировка.;"
-
-"SELECT SearchPhrase FROM $test_table WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10;"
-"#- большая сортировка по строкам.;"
-
-"SELECT SearchPhrase FROM $test_table WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10;"
-"#- большая сортировка по кортежу.;"
-
-"SELECT CounterID, avg(length(URL)) AS l, count() AS c FROM $test_table WHERE URL != '' GROUP BY CounterID HAVING c > 100000 ORDER BY l DESC LIMIT 25;"
-"#- считаем средние длины URL для крупных счётчиков.;"
-
-"SELECT domainWithoutWWW(Referer) AS key, avg(length(Referer)) AS l, count() AS c, any(Referer) FROM $test_table WHERE Referer != '' GROUP BY key HAVING c > 100000 ORDER BY l DESC LIMIT 25;"
-"#- то же самое, но с разбивкой по доменам.;"
-
-"SELECT sum(ResolutionWidth), sum(ResolutionWidth + 1), sum(ResolutionWidth + 2), sum(ResolutionWidth + 3), sum(ResolutionWidth + 4), sum(ResolutionWidth + 5), sum(ResolutionWidth + 6), sum(ResolutionWidth + 7), sum(ResolutionWidth + 8), sum(ResolutionWidth + 9), sum(ResolutionWidth + 10), sum(ResolutionWidth + 11), sum(ResolutionWidth + 12), sum(ResolutionWidth + 13), sum(ResolutionWidth + 14), sum(ResolutionWidth + 15), sum(ResolutionWidth + 16), sum(ResolutionWidth + 17), sum(ResolutionWidth + 18), sum(ResolutionWidth + 19), sum(ResolutionWidth + 20), sum(ResolutionWidth + 21), sum(ResolutionWidth + 22), sum(ResolutionWidth + 23), sum(ResolutionWidth + 24), sum(ResolutionWidth + 25), sum(ResolutionWidth + 26), sum(ResolutionWidth + 27), sum(ResolutionWidth + 28), sum(ResolutionWidth + 29), sum(ResolutionWidth + 30), sum(ResolutionWidth + 31), sum(ResolutionWidth + 32), sum(ResolutionWidth + 33), sum(ResolutionWidth + 34), sum(ResolutionWidth + 35), sum(ResolutionWidth + 36), sum(ResolutionWidth + 37), sum(ResolutionWidth + 38), sum(ResolutionWidth + 39), sum(ResolutionWidth + 40), sum(ResolutionWidth + 41), sum(ResolutionWidth + 42), sum(ResolutionWidth + 43), sum(ResolutionWidth + 44), sum(ResolutionWidth + 45), sum(ResolutionWidth + 46), sum(ResolutionWidth + 47), sum(ResolutionWidth + 48), sum(ResolutionWidth + 49), sum(ResolutionWidth + 50), sum(ResolutionWidth + 51), sum(ResolutionWidth + 52), sum(ResolutionWidth + 53), sum(ResolutionWidth + 54), sum(ResolutionWidth + 55), sum(ResolutionWidth + 56), sum(ResolutionWidth + 57), sum(ResolutionWidth + 58), sum(ResolutionWidth + 59), sum(ResolutionWidth + 60), sum(ResolutionWidth + 61), sum(ResolutionWidth + 62), sum(ResolutionWidth + 63), sum(ResolutionWidth + 64), sum(ResolutionWidth + 65), sum(ResolutionWidth + 66), sum(ResolutionWidth + 67), sum(ResolutionWidth + 68), sum(ResolutionWidth + 69), sum(ResolutionWidth + 70), sum(ResolutionWidth + 71), sum(ResolutionWidth + 72), sum(ResolutionWidth + 73), 
sum(ResolutionWidth + 74), sum(ResolutionWidth + 75), sum(ResolutionWidth + 76), sum(ResolutionWidth + 77), sum(ResolutionWidth + 78), sum(ResolutionWidth + 79), sum(ResolutionWidth + 80), sum(ResolutionWidth + 81), sum(ResolutionWidth + 82), sum(ResolutionWidth + 83), sum(ResolutionWidth + 84), sum(ResolutionWidth + 85), sum(ResolutionWidth + 86), sum(ResolutionWidth + 87), sum(ResolutionWidth + 88), sum(ResolutionWidth + 89) FROM $test_table;"
-"#- много тупых агрегатных функций.;"
-
-"SELECT SearchEngineID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM $test_table WHERE SearchPhrase != '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;"
-"#- сложная агрегация, для больших таблиц может не хватить оперативки.;"
-
-"SELECT WatchID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM $test_table WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;"
-"#- агрегация по двум полям, которая ничего не агрегирует. Для больших таблиц выполнить не получится.;"
-
-"SELECT WatchID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM $test_table GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;"
-"#- то же самое, но ещё и без фильтрации.;"
-
-"SELECT URL, count() AS c FROM $test_table GROUP BY URL ORDER BY c DESC LIMIT 10;"
-"#- агрегация по URL.;"
-
-"SELECT 1, URL, count() AS c FROM $test_table GROUP BY 1, URL ORDER BY c DESC LIMIT 10;"
-"#- агрегация по URL и числу.;"
-
-"SELECT ClientIP AS x, x - 1, x - 2, x - 3, count() AS c FROM $test_table GROUP BY x, x - 1, x - 2, x - 3 ORDER BY c DESC LIMIT 10;"
- 
-"SELECT
-    URL,
-    count() AS PageViews
-FROM $test_table
-WHERE
-    CounterID = $counter_id
-    AND EventDate >= toDate($start_date)
-    AND EventDate <= toDate($stop_date)
-    AND NOT DontCountHits
-    AND NOT Refresh
-    AND notEmpty(URL)
-GROUP BY URL
-ORDER BY PageViews DESC
-LIMIT 10;"
-
-
-"SELECT
-    Title,
-    count() AS PageViews
-FROM $test_table
-WHERE
-    CounterID = $counter_id
-    AND EventDate >= toDate($start_date)
-    AND EventDate <= toDate($stop_date)
-    AND NOT DontCountHits
-    AND NOT Refresh
-    AND notEmpty(Title)
-GROUP BY Title
-ORDER BY PageViews DESC
-LIMIT 10;"
-
-"SELECT
-    URL,
-    count() AS PageViews
-FROM $test_table
-WHERE
-    CounterID = $counter_id
-    AND EventDate >= toDate($start_date)
-    AND EventDate <= toDate($stop_date)
-    AND NOT Refresh
-    AND IsLink
-    AND NOT IsDownload
-GROUP BY URL
-ORDER BY PageViews DESC
-LIMIT 1000;"
-
-"SELECT
-    TraficSourceID,
-    SearchEngineID,
-    AdvEngineID,
-    ((SearchEngineID = 0 AND AdvEngineID = 0) ? Referer : '') AS Src,
-    URL AS Dst,
-    count() AS PageViews
-FROM $test_table
-WHERE
-    CounterID = $counter_id
-    AND EventDate >= toDate($start_date)
-    AND EventDate <= toDate($stop_date)
-    AND NOT Refresh
-GROUP BY
-    TraficSourceID,
-    SearchEngineID,
-    AdvEngineID,
-    Src,
-    Dst
-ORDER BY PageViews DESC
-LIMIT 1000;"
-
-"SELECT
-    URLHash,
-    EventDate,
-    count() AS PageViews
-FROM $test_table
-WHERE
-    CounterID = $counter_id
-    AND EventDate >= toDate($start_date)
-    AND EventDate <= toDate($stop_date)
-    AND NOT Refresh
-    AND TraficSourceID IN (-1, 6)
-    AND RefererHash = halfMD5('http://example.ru/')
-GROUP BY
-    URLHash,
-    EventDate
-ORDER BY PageViews DESC
-LIMIT 100000;"
-
-
-"SELECT
-    WindowClientWidth,
-    WindowClientHeight,
-    count() AS PageViews
-FROM $test_table
-WHERE
-    CounterID = $counter_id
-    AND EventDate >= toDate($start_date)
-    AND EventDate <= toDate($stop_date)
-    AND NOT Refresh
-    AND NOT DontCountHits
-    AND URLHash = halfMD5('http://example.ru/')
-GROUP BY
-    WindowClientWidth,
-    WindowClientHeight
-ORDER BY PageViews DESC
-LIMIT 10000;"
-
-"SELECT
-    toStartOfMinute(EventTime) AS Minute,
-    count() AS PageViews
-FROM $test_table
-WHERE
-    CounterID = $counter_id
-    AND EventDate >= toDate($start_date)
-    AND EventDate <= toDate($early_stop_date)
-    AND NOT Refresh
-    AND NOT DontCountHits
-GROUP BY
-    Minute
-ORDER BY Minute;"
-
-)
-
-function test {
-    TIMES=3
-    execute "${test_queries[@]}"
-}
-
-function init {
-    execute "${init_queries[@]}"
-}
-
-function debug {
-    TIMES=3
-    debug_queries=(
-)
-    execute "${debug_queries[@]}"
-}
-
-function usage {
-    cat <<EOF   
-usage: $0 options
-
-This script run benhmark for clickhouse
-
-OPTIONS:
-   -h            Show this message
-   -d            Run debug queries
-   -i            Init database
-   -p log_file   Parse log file to columns with result
-   -t            Run tests
-EOF
-}
-
-function parse_log {
-   results=$(cat $1 |  grep -P 'Elapsed: \d+.\d+ ' | awk '{print $6}')
-  
-   index=1
-   for res in $results
-   do
-      echo -n "$res "
-      let "index=$index % 3"
-      if [ "$index" == "0" ]; then
-	  echo 
-      fi
-      let "index=$index + 1"
-   done
-}
-
-if [ "$#" == "0" ]; then
-    usage
-    exit 0
-fi
-
-echo "Start date" $(date)
-
-while getopts “hitdp:” OPTION
-do
-     case $OPTION in
-         h)
-             usage
-             exit 0
-             ;;
-         i)
-             init
-             ;;
-         t)
-             test
-             ;;
-	 d)
-	     debug
-	     ;;
-	 p)
-	     parse_log $OPTARG
-	     ;;
-         ?)
-             usage
-             exit 0
-             ;;
-     esac
-done
-
-echo "Stop date" $(date)
diff --git a/dbms/benchmark/clickhouse/conf.sh b/dbms/benchmark/clickhouse/conf.sh
deleted file mode 100644
index edb5aaa56da..00000000000
--- a/dbms/benchmark/clickhouse/conf.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-CONF_DIR=/home/kartavyy/benchmark/clickhouse
-expect_file=$CONF_DIR/expect.tcl
-test_file=$CONF_DIR/queries.sql
-etc_init_d_service=/etc/init.d/clickhouse-server-metrika-yandex
diff --git a/dbms/benchmark/clickhouse/expect.tcl b/dbms/benchmark/clickhouse/expect.tcl
deleted file mode 100644
index 764a0f442e8..00000000000
--- a/dbms/benchmark/clickhouse/expect.tcl
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/expect
-
-# Set timeout
-set timeout 600
-
-# Get arguments
-set query [lindex $argv 0]
-
-spawn clickhouse-client --multiline;
-expect ":) "
-send "$query;\r";
-expect ":) "
-send "quit";
diff --git a/dbms/benchmark/killif.sh b/dbms/benchmark/killif.sh
deleted file mode 100755
index a57aa5eb022..00000000000
--- a/dbms/benchmark/killif.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/bin/sh
-if [[ $# -ne 0 ]]; then
-    echo "usage: if memory limit is exceeded kill process with biggest memory consumption"
-    exit 1
-fi
-
-while [ 1=1 ];
-do
-    FREE_MEMORY_MB=$(free -m | sed -n '3,3p' | awk '{print $4}')
-
-    PID="$(ps -eF --sort -rss | sed -n '2,2p' | awk '{print $2}')"
-    NAME="$(ps -eF --sort -rss | sed -n '2,2p' | awk '{print $11}')"
-    SIZEGB="$(ps -eF --sort -rss | sed -n '2,2p' | awk '{print $6}')"
-    SIZEGB=$(($SIZEGB/1024/1024))
-
-    echo "Process id ="$PID" Size = "$SIZEGB" GB" "Free Memory = " $FREE_MEMORY_MB" MB"
-    if (( $FREE_MEMORY_MB < 512 ));
-    then echo "Killing the process with biggest memory consumption......"
-	sudo kill -9 $PID 
-	echo "$(date) Killed the process with PID: $PID NAME: $NAME"
-    else
-	echo "SIZE has not yet exceeding"
-    fi
-    
-    sleep 10
-done
\ No newline at end of file
diff --git a/dbms/benchmark/process_log.py b/dbms/benchmark/process_log.py
deleted file mode 100755
index 41c857e451f..00000000000
--- a/dbms/benchmark/process_log.py
+++ /dev/null
@@ -1,114 +0,0 @@
-from optparse import OptionParser
-import argparse
-
-import re
-import sys
-    
-def log_to_rows(filename, pattern_select, time_pattern, pattern_ignore):
-    time_matcher = re.compile(time_pattern)
-    select_matcher = re.compile(pattern_select, re.IGNORECASE);
-    ignore_matcher = re.compile(pattern_ignore)
-
-    f = open(filename, 'r');
-
-    query = ''
-    raw_time = ''
-    for line in f:
-        if ignore_matcher.match(line):
-            continue
-
-        m = select_matcher.search(line)
-        if m :
-            if line != query:
-                query = line
-                sys.stdout.write("\n")
-                raw_time = raw_time + "\n"
-
-        m = time_matcher.search(line)
-        if m:
-            sec = 0
-            minute = 0
-            ms = 0
-            if 'min' in m.groupdict() and m.group('min'):
-                minute = float(m.group('min').replace(',','.'))
-            if 'sec' in m.groupdict() and m.group('sec'):
-                sec = float(m.group('sec').replace(',','.'))
-            if 'ms' in m.groupdict() and m.group('ms'):
-                ms = float(m.group('ms').replace(',', '.'))
-
-            sys.stdout.write( str(minute*60 + sec + ms/1000.)  + " " )    
-            raw_time = raw_time + " | " + m.group('time')
-
-    print
-    print " =======raw time====== \n" + raw_time
-
-
-def process_log(filename, pattern_select, time_pattern, pattern_ignore, error_pattern):
-    time_matcher = re.compile(time_pattern)
-    select_matcher = re.compile(pattern_select, re.IGNORECASE);
-    ignore_matcher = re.compile(pattern_ignore)
-    error_matcher = re.compile(error_pattern, re.IGNORECASE)
-
-    f = open(filename, 'r');
-
-    query = ''
-    for line in f:
-        if error_matcher.match(line):
-            print line
-            continue
-
-        if ignore_matcher.match(line):
-            continue
-
-        m = select_matcher.search(line)
-        if m :
-            if line != query:
-                sys.stdout.flush()
-                query = line
-                print "\n\n"
-                print query 
-
-        m = time_matcher.search(line)
-        if m:
-            sys.stdout.write(m.group('time') + " " )
-
-def main():
-    parser = argparse.ArgumentParser(description="Process log files form different databases")
-    parser.add_argument('log_file', metavar = 'log_file', help = 'database log file')
-    parser.add_argument('db_name', metavar = 'db_name', help = ' database name one of clickhouse, vertica, infinidb, monetdb, infobright, hive (... more later)')
-    args = parser.parse_args()
-
-    log_file = args.log_file
-    db_name = args.db_name
-
-    time_pattern = ''
-    select_pattern = r'query: select '
-    ignore_pattern = r'#'
-    error_pattern = r'error .*'
-    if db_name == 'clickhouse':
-        time_pattern = r'(?P<time>(?P<sec>\d+.\d{3}) sec\.)'
-        select_pattern = r'query\: select '
-        ignore_pattern = r':\).*'
-    elif db_name == 'vertica' :
-        time_pattern = r'(?P<time>(?P<ms>\d+.\d+) ms\.)'
-        select_pattern = r'select '
-        ignore_pattern = r'(.*dbadmin=>|query:|.*Timing is on\.).*'            
-    elif db_name == 'infinidb' :
-        time_pattern = r'(?P<time>(?:(?P<min>\d+) min )?(?P<sec>\d+.\d+) sec)'
-        ignore_pattern = r'Query OK, 0 rows affected \(0\.00 sec\)'
-    elif db_name == 'monetdb' :
-        time_pattern = r'tuples? \((?P<time>(?:(?P<min>\d+)m )?(?:(?P<sec>\d+.?\d+)s)?(?:(?P<ms>\d+.\d+)ms)?)\)'
-    elif db_name == 'infobright' :
-        time_pattern = r'(?P<time>(?:(?P<min>\d+) min ){0,1}(?P<sec>\d+.\d+) sec)'
-    elif db_name == 'hive':
-        time_pattern = r'Time taken\: (?P<time>(?:(?P<sec>\d+.?\d+) seconds))'
-        error_pattern = r'failed\: .*'
-    elif db_name == 'mysql':
-        time_pattern = r'(?P<time>(?:(?P<min>\d+) min )?(?P<sec>\d+.\d+) sec)'
-    else:
-        sys.exit("unknown db_name")
-    
-    process_log(log_file, select_pattern, time_pattern, ignore_pattern, error_pattern )
-    log_to_rows(log_file, select_pattern, time_pattern, ignore_pattern )
-
-main()