diff --git a/dbms/programs/odbc-bridge/MainHandler.cpp b/dbms/programs/odbc-bridge/MainHandler.cpp
index 2aebdda3b03..d95f1386c7b 100644
--- a/dbms/programs/odbc-bridge/MainHandler.cpp
+++ b/dbms/programs/odbc-bridge/MainHandler.cpp
@@ -37,7 +37,8 @@ ODBCHandler::PoolPtr ODBCHandler::getPool(const std::string & connection_str)
     std::lock_guard lock(mutex);
     if (!pool_map->count(connection_str))
     {
-        pool_map->emplace(connection_str, createAndCheckResizePocoSessionPool([connection_str] {
+        pool_map->emplace(connection_str, createAndCheckResizePocoSessionPool([connection_str]
+        {
             return std::make_shared<Poco::Data::SessionPool>("ODBC", validateODBCConnectionString(connection_str));
         }));
     }
diff --git a/dbms/src/Core/Defines.h b/dbms/src/Core/Defines.h
index 96c3fa2c57f..0a3b384797d 100644
--- a/dbms/src/Core/Defines.h
+++ b/dbms/src/Core/Defines.h
@@ -117,3 +117,9 @@
     #define NO_SANITIZE_UNDEFINED
     #define NO_SANITIZE_ADDRESS
 #endif
+
+#if defined __GNUC__ && !defined __clang__
+    #define OPTIMIZE(x) __attribute__((__optimize__(x)))
+#else
+    #define OPTIMIZE(x)
+#endif
diff --git a/dbms/src/Formats/CSVRowInputStream.cpp b/dbms/src/Formats/CSVRowInputStream.cpp
index ac6db183318..1c9f32095a0 100644
--- a/dbms/src/Formats/CSVRowInputStream.cpp
+++ b/dbms/src/Formats/CSVRowInputStream.cpp
@@ -1,3 +1,5 @@
+#include <Core/Defines.h>
+
 #include <IO/ReadHelpers.h>
 #include <IO/Operators.h>
 
@@ -189,7 +191,13 @@ String CSVRowInputStream::getDiagnosticInfo()
 }
 
 
-bool CSVRowInputStream::parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
+/** gcc-7 generates wrong code with optimization level greater than 1.
+  * See tests: dbms/src/IO/tests/write_int.cpp
+  *  and dbms/tests/queries/0_stateless/00898_parsing_bad_diagnostic_message.sh
+  * This is compiler bug. The bug does not present in gcc-8 and clang-8.
+  * Nevertheless, we don't need high optimization of this function.
+  */
+bool OPTIMIZE(1) CSVRowInputStream::parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
     WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
 {
     const char delimiter = format_settings.csv.delimiter;
diff --git a/dbms/src/Formats/TabSeparatedRowInputStream.cpp b/dbms/src/Formats/TabSeparatedRowInputStream.cpp
index adef5720550..f63461df3c4 100644
--- a/dbms/src/Formats/TabSeparatedRowInputStream.cpp
+++ b/dbms/src/Formats/TabSeparatedRowInputStream.cpp
@@ -1,3 +1,5 @@
+#include <Core/Defines.h>
+
 #include <IO/ReadHelpers.h>
 #include <IO/WriteBufferFromString.h>
 #include <IO/Operators.h>
@@ -162,8 +164,14 @@ String TabSeparatedRowInputStream::getDiagnosticInfo()
 }
 
 
-bool TabSeparatedRowInputStream::parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
-    WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
+/** gcc-7 generates wrong code with optimization level greater than 1.
+  * See tests: dbms/src/IO/tests/write_int.cpp
+  *  and dbms/tests/queries/0_stateless/00898_parsing_bad_diagnostic_message.sh
+  * This is compiler bug. The bug does not present in gcc-8 and clang-8.
+  * Nevertheless, we don't need high optimization of this function.
+  */
+bool OPTIMIZE(1) TabSeparatedRowInputStream::parseRowAndPrintDiagnosticInfo(
+    MutableColumns & columns, WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
 {
     size_t size = data_types.size();
     for (size_t i = 0; i < size; ++i)
diff --git a/dbms/src/IO/tests/CMakeLists.txt b/dbms/src/IO/tests/CMakeLists.txt
index 73497def442..29bb98c34ad 100644
--- a/dbms/src/IO/tests/CMakeLists.txt
+++ b/dbms/src/IO/tests/CMakeLists.txt
@@ -51,7 +51,7 @@ add_executable (hashing_read_buffer hashing_read_buffer.cpp)
 target_link_libraries (hashing_read_buffer PRIVATE clickhouse_common_io)
 add_check (hashing_read_buffer)
 
-add_executable (io_operators operators.cpp)
+add_executable (io_operators io_operators.cpp)
 target_link_libraries (io_operators PRIVATE clickhouse_common_io)
 
 if (OS_LINUX)
diff --git a/dbms/src/IO/tests/operators.cpp b/dbms/src/IO/tests/io_operators.cpp
similarity index 87%
rename from dbms/src/IO/tests/operators.cpp
rename to dbms/src/IO/tests/io_operators.cpp
index 0944d17532e..fd84f382c24 100644
--- a/dbms/src/IO/tests/operators.cpp
+++ b/dbms/src/IO/tests/io_operators.cpp
@@ -31,5 +31,11 @@ int main(int, char **)
         std::cerr << hello << '\n';
     }
 
+    {
+        DB::WriteBufferFromFileDescriptor buf(STDOUT_FILENO);
+        size_t x = 11;
+        buf << "Column " << x << ", \n";
+    }
+
     return 0;
 }
diff --git a/dbms/src/IO/tests/write_int.cpp b/dbms/src/IO/tests/write_int.cpp
new file mode 100644
index 00000000000..18396542485
--- /dev/null
+++ b/dbms/src/IO/tests/write_int.cpp
@@ -0,0 +1,28 @@
+#include <IO/WriteHelpers.h>
+#include <IO/WriteBufferFromFileDescriptor.h>
+
+
+/** gcc-7 generates wrong code with -O1 -finline-small-functions -ftree-vrp
+  * This is compiler bug. The issue does not exist in gcc-8 or clang-8.
+  */
+
+
+using namespace DB;
+
+
+void NO_INLINE write(WriteBuffer & out, size_t size)
+{
+    for (size_t i = 0; i < size; ++i)
+    {
+        writeIntText(i, out);
+        writeChar(' ', out);
+    }
+}
+
+
+int main(int, char **)
+{
+    WriteBufferFromFileDescriptor out(STDOUT_FILENO);
+    write(out, 80);
+    return 0;
+}
diff --git a/dbms/tests/queries/0_stateless/00898_parsing_bad_diagnostic_message.reference b/dbms/tests/queries/0_stateless/00898_parsing_bad_diagnostic_message.reference
new file mode 100644
index 00000000000..3d5c2ac1055
--- /dev/null
+++ b/dbms/tests/queries/0_stateless/00898_parsing_bad_diagnostic_message.reference
@@ -0,0 +1 @@
+Column 11,  name: c11, type: UInt8, ERROR: text "a" is not like UInt8
diff --git a/dbms/tests/queries/0_stateless/00898_parsing_bad_diagnostic_message.sh b/dbms/tests/queries/0_stateless/00898_parsing_bad_diagnostic_message.sh
new file mode 100755
index 00000000000..6aa875cd2e2
--- /dev/null
+++ b/dbms/tests/queries/0_stateless/00898_parsing_bad_diagnostic_message.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+# set -x
+
+CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+. $CUR_DIR/../shell_config.sh
+
+echo -ne '0\t1\t2\t3\t4\t5\t6\t7\t8\t9\t10\ta' | $CLICKHOUSE_LOCAL --structure 'c0 UInt8, c1 UInt8, c2 UInt8, c3 UInt8, c4 UInt8, c5 UInt8, c6 UInt8, c7 UInt8, c8 UInt8, c9 UInt8, c10 UInt8, c11 UInt8' --input-format TSV --query 'SELECT * FROM table' 2>&1 | grep -F 'Column 11'
diff --git a/libs/libcommon/include/common/itoa.h b/libs/libcommon/include/common/itoa.h
index 18764fc5e98..1811ce0080c 100644
--- a/libs/libcommon/include/common/itoa.h
+++ b/libs/libcommon/include/common/itoa.h
@@ -31,29 +31,11 @@
 #include "likely.h"
 #include "unaligned.h"
 
+using int128_t = __int128;
 using uint128_t = unsigned __int128;
 
 namespace impl
 {
-// Using a lookup table to convert binary numbers from 0 to 99
-// into ascii characters as described by Andrei Alexandrescu in
-// https://www.facebook.com/notes/facebook-engineering/three-optimization-tips-for-c/10151361643253920/
-
-static const char digits[201] = "00010203040506070809"
-                                "10111213141516171819"
-                                "20212223242526272829"
-                                "30313233343536373839"
-                                "40414243444546474849"
-                                "50515253545556575859"
-                                "60616263646566676869"
-                                "70717273747576777879"
-                                "80818283848586878889"
-                                "90919293949596979899";
-
-static inline uint16_t const & dd(uint8_t u)
-{
-    return reinterpret_cast<uint16_t const *>(digits)[u];
-}
 
 template <typename T>
 static constexpr T pow10(size_t x)
@@ -77,64 +59,111 @@ static constexpr T pow10(size_t x)
 // and compensating for rounding errors. The algorithm described below
 // was invented by Terje Mathisen, Norway, and not published elsewhere."
 
-template <typename UInt, bool A, UInt M, unsigned S>
-struct MulInv
+/// Division by constant is performed by:
+/// 1. Adding 1 if needed;
+/// 2. Multiplying by another constant;
+/// 3. Shifting right by another constant.
+template <typename UInt, bool add_, UInt multiplier_, unsigned shift_>
+struct Division
 {
-    using type = UInt;
-    static constexpr bool a{A};
-    static constexpr UInt m{M};
-    static constexpr unsigned s{S};
+    static constexpr bool add{add_};
+    static constexpr UInt multiplier{multiplier_};
+    static constexpr unsigned shift{shift_};
 };
-template <int, int, class...>
-struct UT;
-template <int N, class T, class... Ts>
-struct UT<N, N, T, Ts...>
-{
-    using U = T;
-};
-template <int N, int M, class T, class... Ts>
-struct UT<N, M, T, Ts...>
-{
-    using U = typename UT<N, 2 * M, Ts...>::U;
-};
-template <int N>
-using MI = typename UT<
-    N,
-    1,
-    MulInv<uint8_t, 0, 205U, 11>,
-    MulInv<uint16_t, 1, 41943U, 22>,
-    MulInv<uint32_t, 0, 3518437209U, 45>,
-    MulInv<uint64_t, 0, 12379400392853802749U, 90>,
-    MulInv<uint128_t, 0, 0, 0>>::U;
-template <int N>
-using U = typename MI<N>::type;
 
-// struct QR holds the result of dividing an unsigned N-byte variable
-// by 10^N resulting in
-template <size_t N>
-struct QR
+/// Select a type with appropriate number of bytes from the list of types.
+/// First parameter is the number of bytes requested. Then goes a list of types with 1, 2, 4, ... number of bytes.
+/// Example: SelectType<4, uint8_t, uint16_t, uint32_t, uint64_t> will select uint32_t.
+template <size_t N, typename T, typename... Ts>
+struct SelectType
 {
-    U<N> q; // quotient with fewer than 2*N decimal digits
-    U<N / 2> r; // remainder with at most N decimal digits
+    using Result = typename SelectType<N / 2, Ts...>::Result;
 };
-template <size_t N>
-QR<N> static inline split(U<N> u)
+
+template <typename T, typename... Ts>
+struct SelectType<1, T, Ts...>
 {
-    constexpr MI<N> mi{};
-    U<N> q = (mi.m * (U<2 * N>(u) + mi.a)) >> mi.s;
-    return {q, U<N / 2>(u - q * pow10<U<N / 2>>(N))};
+    using Result = T;
+};
+
+
+/// Division by 10^N where N is the size of the type.
+template <size_t N>
+using DivisionBy10PowN = typename SelectType
+<
+    N,
+    Division<uint8_t, 0, 205U, 11>,                           /// divide by 10
+    Division<uint16_t, 1, 41943U, 22>,                        /// divide by 100
+    Division<uint32_t, 0, 3518437209U, 45>,                   /// divide by 10000
+    Division<uint64_t, 0, 12379400392853802749ULL, 90>        /// divide by 100000000
+>::Result;
+
+template <size_t N>
+using UnsignedOfSize = typename SelectType
+<
+    N,
+    uint8_t,
+    uint16_t,
+    uint32_t,
+    uint64_t,
+    uint128_t
+>::Result;
+
+/// Holds the result of dividing an unsigned N-byte variable by 10^N resulting in
+template <size_t N>
+struct QuotientAndRemainder
+{
+    UnsignedOfSize<N> quotient; // quotient with fewer than 2*N decimal digits
+    UnsignedOfSize<N / 2> remainder; // remainder with at most N decimal digits
+};
+
+template <size_t N>
+QuotientAndRemainder<N> static inline split(UnsignedOfSize<N> value)
+{
+    constexpr DivisionBy10PowN<N> division;
+
+    UnsignedOfSize<N> quotient = (division.multiplier * (UnsignedOfSize<2 * N>(value) + division.add)) >> division.shift;
+    UnsignedOfSize<N / 2> remainder = value - quotient * pow10<UnsignedOfSize<N / 2>>(N);
+
+    return {quotient, remainder};
 }
 
-template <typename T>
-static inline char * out(char * p, T && obj)
+
+static inline char * outDigit(char * p, uint8_t value)
 {
-    memcpy(p, reinterpret_cast<const void *>(&obj), sizeof(T));
-    p += sizeof(T);
+    *p = '0' + value;
+    ++p;
     return p;
 }
 
-struct convert
+// Using a lookup table to convert binary numbers from 0 to 99
+// into ascii characters as described by Andrei Alexandrescu in
+// https://www.facebook.com/notes/facebook-engineering/three-optimization-tips-for-c/10151361643253920/
+
+static const char digits[201] = "00010203040506070809"
+                                "10111213141516171819"
+                                "20212223242526272829"
+                                "30313233343536373839"
+                                "40414243444546474849"
+                                "50515253545556575859"
+                                "60616263646566676869"
+                                "70717273747576777879"
+                                "80818283848586878889"
+                                "90919293949596979899";
+
+static inline char * outTwoDigits(char * p, uint8_t value)
 {
+    memcpy(p, &digits[value * 2], 2);
+    p += 2;
+    return p;
+}
+
+
+namespace convert
+{
+    template <typename UInt, size_t N = sizeof(UInt)> static char * head(char * p, UInt u);
+    template <typename UInt, size_t N = sizeof(UInt)> static char * tail(char * p, UInt u);
+
     //===----------------------------------------------------------===//
     //     head: find most significant digit, skip leading zeros
     //===----------------------------------------------------------===//
@@ -142,35 +171,51 @@ struct convert
     // "x" contains quotient and remainder after division by 10^N
     // quotient is less than 10^N
     template <size_t N>
-    static inline char * head(char * p, QR<N> x)
+    static inline char * head(char * p, QuotientAndRemainder<N> x)
     {
-        return tail(head(p, U<N / 2>(x.q)), x.r);
+        p = head(p, UnsignedOfSize<N / 2>(x.quotient));
+        p = tail(p, x.remainder);
+        return p;
     }
 
     // "u" is less than 10^2*N
-    template <typename UInt, size_t N = sizeof(UInt)>
+    template <typename UInt, size_t N>
     static inline char * head(char * p, UInt u)
     {
-        return (u < pow10<U<N>>(N) ? (head(p, U<N / 2>(u))) : (head<N>(p, split<N>(u))));
+        return u < pow10<UnsignedOfSize<N>>(N)
+            ? head(p, UnsignedOfSize<N / 2>(u))
+            : head<N>(p, split<N>(u));
     }
 
     // recursion base case, selected when "u" is one byte
-    static inline char * head(char * p, U<1> u) { return (u < 10 ? (out<char>(p, '0' + u)) : (out(p, dd(u)))); }
+    template <>
+    inline char * head<UnsignedOfSize<1>, 1>(char * p, UnsignedOfSize<1> u)
+    {
+        return u < 10
+            ? outDigit(p, u)
+            : outTwoDigits(p, u);
+    }
 
     //===----------------------------------------------------------===//
     //     tail: produce all digits including leading zeros
     //===----------------------------------------------------------===//
 
     // recursive step, "u" is less than 10^2*N
-    template <typename UInt, size_t N = sizeof(UInt)>
+    template <typename UInt, size_t N>
     static inline char * tail(char * p, UInt u)
     {
-        QR<N> x = split<N>(u);
-        return tail(tail(p, U<N / 2>(x.q)), x.r);
+        QuotientAndRemainder<N> x = split<N>(u);
+        p = tail(p, UnsignedOfSize<N / 2>(x.quotient));
+        p = tail(p, x.remainder);
+        return p;
     }
 
     // recursion base case, selected when "u" is one byte
-    static inline char * tail(char * p, U<1> u) { return out(p, dd(u)); }
+    template <>
+    inline char * tail<UnsignedOfSize<1>, 1>(char * p, UnsignedOfSize<1> u)
+    {
+        return outTwoDigits(p, u);
+    }
 
     //===----------------------------------------------------------===//
     // large values are >= 10^2*N
@@ -178,10 +223,13 @@ struct convert
     //===----------------------------------------------------------===//
 
     template <size_t N>
-    static inline char * large(char * p, QR<N> x)
+    static inline char * large(char * p, QuotientAndRemainder<N> x)
     {
-        QR<N> y = split<N>(x.q);
-        return tail(tail(head(p, U<N / 2>(y.q)), y.r), x.r);
+        QuotientAndRemainder<N> y = split<N>(x.quotient);
+        p = head(p, UnsignedOfSize<N / 2>(y.quotient));
+        p = tail(p, y.remainder);
+        p = tail(p, x.remainder);
+        return p;
     }
 
     //===----------------------------------------------------------===//
@@ -192,20 +240,29 @@ struct convert
     template <typename UInt, size_t N = sizeof(UInt)>
     static inline char * itoa(char * p, UInt u)
     {
-        if (u < pow10<U<N>>(N))
-            return head(p, U<N / 2>(u));
-        QR<N> x = split<N>(u);
-        return (u < pow10<U<N>>(2 * N) ? (head<N>(p, x)) : (large<N>(p, x)));
+        if (u < pow10<UnsignedOfSize<N>>(N))
+            return head(p, UnsignedOfSize<N / 2>(u));
+        QuotientAndRemainder<N> x = split<N>(u);
+
+        return u < pow10<UnsignedOfSize<N>>(2 * N)
+            ? head<N>(p, x)
+            : large<N>(p, x);
     }
 
     // selected when "u" is one byte
-    static inline char * itoa(char * p, U<1> u)
+    template <>
+    inline char * itoa<UnsignedOfSize<1>, 1>(char * p, UnsignedOfSize<1> u)
     {
         if (u < 10)
-            return out<char>(p, '0' + u);
-        if (u < 100)
-            return out(p, dd(u));
-        return out(out<char>(p, '0' + u / 100), dd(u % 100));
+            return outDigit(p, u);
+        else if (u < 100)
+            return outTwoDigits(p, u);
+        else
+        {
+            p = outDigit(p, u / 100);
+            p = outTwoDigits(p, u % 100);
+            return p;
+        }
     }
 
     //===----------------------------------------------------------===//
@@ -213,14 +270,14 @@ struct convert
     //===----------------------------------------------------------===//
 
     // itoa: handle unsigned integral operands (selected by SFINAE)
-    template <typename U, std::enable_if_t<not std::is_signed<U>::value && std::is_integral<U>::value> * = nullptr>
+    template <typename U, std::enable_if_t<!std::is_signed_v<U> && std::is_integral_v<U>> * = nullptr>
     static inline char * itoa(U u, char * p)
     {
         return convert::itoa(p, u);
     }
 
     // itoa: handle signed integral operands (selected by SFINAE)
-    template <typename I, size_t N = sizeof(I), std::enable_if_t<std::is_signed<I>::value && std::is_integral<I>::value> * = nullptr>
+    template <typename I, size_t N = sizeof(I), std::enable_if_t<std::is_signed_v<I> && std::is_integral_v<I>> * = nullptr>
     static inline char * itoa(I i, char * p)
     {
         // Need "mask" to be filled with a copy of the sign bit.
@@ -230,19 +287,19 @@ struct convert
         // Use a conditional expression to be portable,
         // a good optimizing compiler generates an arithmetic right shift
         // and avoids the conditional branch.
-        U<N> mask = i < 0 ? ~U<N>(0) : 0;
-        // Now get the absolute value of "i" and cast to unsigned type U<N>.
+        UnsignedOfSize<N> mask = i < 0 ? ~UnsignedOfSize<N>(0) : 0;
+        // Now get the absolute value of "i" and cast to unsigned type UnsignedOfSize<N>.
         // Cannot use std::abs() because the result is undefined
         // in 2's complement systems for the most-negative value.
         // Want to avoid conditional branch for performance reasons since
         // CPU branch prediction will be ineffective when negative values
         // occur randomly.
-        // Let "u" be "i" cast to unsigned type U<N>.
+        // Let "u" be "i" cast to unsigned type UnsignedOfSize<N>.
         // Subtract "u" from 2*u if "i" is positive or 0 if "i" is negative.
         // This yields the absolute value with the desired type without
         // using a conditional branch and without invoking undefined or
         // implementation defined behavior:
-        U<N> u = ((2 * U<N>(i)) & ~mask) - U<N>(i);
+        UnsignedOfSize<N> u = ((2 * UnsignedOfSize<N>(i)) & ~mask) - UnsignedOfSize<N>(i);
         // Unconditionally store a minus sign when producing digits
         // in a forward direction and increment the pointer only if
         // the value is in fact negative.
@@ -298,12 +355,12 @@ static inline char * writeUIntText(uint128_t x, char * p)
         const auto i = x % 100;
         x /= 100;
         pp -= 2;
-        unalignedStore(pp, dd(i));
+        outTwoDigits(pp, i);
     }
     if (x < 10)
         *p = '0' + x;
     else
-        unalignedStore(p, dd(x));
+        outTwoDigits(p, x);
     return p + len;
 }
 
@@ -313,9 +370,9 @@ static inline char * writeLeadingMinus(char * pos)
     return pos + 1;
 }
 
-static inline char * writeSIntText(__int128 x, char * pos)
+static inline char * writeSIntText(int128_t x, char * pos)
 {
-    static const __int128 min_int128 = __int128(0x8000000000000000ll) << 64;
+    static const int128_t min_int128 = int128_t(0x8000000000000000ll) << 64;
 
     if (unlikely(x == min_int128))
     {
@@ -328,7 +385,7 @@ static inline char * writeSIntText(__int128 x, char * pos)
         x = -x;
         pos = writeLeadingMinus(pos);
     }
-    return writeUIntText(static_cast<unsigned __int128>(x), pos);
+    return writeUIntText(static_cast<uint128_t>(x), pos);
 }
 
 }
@@ -339,12 +396,14 @@ char * itoa(I i, char * p)
     return impl::convert::itoa(i, p);
 }
 
-static inline char * itoa(uint128_t i, char * p)
+template <>
+inline char * itoa<uint128_t>(uint128_t i, char * p)
 {
     return impl::writeUIntText(i, p);
 }
 
-static inline char * itoa(__int128 i, char * p)
+template <>
+inline char * itoa<int128_t>(int128_t i, char * p)
 {
     return impl::writeSIntText(i, p);
 }