2013-01-05 10:07:01 +00:00
|
|
|
|
#include <emmintrin.h>
|
|
|
|
|
|
2012-05-08 05:42:05 +00:00
|
|
|
|
#include <sstream>
|
|
|
|
|
|
2012-09-24 05:40:45 +00:00
|
|
|
|
#include <mysqlxx/Manip.h>
|
|
|
|
|
|
|
|
|
|
#include <DB/Core/Defines.h>
|
2010-06-04 18:25:25 +00:00
|
|
|
|
#include <DB/IO/ReadHelpers.h>
|
|
|
|
|
|
2012-05-08 05:42:05 +00:00
|
|
|
|
|
2010-06-04 18:25:25 +00:00
|
|
|
|
namespace DB
|
|
|
|
|
{
|
|
|
|
|
|
2016-01-11 21:46:36 +00:00
|
|
|
|
/// Error codes used by the parsing helpers below. They are only declared here (extern, no initializer).
namespace ErrorCodes
{
    extern const int CANNOT_PARSE_INPUT_ASSERTION_FAILED;
    extern const int CANNOT_PARSE_ESCAPE_SEQUENCE;
    extern const int CANNOT_PARSE_QUOTED_STRING;
}
|
|
|
|
|
|
2012-09-24 05:40:45 +00:00
|
|
|
|
|
|
|
|
|
static void __attribute__((__noinline__)) throwAtAssertionFailed(const char * s, ReadBuffer & buf)
|
|
|
|
|
{
|
|
|
|
|
std::stringstream message;
|
|
|
|
|
message << "Cannot parse input: expected " << mysqlxx::escape << s;
|
|
|
|
|
|
|
|
|
|
if (buf.eof())
|
|
|
|
|
message << " at end of stream.";
|
|
|
|
|
else
|
|
|
|
|
message << " before: " << mysqlxx::escape << String(buf.position(), std::min(SHOW_CHARS_ON_SYNTAX_ERROR, buf.buffer().end() - buf.position()));
|
|
|
|
|
|
|
|
|
|
throw Exception(message.str(), ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
2015-10-05 14:20:56 +00:00
|
|
|
|
bool checkString(const char * s, ReadBuffer & buf)
|
2010-06-04 18:25:25 +00:00
|
|
|
|
{
|
|
|
|
|
for (; *s; ++s)
|
|
|
|
|
{
|
|
|
|
|
if (buf.eof() || *buf.position() != *s)
|
2015-10-05 14:20:56 +00:00
|
|
|
|
return false;
|
2010-06-04 18:25:25 +00:00
|
|
|
|
++buf.position();
|
|
|
|
|
}
|
2015-10-05 14:20:56 +00:00
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void assertString(const char * s, ReadBuffer & buf)
|
|
|
|
|
{
|
|
|
|
|
if (!checkString(s, buf))
|
|
|
|
|
throwAtAssertionFailed(s, buf);
|
2010-06-04 18:25:25 +00:00
|
|
|
|
}
|
|
|
|
|
|
2015-06-03 15:32:06 +00:00
|
|
|
|
void assertChar(char symbol, ReadBuffer & buf)
|
|
|
|
|
{
|
|
|
|
|
if (buf.eof() || *buf.position() != symbol)
|
|
|
|
|
{
|
|
|
|
|
char err[2] = {symbol, '\0'};
|
|
|
|
|
throwAtAssertionFailed(err, buf);
|
|
|
|
|
}
|
|
|
|
|
++buf.position();
|
|
|
|
|
}
|
|
|
|
|
|
2014-03-27 11:29:40 +00:00
|
|
|
|
/// Checks that buf has no more data; otherwise throws via throwAtAssertionFailed (expected "eof").
void assertEOF(ReadBuffer & buf)
{
    if (buf.eof())
        return;

    throwAtAssertionFailed("eof", buf);
}
|
|
|
|
|
|
2010-06-04 18:25:25 +00:00
|
|
|
|
/** Reads characters into s (replacing its previous contents) until a tab, a line feed or EOF.
  * The terminating tab / line feed is not consumed and no unescaping is performed.
  */
void readString(String & s, ReadBuffer & buf)
{
    s.clear();

    while (!buf.eof())
    {
        const char * const chunk_begin = buf.position();
        const char * const chunk_end = buf.buffer().end();

        /// Scan the data currently available in the buffer for a delimiter.
        const char * stop = chunk_begin;
        while (stop != chunk_end && *stop != '\t' && *stop != '\n')
            ++stop;

        const size_t bytes = stop - chunk_begin;
        s.append(chunk_begin, bytes);
        buf.position() += bytes;

        /// Data is left in the buffer only if we stopped at a delimiter.
        if (buf.hasPendingData())
            return;
    }
}
|
|
|
|
|
|
2015-09-08 14:24:25 +00:00
|
|
|
|
/// Reads everything remaining in buf into s (replacing its previous contents), one buffer-full at a time.
void readStringUntilEOF(String & s, ReadBuffer & buf)
{
    s.clear();

    for (;;)
    {
        if (buf.eof())
            break;

        /// Take all the data currently available in the buffer.
        const size_t available = buf.buffer().end() - buf.position();
        s.append(buf.position(), available);
        buf.position() += available;

        if (buf.hasPendingData())
            return;
    }
}
|
2013-01-05 10:07:01 +00:00
|
|
|
|
|
|
|
|
|
/** Finds the next \t, \n or \\ character in a chunk of memory.
  * The function is similar to strpbrk, with the following differences:
  * - works with arbitrary chunks of memory, including ones containing zero bytes;
  * - does not require a terminating zero byte - the end of the data is passed explicitly;
  * - if nothing is found, returns a pointer to the end instead of NULL.
  *
  * Uses SSE2, which gives a speedup of about 1.7 times (compared to a trivial loop)
  *  when parsing a typical tab-separated file with strings.
  * SSE4.2 could have been used, but at the time of writing it was not supported on all of our servers
  *  (now it is supported everywhere).
  * When parsing a file with short strings, there is no performance degradation.
  *
  * begin, end - the half-open range [begin, end) to search; begin must not be greater than end.
  * Returns a pointer to the first matching byte, or end if there is none.
  */
static inline const char * find_first_tab_lf_or_backslash(const char * begin, const char * end)
{
    const auto is_delimiter = [](char c) { return c == '\t' || c == '\n' || c == '\\'; };

    /// Scalar prologue: advance byte by byte until begin is 16-byte aligned (or the range ends).
    for (; (reinterpret_cast<ptrdiff_t>(begin) & 0x0F) && begin < end; ++begin)
        if (is_delimiter(*begin))
            return begin;

    /// Broadcast the needles with _mm_set1_epi8 instead of loading them from char arrays:
    ///  a char array carries no 16-byte alignment guarantee, so reading it through __m128i * is not safe.
    const __m128i tab = _mm_set1_epi8('\t');
    const __m128i lf = _mm_set1_epi8('\n');
    const __m128i bs = _mm_set1_epi8('\\');

    /// Main loop: 16 bytes at a time. The distance is compared (rather than begin + 15 < end)
    ///  so that no pointer beyond the end of the range is ever formed.
    for (; end - begin >= 16; begin += 16)
    {
        /// begin is 16-byte aligned here thanks to the prologue.
        const __m128i bytes = _mm_load_si128(reinterpret_cast<const __m128i *>(begin));

        __m128i matches = _mm_cmpeq_epi8(bytes, tab);
        matches = _mm_or_si128(matches, _mm_cmpeq_epi8(bytes, lf));
        matches = _mm_or_si128(matches, _mm_cmpeq_epi8(bytes, bs));

        /// One bit per byte; the lowest set bit corresponds to the first matching byte.
        const int bit_mask = _mm_movemask_epi8(matches);

        if (bit_mask)
            return begin + __builtin_ctz(bit_mask);
    }

    /// Scalar epilogue: fewer than 16 bytes remain.
    for (; begin < end; ++begin)
        if (is_delimiter(*begin))
            return begin;

    return end;
}
|
|
|
|
|
|
|
|
|
|
|
2015-11-25 03:11:17 +00:00
|
|
|
|
/** Распарсить escape-последовательность, которая может быть простой (один символ после бэкслеша) или более сложной (несколько символов).
|
|
|
|
|
* Предполагается, что курсор расположен на символе \
|
|
|
|
|
*/
|
|
|
|
|
static void parseComplexEscapeSequence(String & s, ReadBuffer & buf)
|
|
|
|
|
{
|
|
|
|
|
++buf.position();
|
|
|
|
|
if (buf.eof())
|
|
|
|
|
throw Exception("Cannot parse escape sequence", ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE);
|
|
|
|
|
|
|
|
|
|
if (*buf.position() == 'x')
|
|
|
|
|
{
|
|
|
|
|
++buf.position();
|
|
|
|
|
/// escape-последовательность вида \xAA
|
|
|
|
|
UInt8 c1;
|
|
|
|
|
UInt8 c2;
|
|
|
|
|
readPODBinary(c1, buf);
|
|
|
|
|
readPODBinary(c2, buf);
|
|
|
|
|
s += static_cast<char>(unhex(c1) * 16 + unhex(c2));
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
/// Обычная escape-последовательность из одного символа.
|
|
|
|
|
s += parseEscapeSequence(*buf.position());
|
|
|
|
|
++buf.position();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
2013-01-05 10:07:01 +00:00
|
|
|
|
/** Reads an escaped string into s (replacing its previous contents) until an unescaped tab, line feed or EOF.
  * Backslash escape sequences are decoded via parseComplexEscapeSequence.
  * The terminating tab / line feed is not consumed.
  */
void readEscapedString(DB::String & s, DB::ReadBuffer & buf)
{
    s.clear();

    while (!buf.eof())
    {
        /// Copy the run of ordinary characters in one go.
        const char * const stop = find_first_tab_lf_or_backslash(buf.position(), buf.buffer().end());
        const auto plain_length = stop - buf.position();

        s.append(buf.position(), plain_length);
        buf.position() += plain_length;

        /// The buffer is exhausted without meeting a special character: go on with the next portion.
        if (!buf.hasPendingData())
            continue;

        const char special = *buf.position();

        if (special == '\\')
            parseComplexEscapeSequence(s, buf);
        else if (special == '\t' || special == '\n')
            return;
    }
}
|
|
|
|
|
|
2011-06-15 18:54:18 +00:00
|
|
|
|
|
2013-01-05 10:07:01 +00:00
|
|
|
|
/** Finds the next occurrence of the quote character or of a backslash in [begin, end).
  * Works on arbitrary memory (zero bytes are not special) and uses SSE2 to examine 16 bytes at a time.
  *
  * quote      - the quote character to look for (template parameter);
  * begin, end - the half-open range to search; begin must not be greater than end.
  * Returns a pointer to the first matching byte, or end if there is none.
  */
template <char quote>
static inline const char * find_first_quote_or_backslash(const char * begin, const char * end)
{
    const auto is_special = [](char c) { return c == quote || c == '\\'; };

    /// Scalar prologue: advance byte by byte until begin is 16-byte aligned (or the range ends).
    for (; (reinterpret_cast<ptrdiff_t>(begin) & 0x0F) && begin < end; ++begin)
        if (is_special(*begin))
            return begin;

    /// Broadcast the needles with _mm_set1_epi8 instead of loading them from char arrays:
    ///  a char array carries no 16-byte alignment guarantee, so reading it through __m128i * is not safe.
    const __m128i quote_128 = _mm_set1_epi8(quote);
    const __m128i bs_128 = _mm_set1_epi8('\\');

    /// Main loop: 16 bytes at a time. The distance is compared (rather than begin + 15 < end)
    ///  so that no pointer beyond the end of the range is ever formed.
    for (; end - begin >= 16; begin += 16)
    {
        /// begin is 16-byte aligned here thanks to the prologue.
        const __m128i bytes = _mm_load_si128(reinterpret_cast<const __m128i *>(begin));

        const __m128i matches = _mm_or_si128(
            _mm_cmpeq_epi8(bytes, quote_128),
            _mm_cmpeq_epi8(bytes, bs_128));

        /// One bit per byte; the lowest set bit corresponds to the first matching byte.
        const int bit_mask = _mm_movemask_epi8(matches);

        if (bit_mask)
            return begin + __builtin_ctz(bit_mask);
    }

    /// Scalar epilogue: fewer than 16 bytes remain.
    for (; begin < end; ++begin)
        if (is_special(*begin))
            return begin;

    return end;
}
|
|
|
|
|
|
|
|
|
|
|
2011-06-15 18:54:18 +00:00
|
|
|
|
template <char quote>
|
|
|
|
|
static void readAnyQuotedString(String & s, ReadBuffer & buf)
|
2010-06-04 18:25:25 +00:00
|
|
|
|
{
|
|
|
|
|
s = "";
|
|
|
|
|
|
2011-06-15 18:54:18 +00:00
|
|
|
|
if (buf.eof() || *buf.position() != quote)
|
|
|
|
|
throw Exception("Cannot parse quoted string: expected opening quote",
|
2010-06-04 18:25:25 +00:00
|
|
|
|
ErrorCodes::CANNOT_PARSE_QUOTED_STRING);
|
|
|
|
|
++buf.position();
|
|
|
|
|
|
|
|
|
|
while (!buf.eof())
|
|
|
|
|
{
|
2013-01-05 10:07:01 +00:00
|
|
|
|
const char * next_pos = find_first_quote_or_backslash<quote>(buf.position(), buf.buffer().end());
|
2010-06-04 18:25:25 +00:00
|
|
|
|
|
2013-01-05 10:07:01 +00:00
|
|
|
|
s.append(buf.position(), next_pos - buf.position());
|
|
|
|
|
buf.position() += next_pos - buf.position();
|
2015-02-07 23:13:04 +00:00
|
|
|
|
|
|
|
|
|
if (!buf.hasPendingData())
|
2011-12-26 02:17:33 +00:00
|
|
|
|
continue;
|
|
|
|
|
|
2011-06-15 18:54:18 +00:00
|
|
|
|
if (*buf.position() == quote)
|
2010-06-04 18:25:25 +00:00
|
|
|
|
{
|
|
|
|
|
++buf.position();
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (*buf.position() == '\\')
|
2015-11-25 03:11:17 +00:00
|
|
|
|
parseComplexEscapeSequence(s, buf);
|
2010-06-04 18:25:25 +00:00
|
|
|
|
}
|
|
|
|
|
|
2011-06-15 18:54:18 +00:00
|
|
|
|
throw Exception("Cannot parse quoted string: expected closing quote",
|
2010-06-04 18:25:25 +00:00
|
|
|
|
ErrorCodes::CANNOT_PARSE_QUOTED_STRING);
|
|
|
|
|
}
|
|
|
|
|
|
2011-06-15 18:54:18 +00:00
|
|
|
|
|
|
|
|
|
/// Reads a string enclosed in single quotes ('...'); see readAnyQuotedString.
void readQuotedString(String & s, ReadBuffer & buf)
{
    constexpr char single_quote = '\'';
    readAnyQuotedString<single_quote>(s, buf);
}
|
|
|
|
|
|
|
|
|
|
/// Reads a string enclosed in double quotes ("..."); see readAnyQuotedString.
void readDoubleQuotedString(String & s, ReadBuffer & buf)
{
    constexpr char double_quote = '"';
    readAnyQuotedString<double_quote>(s, buf);
}
|
|
|
|
|
|
2011-11-01 17:57:37 +00:00
|
|
|
|
/// Reads a string enclosed in back quotes (`...`); see readAnyQuotedString.
void readBackQuotedString(String & s, ReadBuffer & buf)
{
    constexpr char back_quote = '`';
    readAnyQuotedString<back_quote>(s, buf);
}
|
|
|
|
|
|
2012-05-08 05:42:05 +00:00
|
|
|
|
|
2015-04-01 02:55:52 +00:00
|
|
|
|
void readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf)
|
|
|
|
|
{
|
2015-10-21 19:04:02 +00:00
|
|
|
|
static constexpr auto DATE_TIME_BROKEN_DOWN_LENGTH = 19;
|
|
|
|
|
static constexpr auto UNIX_TIMESTAMP_MAX_LENGTH = 10;
|
2015-04-01 02:55:52 +00:00
|
|
|
|
|
2015-10-21 19:04:02 +00:00
|
|
|
|
char s[DATE_TIME_BROKEN_DOWN_LENGTH];
|
|
|
|
|
char * s_pos = s;
|
|
|
|
|
|
|
|
|
|
/// Кусок, похожий на unix timestamp.
|
|
|
|
|
while (s_pos < s + UNIX_TIMESTAMP_MAX_LENGTH && !buf.eof() && *buf.position() >= '0' && *buf.position() <= '9')
|
2015-04-01 02:55:52 +00:00
|
|
|
|
{
|
2015-10-21 19:04:02 +00:00
|
|
|
|
*s_pos = *buf.position();
|
|
|
|
|
++s_pos;
|
|
|
|
|
++buf.position();
|
2015-04-01 02:55:52 +00:00
|
|
|
|
}
|
|
|
|
|
|
2015-10-21 19:04:02 +00:00
|
|
|
|
/// 2015-01-01 01:02:03
|
|
|
|
|
if (s_pos == s + 4 && !buf.eof() && (*buf.position() < '0' || *buf.position() > '9'))
|
2015-04-01 02:55:52 +00:00
|
|
|
|
{
|
2015-10-21 19:04:02 +00:00
|
|
|
|
const size_t remaining_size = DATE_TIME_BROKEN_DOWN_LENGTH - (s_pos - s);
|
|
|
|
|
size_t size = buf.read(s_pos, remaining_size);
|
|
|
|
|
if (remaining_size != size)
|
2015-04-01 02:55:52 +00:00
|
|
|
|
{
|
2015-10-21 19:04:02 +00:00
|
|
|
|
s_pos[size] = 0;
|
2015-04-01 02:55:52 +00:00
|
|
|
|
throw Exception(std::string("Cannot parse datetime ") + s, ErrorCodes::CANNOT_PARSE_DATETIME);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0');
|
|
|
|
|
UInt8 month = (s[5] - '0') * 10 + (s[6] - '0');
|
|
|
|
|
UInt8 day = (s[8] - '0') * 10 + (s[9] - '0');
|
|
|
|
|
|
|
|
|
|
UInt8 hour = (s[11] - '0') * 10 + (s[12] - '0');
|
|
|
|
|
UInt8 minute = (s[14] - '0') * 10 + (s[15] - '0');
|
|
|
|
|
UInt8 second = (s[17] - '0') * 10 + (s[18] - '0');
|
|
|
|
|
|
|
|
|
|
if (unlikely(year == 0))
|
|
|
|
|
datetime = 0;
|
|
|
|
|
else
|
|
|
|
|
datetime = DateLUT::instance().makeDateTime(year, month, day, hour, minute, second);
|
|
|
|
|
}
|
|
|
|
|
else
|
2015-10-21 19:04:02 +00:00
|
|
|
|
datetime = parse<time_t>(s, s_pos - s);
|
2015-04-01 02:55:52 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
2012-05-08 05:42:05 +00:00
|
|
|
|
/** Reads a serialized exception from buf and stores it into e.
  *
  * The fields are read with readBinary in this order: code, name, message, stack trace, has_nested flag.
  * The resulting message is: "<additional_message>. " (if not empty), then "<name>. "
  *  (unless the name is "DB::Exception"), then the message and the stack trace.
  * If the has_nested flag is set, the nested exception is read recursively and attached.
  */
void readException(Exception & e, ReadBuffer & buf, const String & additional_message)
{
    int code = 0;
    String name;
    String message;
    String stack_trace;
    bool has_nested = false;

    readBinary(code, buf);
    readBinary(name, buf);
    readBinary(message, buf);
    readBinary(stack_trace, buf);
    readBinary(has_nested, buf);

    std::stringstream message_stream;

    if (!additional_message.empty())
        message_stream << additional_message << ". ";

    if (name != "DB::Exception")
        message_stream << name << ". ";

    message_stream << message << ". Stack trace:\n\n" << stack_trace;

    if (!has_nested)
    {
        e = Exception(message_stream.str(), code);
        return;
    }

    /// The nested exception follows immediately in the same format.
    Exception nested;
    readException(nested, buf);
    e = Exception(message_stream.str(), nested, code);
}
|
|
|
|
|
|
|
|
|
|
/// Reads a serialized exception from buf (see readException) and rethrows it.
void readAndThrowException(ReadBuffer & buf, const String & additional_message)
{
    Exception received;
    readException(received, buf, additional_message);
    received.rethrow();
}
|
|
|
|
|
|
2010-06-04 18:25:25 +00:00
|
|
|
|
}
|