Experiment on parsing backslash escapes in a more convenient way #10922

This commit is contained in:
Alexey Milovidov 2020-05-26 17:24:20 +03:00
parent c858f4d89c
commit b2057159c1
5 changed files with 22 additions and 96 deletions

View File

@ -283,7 +283,9 @@ static void parseComplexEscapeSequence(Vector & s, ReadBuffer & buf)
if (buf.eof())
throw Exception("Cannot parse escape sequence", ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE);
if (*buf.position() == 'x')
char char_after_backslash = *buf.position();
if (char_after_backslash == 'x')
{
++buf.position();
/// escape sequence of the form \xAA
@ -291,7 +293,7 @@ static void parseComplexEscapeSequence(Vector & s, ReadBuffer & buf)
readPODBinary(hex_code, buf);
s.push_back(unhex2(hex_code));
}
else if (*buf.position() == 'N')
else if (char_after_backslash == 'N')
{
/// Support for NULLs: \N sequence must be parsed as empty string.
++buf.position();
@ -299,7 +301,15 @@ static void parseComplexEscapeSequence(Vector & s, ReadBuffer & buf)
else
{
/// The usual escape sequence of a single character.
s.push_back(parseEscapeSequence(*buf.position()));
char decoded_char = parseEscapeSequence(char_after_backslash);
/// For convenience when using LIKE and regular expressions,
/// we keep the backslash when the user writes something like 'Hello 100\%':
/// it is parsed as Hello 100\% instead of Hello 100%.
if (decoded_char != '\\' && !isControlASCII(decoded_char))
s.push_back('\\');
s.push_back(decoded_char);
++buf.position();
}
}

View File

@ -37,9 +37,6 @@ target_link_libraries (parse_int_perf2 PRIVATE clickhouse_common_io)
add_executable (read_write_int read_write_int.cpp)
target_link_libraries (read_write_int PRIVATE clickhouse_common_io)
add_executable (mempbrk mempbrk.cpp)
target_link_libraries (mempbrk PRIVATE clickhouse_common_io)
add_executable (o_direct_and_dirty_pages o_direct_and_dirty_pages.cpp)
target_link_libraries (o_direct_and_dirty_pages PRIVATE clickhouse_common_io)

View File

@ -1,90 +0,0 @@
#include <string>
#include <iostream>
#include <iomanip>
#include <Common/Stopwatch.h>
#include <Core/Types.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadBufferFromFileDescriptor.h>
#include <IO/WriteBufferFromFileDescriptor.h>
#include <common/find_symbols.h>
/// Declaration of the error code thrown by test::readEscapedString when the input
/// ends right after a backslash. Only declared here (extern); the definition is
/// expected to come from another translation unit.
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_PARSE_ESCAPE_SEQUENCE;
}
}
namespace test
{
    /** Reads one field from `buf` into `s`, decoding backslash escape sequences.
      *
      * The previous contents of `s` are discarded. Reading stops in front of the first
      * unescaped '\t' or '\n' (the terminator itself is left in the buffer for the caller),
      * or when the buffer reaches EOF.
      *
      * Throws DB::Exception with CANNOT_PARSE_ESCAPE_SEQUENCE if the input ends
      * immediately after a backslash.
      */
    static void readEscapedString(DB::String & s, DB::ReadBuffer & buf)
    {
        s.clear();

        while (!buf.eof())
        {
            /// Jump to the next "interesting" byte within the currently available part of the buffer.
            const char * next_pos = find_first_symbols<'\b', '\f', '\n', '\r', '\t', '\0', '\\'>(buf.position(), buf.buffer().end());

            /// Everything before it is plain data: copy it in one go.
            s.append(buf.position(), next_pos - buf.position());
            buf.position() += next_pos - buf.position();

            /// Nothing special found in this chunk; go around so that eof() is checked again.
            if (!buf.hasPendingData())
                continue;

            const char current = *buf.position();

            /// Field terminators are not consumed here.
            if (current == '\t' || current == '\n')
                return;

            if (current == '\\')
            {
                ++buf.position();
                if (buf.eof())
                    throw DB::Exception("Cannot parse escape sequence", DB::ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE);

                s += DB::parseEscapeSequence(*buf.position());
                ++buf.position();
            }
            else
            {
                /// '\b', '\f', '\r' or '\0': the search stops on these bytes, but they neither
                /// terminate the field nor start an escape sequence. Copy the byte verbatim and
                /// step over it; without this the position never advances and the loop spins forever.
                s += current;
                ++buf.position();
            }
        }
    }
}
int main(int, char **)
{
try
{
DB::ReadBufferFromFileDescriptor in(STDIN_FILENO);
// DB::WriteBufferFromFileDescriptor out(STDOUT_FILENO);
std::string s;
size_t rows = 0;
Stopwatch watch;
while (!in.eof())
{
test::readEscapedString(s, in);
in.ignore();
++rows;
/* DB::writeEscapedString(s, out);
DB::writeChar('\n', out);*/
}
watch.stop();
std::cerr << std::fixed << std::setprecision(2)
<< "Read " << rows << " rows (" << in.count() / 1000000.0 << " MB) in " << watch.elapsedSeconds() << " sec., "
<< rows / watch.elapsedSeconds() << " rows/sec. (" << in.count() / watch.elapsedSeconds() / 1000000 << " MB/s.)"
<< std::endl;
}
catch (const DB::Exception & e)
{
std::cerr << e.what() << ", " << e.displayText() << std::endl;
return 1;
}
return 0;
}

View File

@ -0,0 +1,3 @@
a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s\\e
1 0 1 1
1 1 0 0 0

View File

@ -0,0 +1,6 @@
-- String literal written with single backslashes before each character vs. the same literal with doubled backslashes.
SELECT 'a\_\c\l\i\c\k\h\o\u\s\e', 'a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s\\e';
-- LIKE with a bare '_' in the pattern vs. '\_' written with one or two backslashes in the literal.
select 'aXb' like 'a_b', 'aXb' like 'a\_b', 'a_b' like 'a\_b', 'a_b' like 'a\\_b';
-- match() with the pattern \w+ written using different numbers of backslashes, plus a pattern without any backslash.
SELECT match('Hello', '\w+'), match('Hello', '\\w+'), match('Hello', '\\\w+'), match('Hello', '\w\+'), match('Hello', 'w+');
-- The two queries below are expected to fail with server error 427.
SELECT match('Hello', '\He\l\l\o'); -- { serverError 427 }
SELECT match('Hello', '\H\e\l\l\o'); -- { serverError 427 }