From b2057159c1d12fb016b7918b7a00b11ee73967fc Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 26 May 2020 17:24:20 +0300 Subject: [PATCH] Experiment on parsing backslash escapes in a more convenient way #10922 --- src/IO/ReadHelpers.cpp | 16 +++- src/IO/tests/CMakeLists.txt | 3 - src/IO/tests/mempbrk.cpp | 90 ------------------- ...escape_sequences_php_mysql_style.reference | 3 + ...01284_escape_sequences_php_mysql_style.sql | 6 ++ 5 files changed, 22 insertions(+), 96 deletions(-) delete mode 100644 src/IO/tests/mempbrk.cpp create mode 100644 tests/queries/0_stateless/01284_escape_sequences_php_mysql_style.reference create mode 100644 tests/queries/0_stateless/01284_escape_sequences_php_mysql_style.sql diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index dccb413af2c..65c3cfab55a 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -283,7 +283,9 @@ static void parseComplexEscapeSequence(Vector & s, ReadBuffer & buf) if (buf.eof()) throw Exception("Cannot parse escape sequence", ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE); - if (*buf.position() == 'x') + char char_after_backslash = *buf.position(); + + if (char_after_backslash == 'x') { ++buf.position(); /// escape sequence of the form \xAA @@ -291,7 +293,7 @@ static void parseComplexEscapeSequence(Vector & s, ReadBuffer & buf) readPODBinary(hex_code, buf); s.push_back(unhex2(hex_code)); } - else if (*buf.position() == 'N') + else if (char_after_backslash == 'N') { /// Support for NULLs: \N sequence must be parsed as empty string. ++buf.position(); @@ -299,7 +301,15 @@ static void parseComplexEscapeSequence(Vector & s, ReadBuffer & buf) else { /// The usual escape sequence of a single character. 
- s.push_back(parseEscapeSequence(*buf.position())); + char decoded_char = parseEscapeSequence(char_after_backslash); + + /// For convenience when using LIKE and regular expressions, + /// we keep the backslash when the user writes something like 'Hello 100\%': + /// it is parsed as Hello 100\% instead of Hello 100% + if (decoded_char != '\\' && !isControlASCII(decoded_char)) + s.push_back('\\'); + + s.push_back(decoded_char); ++buf.position(); } } diff --git a/src/IO/tests/CMakeLists.txt b/src/IO/tests/CMakeLists.txt index 2767ce6e271..dfbbfa77853 100644 --- a/src/IO/tests/CMakeLists.txt +++ b/src/IO/tests/CMakeLists.txt @@ -37,9 +37,6 @@ target_link_libraries (parse_int_perf2 PRIVATE clickhouse_common_io) add_executable (read_write_int read_write_int.cpp) target_link_libraries (read_write_int PRIVATE clickhouse_common_io) -add_executable (mempbrk mempbrk.cpp) -target_link_libraries (mempbrk PRIVATE clickhouse_common_io) - add_executable (o_direct_and_dirty_pages o_direct_and_dirty_pages.cpp) target_link_libraries (o_direct_and_dirty_pages PRIVATE clickhouse_common_io) diff --git a/src/IO/tests/mempbrk.cpp b/src/IO/tests/mempbrk.cpp deleted file mode 100644 index 55097d989af..00000000000 --- a/src/IO/tests/mempbrk.cpp +++ /dev/null @@ -1,90 +0,0 @@ -#include -#include -#include - -#include - -#include -#include -#include -#include -#include - -#include - -namespace DB -{ -namespace ErrorCodes -{ - extern const int CANNOT_PARSE_ESCAPE_SEQUENCE; -} -} - - -namespace test -{ -static void readEscapedString(DB::String & s, DB::ReadBuffer & buf) - { - s = ""; - while (!buf.eof()) - { - const char * next_pos = find_first_symbols<'\b', '\f', '\n', '\r', '\t', '\0', '\\'>(buf.position(), buf.buffer().end()); - - s.append(buf.position(), next_pos - buf.position()); - buf.position() += next_pos - buf.position(); - - if (!buf.hasPendingData()) - continue; - - if (*buf.position() == '\t' || *buf.position() == '\n') - return; - - if (*buf.position() == '\\') - { - ++buf.position(); - if 
(buf.eof()) - throw DB::Exception("Cannot parse escape sequence", DB::ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE); - s += DB::parseEscapeSequence(*buf.position()); - ++buf.position(); - } - } - } -} - - -int main(int, char **) -{ - try - { - DB::ReadBufferFromFileDescriptor in(STDIN_FILENO); -// DB::WriteBufferFromFileDescriptor out(STDOUT_FILENO); - std::string s; - size_t rows = 0; - - Stopwatch watch; - - while (!in.eof()) - { - test::readEscapedString(s, in); - in.ignore(); - - ++rows; - -/* DB::writeEscapedString(s, out); - DB::writeChar('\n', out);*/ - } - - watch.stop(); - std::cerr << std::fixed << std::setprecision(2) - << "Read " << rows << " rows (" << in.count() / 1000000.0 << " MB) in " << watch.elapsedSeconds() << " sec., " - << rows / watch.elapsedSeconds() << " rows/sec. (" << in.count() / watch.elapsedSeconds() / 1000000 << " MB/s.)" - << std::endl; - } - catch (const DB::Exception & e) - { - std::cerr << e.what() << ", " << e.displayText() << std::endl; - return 1; - } - - return 0; -} diff --git a/tests/queries/0_stateless/01284_escape_sequences_php_mysql_style.reference b/tests/queries/0_stateless/01284_escape_sequences_php_mysql_style.reference new file mode 100644 index 00000000000..19a5fc680ff --- /dev/null +++ b/tests/queries/0_stateless/01284_escape_sequences_php_mysql_style.reference @@ -0,0 +1,3 @@ +a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s\\e +1 0 1 1 +1 1 0 0 0 diff --git a/tests/queries/0_stateless/01284_escape_sequences_php_mysql_style.sql b/tests/queries/0_stateless/01284_escape_sequences_php_mysql_style.sql new file mode 100644 index 00000000000..36ad7b4506c --- /dev/null +++ b/tests/queries/0_stateless/01284_escape_sequences_php_mysql_style.sql @@ -0,0 +1,6 @@ +SELECT 'a\_\c\l\i\c\k\h\o\u\s\e', 'a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s\\e'; +select 'aXb' like 'a_b', 'aXb' like 'a\_b', 'a_b' like 'a\_b', 'a_b' like 'a\\_b'; +SELECT match('Hello', '\w+'), match('Hello', '\\w+'), match('Hello', '\\\w+'), match('Hello', 
'\w\+'), match('Hello', 'w+'); + +SELECT match('Hello', '\He\l\l\o'); -- { serverError 427 } +SELECT match('Hello', '\H\e\l\l\o'); -- { serverError 427 }