From 562ea03ce70c5fe0b44298b18f898e87d9ab44a5 Mon Sep 17 00:00:00 2001 From: Mikhail Koviazin Date: Mon, 17 Jul 2023 12:54:34 +0300 Subject: [PATCH 1/2] FunctionsURL.h to StringHelpers.h and move out of URL/ --- src/Functions/{URL/FunctionsURL.h => StringHelpers.h} | 8 ++++---- src/Functions/URL/FirstSignificantSubdomainCustomImpl.h | 2 +- src/Functions/URL/basename.cpp | 2 +- src/Functions/URL/fragment.h | 2 +- src/Functions/URL/netloc.cpp | 3 +-- src/Functions/URL/path.cpp | 2 +- src/Functions/URL/path.h | 2 +- src/Functions/URL/pathFull.cpp | 2 +- src/Functions/URL/protocol.h | 3 +-- src/Functions/URL/queryString.h | 2 +- src/Functions/URL/queryStringAndFragment.h | 3 +-- 11 files changed, 14 insertions(+), 17 deletions(-) rename src/Functions/{URL/FunctionsURL.h => StringHelpers.h} (93%) diff --git a/src/Functions/URL/FunctionsURL.h b/src/Functions/StringHelpers.h similarity index 93% rename from src/Functions/URL/FunctionsURL.h rename to src/Functions/StringHelpers.h index 362042e31e1..a0f4d18aa80 100644 --- a/src/Functions/URL/FunctionsURL.h +++ b/src/Functions/StringHelpers.h @@ -7,8 +7,8 @@ namespace DB { -/** URL processing functions. See implementation in separate .cpp files. - * All functions are not strictly follow RFC, instead they are maximally simplified for performance reasons. +/** These helpers are used by URL processing functions. See implementation in separate .cpp files. + * All functions do not strictly follow RFC, instead they are maximally simplified for performance reasons. * * Functions for extraction parts of URL. * If URL has nothing like, then empty string is returned. 
@@ -101,7 +101,7 @@ struct ExtractSubstringImpl static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &) { - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Column of type FixedString is not supported by URL functions"); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Column of type FixedString is not supported by this function"); } }; @@ -156,7 +156,7 @@ struct CutSubstringImpl static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &) { - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Column of type FixedString is not supported by URL functions"); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Column of type FixedString is not supported this function"); } }; diff --git a/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h b/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h index 5862265ce7d..93691e35741 100644 --- a/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h +++ b/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h @@ -1,8 +1,8 @@ #pragma once #include -#include #include +#include #include #include #include diff --git a/src/Functions/URL/basename.cpp b/src/Functions/URL/basename.cpp index 6992f924ef2..bc747e2595f 100644 --- a/src/Functions/URL/basename.cpp +++ b/src/Functions/URL/basename.cpp @@ -1,7 +1,7 @@ #include #include +#include #include -#include "FunctionsURL.h" namespace DB { diff --git a/src/Functions/URL/fragment.h b/src/Functions/URL/fragment.h index 6c11d0fbb34..0414c4f64a2 100644 --- a/src/Functions/URL/fragment.h +++ b/src/Functions/URL/fragment.h @@ -1,7 +1,7 @@ #pragma once -#include "FunctionsURL.h" #include +#include namespace DB { diff --git a/src/Functions/URL/netloc.cpp b/src/Functions/URL/netloc.cpp index bc34e34a40d..bf3e8a471ef 100644 --- a/src/Functions/URL/netloc.cpp +++ b/src/Functions/URL/netloc.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include namespace DB @@ -154,4 +154,3 @@ REGISTER_FUNCTION(Netloc) } } - diff --git 
a/src/Functions/URL/path.cpp b/src/Functions/URL/path.cpp index ccc7dedb724..8d609f43191 100644 --- a/src/Functions/URL/path.cpp +++ b/src/Functions/URL/path.cpp @@ -1,6 +1,6 @@ #include #include -#include "FunctionsURL.h" +#include #include "path.h" #include diff --git a/src/Functions/URL/path.h b/src/Functions/URL/path.h index 89244659088..a0dd5eea2d3 100644 --- a/src/Functions/URL/path.h +++ b/src/Functions/URL/path.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include namespace DB diff --git a/src/Functions/URL/pathFull.cpp b/src/Functions/URL/pathFull.cpp index 002770e812a..9aacee21fed 100644 --- a/src/Functions/URL/pathFull.cpp +++ b/src/Functions/URL/pathFull.cpp @@ -1,6 +1,6 @@ #include #include -#include "FunctionsURL.h" +#include #include "path.h" #include diff --git a/src/Functions/URL/protocol.h b/src/Functions/URL/protocol.h index 74c0bb820b4..c1d83192835 100644 --- a/src/Functions/URL/protocol.h +++ b/src/Functions/URL/protocol.h @@ -1,7 +1,7 @@ #pragma once -#include "FunctionsURL.h" #include +#include namespace DB @@ -54,4 +54,3 @@ struct ExtractProtocol }; } - diff --git a/src/Functions/URL/queryString.h b/src/Functions/URL/queryString.h index a0777a5c9a1..603450d102b 100644 --- a/src/Functions/URL/queryString.h +++ b/src/Functions/URL/queryString.h @@ -1,7 +1,7 @@ #pragma once -#include "FunctionsURL.h" #include +#include namespace DB diff --git a/src/Functions/URL/queryStringAndFragment.h b/src/Functions/URL/queryStringAndFragment.h index ed19cd14b74..27008388e4d 100644 --- a/src/Functions/URL/queryStringAndFragment.h +++ b/src/Functions/URL/queryStringAndFragment.h @@ -1,7 +1,7 @@ #pragma once -#include "FunctionsURL.h" #include +#include namespace DB @@ -34,4 +34,3 @@ struct ExtractQueryStringAndFragment }; } - From 4f7fd69883fe01c9e6fe5661051cd1bc9ffc49f6 Mon Sep 17 00:00:00 2001 From: Mikhail Koviazin Date: Mon, 17 Jul 2023 14:58:27 +0300 Subject: [PATCH 2/2] Added function firstLine to extract the first line from a multiline string 
Fixes #51172 --- .../functions/string-functions.md | 33 +++++++++++++++ .../functions/string-functions.md | 36 ++++++++++++++++ src/Functions/StringHelpers.h | 2 +- src/Functions/firstLine.cpp | 42 +++++++++++++++++++ .../0_stateless/02815_first_line.reference | 9 ++++ .../queries/0_stateless/02815_first_line.sql | 12 ++++++ .../aspell-ignore/en/aspell-dict.txt | 1 + 7 files changed, 134 insertions(+), 1 deletion(-) create mode 100644 src/Functions/firstLine.cpp create mode 100644 tests/queries/0_stateless/02815_first_line.reference create mode 100644 tests/queries/0_stateless/02815_first_line.sql diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index 4f174a53ad6..9890d257e84 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -1267,3 +1267,36 @@ Like [initcap](#initcap), assuming that the string contains valid UTF-8 encoded Does not detect the language, e.g. for Turkish the result might not be exactly correct (i/İ vs. i/I). If the length of the UTF-8 byte sequence is different for upper and lower case of a code point, the result may be incorrect for this code point. + +## firstLine + +Returns the first line from a multi-line string. + +**Syntax** + +```sql +firstLine(val) +``` + +**Arguments** + +- `val` - Input value. [String](../data-types/string.md) + +**Returned value** + +- The first line of the input value or the whole value if there are no line + separators. 
[String](../data-types/string.md) + +**Example** + +```sql +select firstLine('foo\nbar\nbaz'); +``` + +Result: + +```result +┌─firstLine('foo\nbar\nbaz')─┐ +│ foo │ +└────────────────────────────┘ +``` diff --git a/docs/ru/sql-reference/functions/string-functions.md b/docs/ru/sql-reference/functions/string-functions.md index b872200f99b..276dfc2ef20 100644 --- a/docs/ru/sql-reference/functions/string-functions.md +++ b/docs/ru/sql-reference/functions/string-functions.md @@ -1124,3 +1124,39 @@ Do Nothing for 2 Minutes 2:00   Не учитывает язык. То есть, для турецкого языка, результат может быть не совсем верным. Если длина UTF-8 последовательности байтов различна для верхнего и нижнего регистра кодовой точки, то для этой кодовой точки результат работы может быть некорректным. Если строка содержит набор байтов, не являющийся UTF-8, то поведение не определено. + +## firstLine + +Возвращает первую строку в многострочном тексте. + +**Синтаксис** + +```sql +firstLine(val) +``` + +**Аргументы** + +- `val` - текст для обработки. [String](../data-types/string.md) + +**Возвращаемое значение** + +- Первая строка текста или весь текст, если переносы строк отсутствуют. 
+ +Тип: [String](../data-types/string.md) + +**Пример** + +Запрос: + +```sql +select firstLine('foo\nbar\nbaz'); +``` + +Результат: + +```result +┌─firstLine('foo\nbar\nbaz')─┐ +│ foo │ +└────────────────────────────┘ +``` diff --git a/src/Functions/StringHelpers.h b/src/Functions/StringHelpers.h index a0f4d18aa80..8f3a87d5d0e 100644 --- a/src/Functions/StringHelpers.h +++ b/src/Functions/StringHelpers.h @@ -156,7 +156,7 @@ struct CutSubstringImpl static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &) { - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Column of type FixedString is not supported this function"); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Column of type FixedString is not supported by this function"); } }; diff --git a/src/Functions/firstLine.cpp b/src/Functions/firstLine.cpp new file mode 100644 index 00000000000..20b47361d58 --- /dev/null +++ b/src/Functions/firstLine.cpp @@ -0,0 +1,42 @@ +#include +#include +#include +#include + +namespace DB +{ + +struct FirstLine +{ + static size_t getReserveLengthForElement() { return 16; } + + static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size) + { + res_data = data; + + const Pos end = data + size; + const Pos pos = find_first_symbols<'\r', '\n'>(data, end); + res_size = pos - data; + } +}; + +struct NameFirstLine +{ + static constexpr auto name = "firstLine"; +}; + +using FunctionFirstLine = FunctionStringToString, NameFirstLine>; + +REGISTER_FUNCTION(FirstLine) +{ + factory.registerFunction(FunctionDocumentation{ + .description = "Returns first line of a multi-line string.", + .syntax = "firstLine(string)", + .arguments = {{.name = "string", .description = "The string to process."}}, + .returned_value = {"The first line of the string or the whole string if there is no line separators."}, + .examples = { + {.name = "Return first line", .query = "firstLine('Hello\\nWorld')", .result = "'Hello'"}, + {.name = "Return whole string", .query = 
"firstLine('Hello World')", .result = "'Hello World'"}, + }}); +} +} diff --git a/tests/queries/0_stateless/02815_first_line.reference b/tests/queries/0_stateless/02815_first_line.reference new file mode 100644 index 00000000000..cdc86229cc8 --- /dev/null +++ b/tests/queries/0_stateless/02815_first_line.reference @@ -0,0 +1,9 @@ +foo +foo +foo +foobarbaz +== vector +1 foo +2 quux +3 single line +4 windows diff --git a/tests/queries/0_stateless/02815_first_line.sql b/tests/queries/0_stateless/02815_first_line.sql new file mode 100644 index 00000000000..8c0affaebd3 --- /dev/null +++ b/tests/queries/0_stateless/02815_first_line.sql @@ -0,0 +1,12 @@ +select firstLine('foo\nbar\nbaz'); +select firstLine('foo\rbar\rbaz'); +select firstLine('foo\r\nbar\r\nbaz'); +select firstLine('foobarbaz'); + +select '== vector'; + +drop table if exists 02815_first_line_vector; +create table 02815_first_line_vector (n Int32, text String) engine = MergeTree order by n; + +insert into 02815_first_line_vector values (1, 'foo\nbar\nbaz'), (2, 'quux\n'), (3, 'single line'), (4, 'windows\r\nline breaks'); +select n, firstLine(text) from 02815_first_line_vector order by n; diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 6c88d63be49..57934b28728 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1428,6 +1428,7 @@ filesystemFree filesystems finalizeAggregation fips +firstLine firstSignificantSubdomain firstSignificantSubdomainCustom fixedstring