Merge pull request #56327 from rschu1ze/parseQueryOrNull

Improve `formatQuery()`
This commit is contained in:
Robert Schulze 2023-11-06 22:48:16 +01:00 committed by GitHub
commit 8846cc6770
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 230 additions and 55 deletions

View File

@ -2760,10 +2760,13 @@ message Root
Returns a formatted, possibly multi-line, version of the given SQL query.
Throws an exception if the query is not well-formed. To return `NULL` instead, function `formatQueryOrNull()` may be used.
**Syntax**
```sql
formatQuery(query)
formatQueryOrNull(query)
```
**Arguments**
@ -2796,10 +2799,13 @@ WHERE (a > 3) AND (b < 3) │
Like formatQuery() but the returned formatted string contains no line breaks.
Throws an exception if the query is not well-formed. To return `NULL` instead, function `formatQuerySingleLineOrNull()` may be used.
**Syntax**
```sql
formatQuerySingleLine(query)
formatQuerySingleLineOrNull(query)
```
**Arguments**

View File

@ -1,7 +1,9 @@
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnString.h>
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <IO/WriteBufferFromVector.h>
#include <IO/WriteBufferFromString.h>
#include <Interpreters/Context.h>
#include <Parsers/ParserQuery.h>
#include <Parsers/formatAST.h>
@ -15,7 +17,19 @@ namespace ErrorCodes
extern const int ILLEGAL_COLUMN;
}
template <bool one_line, typename Name>
enum class OutputFormatting
{
SingleLine,
MultiLine
};
enum class ErrorHandling
{
Exception,
Null
};
template <OutputFormatting output_formatting, ErrorHandling error_handling, typename Name>
class FunctionFormatQuery : public IFunction
{
public:
@ -27,70 +41,127 @@ public:
}
FunctionFormatQuery(size_t max_query_size_, size_t max_parser_depth_)
: max_query_size(max_query_size_), max_parser_depth(max_parser_depth_)
: max_query_size(max_query_size_)
, max_parser_depth(max_parser_depth_)
{
}
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 1; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
bool useDefaultImplementationForConstants() const override { return true; }
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
FunctionArgumentDescriptors mandatory_args{{"query", &isString<IDataType>, nullptr, "String"}};
validateFunctionArgumentTypes(*this, arguments, mandatory_args);
return arguments[0].type;
FunctionArgumentDescriptors args{
{"query", &isString<IDataType>, nullptr, "String"}
};
validateFunctionArgumentTypes(*this, arguments, args);
DataTypePtr string_type = std::make_shared<DataTypeString>();
if constexpr (error_handling == ErrorHandling::Null)
return std::make_shared<DataTypeNullable>(string_type);
else
return string_type;
}
bool useDefaultImplementationForConstants() const override { return true; }
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
{
const ColumnPtr column = arguments[0].column;
if (const ColumnString * col = checkAndGetColumn<ColumnString>(column.get()))
const ColumnPtr col_query = arguments[0].column;
ColumnUInt8::MutablePtr col_null_map;
if constexpr (error_handling == ErrorHandling::Null)
col_null_map = ColumnUInt8::create(input_rows_count, 0);
if (const ColumnString * col_query_string = checkAndGetColumn<ColumnString>(col_query.get()))
{
auto col_res = ColumnString::create();
formatVector(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets());
return col_res;
formatVector(col_query_string->getChars(), col_query_string->getOffsets(), col_res->getChars(), col_res->getOffsets(), col_null_map);
if constexpr (error_handling == ErrorHandling::Null)
return ColumnNullable::create(std::move(col_res), std::move(col_null_map));
else
return col_res;
}
else
throw Exception(
ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}", arguments[0].column->getName(), getName());
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}", col_query->getName(), getName());
}
private:
void formatQueryImpl(const char * begin, const char * end, ColumnString::Chars & output) const
{
ParserQuery parser{end};
auto ast = parseQuery(parser, begin, end, {}, max_query_size, max_parser_depth);
WriteBufferFromVector buf(output, AppendModeTag{});
formatAST(*ast, buf, /* hilite */ false, /* one_line */ one_line);
buf.finalize();
}
void formatVector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets) const
ColumnString::Offsets & res_offsets,
ColumnUInt8::MutablePtr & res_null_map) const
{
const size_t size = offsets.size();
res_offsets.resize(size);
res_data.reserve(data.size());
res_data.resize(data.size());
size_t prev_offset = 0;
size_t res_data_size = 0;
size_t prev_in_offset = 0;
for (size_t i = 0; i < size; ++i)
{
const auto * begin = reinterpret_cast<const char *>(&data[prev_in_offset]);
const char * end = begin + offsets[i] - 1;
formatQueryImpl(begin, end, res_data);
res_offsets[i] = res_data.size() + 1;
prev_in_offset = offsets[i];
const char * begin = reinterpret_cast<const char *>(&data[prev_offset]);
const char * end = begin + offsets[i] - prev_offset - 1;
ParserQuery parser(end);
ASTPtr ast;
WriteBufferFromOwnString buf;
try
{
ast = parseQuery(parser, begin, end, /*query_description*/ {}, max_query_size, max_parser_depth);
}
catch (...)
{
if constexpr (error_handling == ErrorHandling::Null)
{
const size_t res_data_new_size = res_data_size + 1;
if (res_data_new_size > res_data.size())
res_data.resize(2 * res_data_new_size);
res_data[res_data_size] = '\0';
res_data_size += 1;
res_offsets[i] = res_data_size;
prev_offset = offsets[i];
res_null_map->getData()[i] = 1;
continue;
}
else
{
static_assert(error_handling == ErrorHandling::Exception);
throw;
}
}
formatAST(*ast, buf, /*hilite*/ false, /*single_line*/ output_formatting == OutputFormatting::SingleLine);
auto formatted = buf.stringView();
const size_t res_data_new_size = res_data_size + formatted.size() + 1;
if (res_data_new_size > res_data.size())
res_data.resize(2 * res_data_new_size);
memcpy(&res_data[res_data_size], formatted.begin(), formatted.size());
res_data_size += formatted.size();
res_data[res_data_size] = '\0';
res_data_size += 1;
res_offsets[i] = res_data_size;
prev_offset = offsets[i];
}
res_data.resize(res_data_size);
}
size_t max_query_size;
size_t max_parser_depth;
const size_t max_query_size;
const size_t max_parser_depth;
};
struct NameFormatQuery
@ -98,15 +169,25 @@ struct NameFormatQuery
static constexpr auto name = "formatQuery";
};
struct NameFormatQueryOrNull
{
static constexpr auto name = "formatQueryOrNull";
};
struct NameFormatQuerySingleLine
{
static constexpr auto name = "formatQuerySingleLine";
};
struct NameFormatQuerySingleLineOrNull
{
static constexpr auto name = "formatQuerySingleLineOrNull";
};
REGISTER_FUNCTION(formatQuery)
{
factory.registerFunction<FunctionFormatQuery<false, NameFormatQuery>>(FunctionDocumentation{
.description = "Returns a formatted, possibly multi-line, version of the given SQL query.\n[example:multiline]",
factory.registerFunction<FunctionFormatQuery<OutputFormatting::MultiLine, ErrorHandling::Exception, NameFormatQuery>>(FunctionDocumentation{
.description = "Returns a formatted, possibly multi-line, version of the given SQL query. Throws in case of a parsing error.\n[example:multiline]",
.syntax = "formatQuery(query)",
.arguments = {{"query", "The SQL query to be formatted. [String](../../sql-reference/data-types/string.md)"}},
.returned_value = "The formatted query. [String](../../sql-reference/data-types/string.md).",
@ -121,10 +202,28 @@ REGISTER_FUNCTION(formatQuery)
.categories{"Other"}});
}
REGISTER_FUNCTION(formatQueryOrNull)
{
factory.registerFunction<FunctionFormatQuery<OutputFormatting::MultiLine, ErrorHandling::Null, NameFormatQueryOrNull>>(FunctionDocumentation{
.description = "Returns a formatted, possibly multi-line, version of the given SQL query. Returns NULL in case of a parsing error.\n[example:multiline]",
.syntax = "formatQueryOrNull(query)",
.arguments = {{"query", "The SQL query to be formatted. [String](../../sql-reference/data-types/string.md)"}},
.returned_value = "The formatted query. [String](../../sql-reference/data-types/string.md).",
.examples{
{"multiline",
"SELECT formatQuery('select a, b FRom tab WHERE a > 3 and b < 3');",
"SELECT\n"
" a,\n"
" b\n"
"FROM tab\n"
"WHERE (a > 3) AND (b < 3)"}},
.categories{"Other"}});
}
REGISTER_FUNCTION(formatQuerySingleLine)
{
factory.registerFunction<FunctionFormatQuery<true, NameFormatQuerySingleLine>>(FunctionDocumentation{
.description = "Like formatQuery() but the returned formatted string contains no line breaks.\n[example:multiline]",
factory.registerFunction<FunctionFormatQuery<OutputFormatting::SingleLine, ErrorHandling::Exception, NameFormatQuerySingleLine>>(FunctionDocumentation{
.description = "Like formatQuery() but the returned formatted string contains no line breaks. Throws in case of a parsing error.\n[example:multiline]",
.syntax = "formatQuerySingleLine(query)",
.arguments = {{"query", "The SQL query to be formatted. [String](../../sql-reference/data-types/string.md)"}},
.returned_value = "The formatted query. [String](../../sql-reference/data-types/string.md).",
@ -134,4 +233,19 @@ REGISTER_FUNCTION(formatQuerySingleLine)
"SELECT a, b FROM tab WHERE (a > 3) AND (b < 3)"}},
.categories{"Other"}});
}
REGISTER_FUNCTION(formatQuerySingleLineOrNull)
{
factory.registerFunction<FunctionFormatQuery<OutputFormatting::SingleLine, ErrorHandling::Null, NameFormatQuerySingleLineOrNull>>(FunctionDocumentation{
.description = "Like formatQuery() but the returned formatted string contains no line breaks. Returns NULL in case of a parsing error.\n[example:multiline]",
.syntax = "formatQuerySingleLineOrNull(query)",
.arguments = {{"query", "The SQL query to be formatted. [String](../../sql-reference/data-types/string.md)"}},
.returned_value = "The formatted query. [String](../../sql-reference/data-types/string.md).",
.examples{
{"multiline",
"SELECT formatQuerySingleLine('select a, b FRom tab WHERE a > 3 and b < 3');",
"SELECT a, b FROM tab WHERE (a > 3) AND (b < 3)"}},
.categories{"Other"}});
}
}

View File

@ -1,19 +1,44 @@
-- formatQuery
SELECT 1
SELECT 1
SELECT 1
SELECT 1
1
1
INSERT INTO tab FORMAT Values
CREATE TABLE default.no_prop_table\n(\n `some_column` UInt64\n)\nENGINE = MergeTree\nORDER BY tuple()\nSETTINGS index_granularity = 8192
EXPLAIN SYNTAX\nSELECT\n CAST(1, \'INT\'),\n CEIL(1),\n CEILING(1),\n CHAR(49),\n CHAR_LENGTH(\'1\'),\n CHARACTER_LENGTH(\'1\'),\n COALESCE(1),\n CONCAT(\'1\', \'1\'),\n CORR(1, 1),\n COS(1),\n COUNT(1),\n COVAR_POP(1, 1),\n COVAR_SAMP(1, 1),\n DATABASE(),\n SCHEMA(),\n dateDiff(\'DAY\', toDate(\'2020-10-24\'), toDate(\'2019-10-24\')),\n EXP(1),\n FLATTEN([[1]]),\n FLOOR(1),\n FQDN(),\n GREATEST(1),\n IF(1, 1, 1),\n IFNULL(1, 1),\n LCASE(\'A\'),\n LEAST(1),\n LENGTH(\'1\'),\n LN(1),\n LOCATE(\'1\', \'1\'),\n LOG(1),\n LOG10(1),\n LOG2(1),\n LOWER(\'A\'),\n MAX(1),\n MID(\'123\', 1, 1),\n MIN(1),\n MOD(1, 1),\n NOT 1,\n NOW(),\n NOW64(),\n NULLIF(1, 1),\n PI(),\n position(\'123\', \'2\'),\n POW(1, 1),\n POWER(1, 1),\n RAND(),\n REPLACE(\'1\', \'1\', \'2\'),\n REVERSE(\'123\'),\n ROUND(1),\n SIN(1),\n SQRT(1),\n STDDEV_POP(1),\n STDDEV_SAMP(1),\n SUBSTR(\'123\', 2),\n substring(\'123\', 2),\n SUM(1),\n TAN(1),\n TANH(1),\n TRUNC(1),\n TRUNCATE(1),\n UCASE(\'A\'),\n UPPER(\'A\'),\n USER(),\n VAR_POP(1),\n VAR_SAMP(1),\n WEEK(toDate(\'2020-10-24\')),\n YEARWEEK(toDate(\'2020-10-24\'))\nFORMAT TSVRaw
1 SELECT 1 SELECT 1
2 SeLeCt 22 SELECT 22
3 InSerT into TAB values (\'\') INSERT INTO TAB FORMAT Values
1 SELECT 1 SELECT 1
2 SeLeCt 22 SELECT 22
3 InSerT into TAB values (\'\') INSERT INTO TAB FORMAT Values
1 SELECT 1 SELECT 1
2 SeLeCt 2 SELECT 2
3 bad 3 \N
4 select 4 SELECT 4
5 bad 5 \N
6 \N
7 SELECT 7 SELECT 7
-- formatQuerySingleLine
SELECT 1
SELECT 1
SELECT 1
1
formatQuerySingleLine
SELECT 1
SELECT 1
SELECT 1
SELECT 1
1
INSERT INTO tab FORMAT Values
CREATE TABLE default.no_prop_table (`some_column` UInt64) ENGINE = MergeTree ORDER BY tuple() SETTINGS index_granularity = 8192
EXPLAIN SYNTAX SELECT CAST(1, \'INT\'), CEIL(1), CEILING(1), CHAR(49), CHAR_LENGTH(\'1\'), CHARACTER_LENGTH(\'1\'), COALESCE(1), CONCAT(\'1\', \'1\'), CORR(1, 1), COS(1), COUNT(1), COVAR_POP(1, 1), COVAR_SAMP(1, 1), DATABASE(), SCHEMA(), dateDiff(\'DAY\', toDate(\'2020-10-24\'), toDate(\'2019-10-24\')), EXP(1), FLATTEN([[1]]), FLOOR(1), FQDN(), GREATEST(1), IF(1, 1, 1), IFNULL(1, 1), LCASE(\'A\'), LEAST(1), LENGTH(\'1\'), LN(1), LOCATE(\'1\', \'1\'), LOG(1), LOG10(1), LOG2(1), LOWER(\'A\'), MAX(1), MID(\'123\', 1, 1), MIN(1), MOD(1, 1), NOT 1, NOW(), NOW64(), NULLIF(1, 1), PI(), position(\'123\', \'2\'), POW(1, 1), POWER(1, 1), RAND(), REPLACE(\'1\', \'1\', \'2\'), REVERSE(\'123\'), ROUND(1), SIN(1), SQRT(1), STDDEV_POP(1), STDDEV_SAMP(1), SUBSTR(\'123\', 2), substring(\'123\', 2), SUM(1), TAN(1), TANH(1), TRUNC(1), TRUNCATE(1), UCASE(\'A\'), UPPER(\'A\'), USER(), VAR_POP(1), VAR_SAMP(1), WEEK(toDate(\'2020-10-24\')), YEARWEEK(toDate(\'2020-10-24\')) FORMAT TSVRaw
1
1 SELECT 1 SELECT 1
2 SeLeCt 22 SELECT 22
3 InSerT into TAB values (\'\') INSERT INTO TAB FORMAT Values
1 SELECT 1 SELECT 1
2 SeLeCt 22 SELECT 22
3 InSerT into TAB values (\'\') INSERT INTO TAB FORMAT Values
1 SELECT 1 SELECT 1
2 SeLeCt 2 SELECT 2
3 bad 3 \N
4 select 4 SELECT 4
5 bad 5 \N
6 \N
7 SELECT 7 SELECT 7

View File

@ -1,21 +1,51 @@
SELECT formatQuery('select 1;');
SELECT formatQuery('select 1');
DROP TABLE IF EXISTS all_valid;
CREATE TABLE all_valid (id UInt64, query String) ENGINE=MergeTree ORDER BY id;
INSERT INTO all_valid VALUES (1, 'SELECT 1') (2, 'SeLeCt 22') (3, 'InSerT into TAB values (\'\')');
DROP TABLE IF EXISTS some_invalid;
CREATE TABLE some_invalid (id UInt64, query String) ENGINE=MergeTree ORDER BY id;
INSERT INTO some_invalid VALUES (1, 'SELECT 1') (2, 'SeLeCt 2') (3, 'bad 3') (4, 'select 4') (5, 'bad 5') (6, '') (7, 'SELECT 7');
SELECT '-- formatQuery';
SELECT formatQuery('SELECT 1;');
SELECT formatQuery('SELECT 1');
SELECT formatQuery('select 1;') == formatQuery('SeLecT 1');
SELECT formatQuery('SeLeCt 1;');
SELECT formatQuery('select 1;') == formatQuery('SeLeCt 1');
SELECT normalizedQueryHash(formatQuery('select 1')) = normalizedQueryHash(formatQuery('SELECT 1'));
SELECT formatQuery('INSERT INTO tab VALUES (\'\') (\'test\')');
SELECT formatQuery('CREATE TABLE default.no_prop_table(`some_column` UInt64) ENGINE = MergeTree ORDER BY tuple() SETTINGS index_granularity = 8192');
SELECT formatQuery('EXPLAIN SYNTAX SELECT CAST(1 AS INT), CEIL(1), CEILING(1), CHAR(49), CHAR_LENGTH(\'1\'), CHARACTER_LENGTH(\'1\'), COALESCE(1), CONCAT(\'1\', \'1\'), CORR(1, 1), COS(1), COUNT(1), COVAR_POP(1, 1), COVAR_SAMP(1, 1), DATABASE(), SCHEMA(), DATEDIFF(\'DAY\', toDate(\'2020-10-24\'), toDate(\'2019-10-24\')), EXP(1), FLATTEN([[1]]), FLOOR(1), FQDN(), GREATEST(1), IF(1, 1, 1), IFNULL(1, 1), LCASE(\'A\'), LEAST(1), LENGTH(\'1\'), LN(1), LOCATE(\'1\', \'1\'), LOG(1), LOG10(1), LOG2(1), LOWER(\'A\'), MAX(1), MID(\'123\', 1, 1), MIN(1), MOD(1, 1), NOT(1), NOW(), NOW64(), NULLIF(1, 1), PI(), POSITION(\'123\', \'2\'), POW(1, 1), POWER(1, 1), RAND(), REPLACE(\'1\', \'1\', \'2\'), REVERSE(\'123\'), ROUND(1), SIN(1), SQRT(1), STDDEV_POP(1), STDDEV_SAMP(1), SUBSTR(\'123\', 2), SUBSTRING(\'123\', 2), SUM(1), TAN(1), TANH(1), TRUNC(1), TRUNCATE(1), UCASE(\'A\'), UPPER(\'A\'), USER(), VAR_POP(1), VAR_SAMP(1), WEEK(toDate(\'2020-10-24\')), YEARWEEK(toDate(\'2020-10-24\')) format TSVRaw;');
SELECT normalizedQueryHash(formatQuery('select 1')) = normalizedQueryHash(formatQuery('SELECT 1'));
SELECT formatQuery(''); -- { serverError SYNTAX_ERROR }
SELECT formatQuery('SEECTwrong'); -- { serverError SYNTAX_ERROR }
SELECT 'formatQuerySingleLine';
SELECT formatQuerySingleLine('select 1;');
SELECT formatQuerySingleLine('select 1');
SELECT id, query, formatQuery(query) FROM all_valid ORDER BY id;
SELECT id, query, formatQuery(query) FROM some_invalid ORDER BY id; -- { serverError SYNTAX_ERROR }
SELECT id, query, formatQueryOrNull(query) FROM all_valid ORDER BY id;
SELECT id, query, formatQueryOrNull(query) FROM some_invalid ORDER BY id;
SELECT '-- formatQuerySingleLine';
SELECT formatQuerySingleLine('SELECT 1;');
SELECT formatQuerySingleLine('SELECT 1');
SELECT formatQuerySingleLine('select 1;') == formatQuerySingleLine('SeLecT 1');
SELECT formatQuerySingleLine('SeLeCt 1;');
SELECT formatQuerySingleLine('select 1;') == formatQuerySingleLine('SeLeCt 1');
SELECT normalizedQueryHash(formatQuerySingleLine('select 1')) = normalizedQueryHash(formatQuerySingleLine('SELECT 1'));
SELECT formatQuerySingleLine('INSERT INTO tab VALUES (\'\') (\'test\')');
SELECT formatQuerySingleLine('CREATE TABLE default.no_prop_table(`some_column` UInt64) ENGINE = MergeTree ORDER BY tuple() SETTINGS index_granularity = 8192');
SELECT formatQuerySingleLine('EXPLAIN SYNTAX SELECT CAST(1 AS INT), CEIL(1), CEILING(1), CHAR(49), CHAR_LENGTH(\'1\'), CHARACTER_LENGTH(\'1\'), COALESCE(1), CONCAT(\'1\', \'1\'), CORR(1, 1), COS(1), COUNT(1), COVAR_POP(1, 1), COVAR_SAMP(1, 1), DATABASE(), SCHEMA(), DATEDIFF(\'DAY\', toDate(\'2020-10-24\'), toDate(\'2019-10-24\')), EXP(1), FLATTEN([[1]]), FLOOR(1), FQDN(), GREATEST(1), IF(1, 1, 1), IFNULL(1, 1), LCASE(\'A\'), LEAST(1), LENGTH(\'1\'), LN(1), LOCATE(\'1\', \'1\'), LOG(1), LOG10(1), LOG2(1), LOWER(\'A\'), MAX(1), MID(\'123\', 1, 1), MIN(1), MOD(1, 1), NOT(1), NOW(), NOW64(), NULLIF(1, 1), PI(), POSITION(\'123\', \'2\'), POW(1, 1), POWER(1, 1), RAND(), REPLACE(\'1\', \'1\', \'2\'), REVERSE(\'123\'), ROUND(1), SIN(1), SQRT(1), STDDEV_POP(1), STDDEV_SAMP(1), SUBSTR(\'123\', 2), SUBSTRING(\'123\', 2), SUM(1), TAN(1), TANH(1), TRUNC(1), TRUNCATE(1), UCASE(\'A\'), UPPER(\'A\'), USER(), VAR_POP(1), VAR_SAMP(1), WEEK(toDate(\'2020-10-24\')), YEARWEEK(toDate(\'2020-10-24\')) format TSVRaw;');
SELECT normalizedQueryHash(formatQuerySingleLine('select 1')) = normalizedQueryHash(formatQuerySingleLine('SELECT 1'));
SELECT formatQuerySingleLine(''); -- { serverError SYNTAX_ERROR }
SELECT formatQuerySingleLine('SEECTwrong'); -- { serverError SYNTAX_ERROR }
SELECT id, query, formatQuerySingleLine(query) FROM all_valid ORDER BY id;
SELECT id, query, formatQuerySingleLine(query) FROM some_invalid ORDER BY id; -- { serverError SYNTAX_ERROR }
SELECT id, query, formatQuerySingleLineOrNull(query) FROM all_valid ORDER BY id;
SELECT id, query, formatQuerySingleLineOrNull(query) FROM some_invalid ORDER BY id;
DROP TABLE all_valid;
DROP TABLE some_invalid;