This commit is contained in:
Nikita Mikhaylov 2024-03-27 19:26:19 +00:00
parent 4d37e37943
commit d18aba1194
4 changed files with 51 additions and 425 deletions

View File

@ -30,24 +30,6 @@ namespace ErrorCodes
/// Default constructor: passes a fixed, hard-coded list of SQL keywords to addWords().
Suggest::Suggest()
{
/// This keyword list is maintained by hand and may not be up to date with the ClickHouse parser.
/// Most entries are single words; "IGNORE NULLS" and "RESPECT NULLS" are two-word entries
/// stored as a single string each.
addWords({"CREATE", "DATABASE", "IF", "NOT", "EXISTS", "TEMPORARY", "TABLE", "ON",
"CLUSTER", "DEFAULT", "MATERIALIZED", "ALIAS", "ENGINE", "AS", "VIEW", "POPULATE",
"SETTINGS", "ATTACH", "DETACH", "DROP", "RENAME", "TO", "ALTER", "ADD",
"MODIFY", "CLEAR", "COLUMN", "AFTER", "COPY", "PROJECT", "PRIMARY", "KEY",
"CHECK", "PARTITION", "PART", "FREEZE", "FETCH", "FROM", "SHOW", "INTO",
"OUTFILE", "FORMAT", "TABLES", "DATABASES", "LIKE", "PROCESSLIST", "CASE", "WHEN",
"THEN", "ELSE", "END", "DESCRIBE", "DESC", "USE", "SET", "OPTIMIZE",
"FINAL", "DEDUPLICATE", "INSERT", "VALUES", "SELECT", "DISTINCT", "SAMPLE", "ARRAY",
"JOIN", "GLOBAL", "LOCAL", "ANY", "ALL", "INNER", "LEFT", "RIGHT",
"FULL", "OUTER", "CROSS", "USING", "PREWHERE", "WHERE", "GROUP", "BY",
"WITH", "TOTALS", "HAVING", "ORDER", "COLLATE", "LIMIT", "UNION", "AND",
"OR", "ASC", "IN", "KILL", "QUERY", "SYNC", "ASYNC", "TEST",
"BETWEEN", "TRUNCATE", "USER", "ROLE", "PROFILE", "QUOTA", "POLICY", "ROW",
"GRANT", "REVOKE", "OPTION", "ADMIN", "EXCEPT", "REPLACE", "IDENTIFIED", "HOST",
"NAME", "READONLY", "WRITABLE", "PERMISSIVE", "FOR", "RESTRICTIVE", "RANDOMIZED", "INTERVAL",
"LIMITS", "ONLY", "TRACKING", "IP", "REGEXP", "ILIKE", "CLEANUP", "APPEND",
"IGNORE NULLS", "RESPECT NULLS", "OVER", "PASTE"});
}
static String getLoadSuggestionQuery(Int32 suggestion_limit, bool basic_suggestion)
@ -82,6 +64,7 @@ static String getLoadSuggestionQuery(Int32 suggestion_limit, bool basic_suggesti
add_column("name", "data_type_families", false, {});
add_column("name", "merge_tree_settings", false, {});
add_column("name", "settings", false, {});
add_column("keyword", "keywords", false, {});
if (!basic_suggestion)
{

View File

@ -1,3 +1,5 @@
#include "Parsers/CommonParsers.h"
#include <algorithm>
#include <cassert>
#include <Parsers/obfuscateQueries.h>
@ -10,6 +12,7 @@
#include <IO/WriteBuffer.h>
#include <IO/ReadHelpers.h>
#include <IO/ReadBufferFromMemory.h>
#include <iterator>
namespace DB
@ -24,8 +27,13 @@ namespace ErrorCodes
namespace
{
const std::unordered_set<std::string_view> keywords
const std::unordered_set<std::string_view> & getObfuscateKeywords()
{
static std::unordered_set<std::string_view> instance;
auto initialize = [&]() mutable
{
instance = {
"!=",
"",
"%",
@ -51,386 +59,18 @@ const std::unordered_set<std::string_view> keywords
"]+|[",
"^[",
"||",
"]+$",
"ACCESS",
"ACTION",
"ADD",
"ADMIN",
"AFTER",
"ALGORITHM",
"ALIAS",
"ALL",
"ALLOWED_LATENESS",
"ALTER",
"AND",
"ANTI",
"ANY",
"APPLY",
"ARRAY",
"AS",
"ASC",
"ASCENDING",
"ASOF",
"ASSUME",
"AST",
"ASYNC",
"ATTACH",
"AUTO_INCREMENT",
"BACKUP",
"BASE_BACKUP",
"BEGIN",
"BETWEEN",
"BIDIRECTIONAL",
"BOTH",
"BY",
"CACHE",
"CACHES",
"CASCADE",
"CASE",
"CASEWITHEXPRESSION",
"CAST",
"CHANGE",
"CHANGEABLE_IN_READONLY",
"CHANGED",
"CHAR",
"CHARACTER",
"CHECK",
"CLEANUP",
"CLEAR",
"CLUSTER",
"CLUSTER_HOST_IDS",
"CLUSTERS",
"CN",
"CODEC",
"COLLATE",
"COLLECTION",
"COLUMN",
"COLUMNS",
"COMMENT",
"COMMIT",
"COMPRESSION",
"CONCAT",
"CONSTRAINT",
"CREATE",
"CROSS",
"CUBE",
"CURRENT",
"CURRENT_USER",
"DATABASE",
"DATABASES",
"DATE",
"DATE_ADD",
"DATEADD",
"DATE_DIFF",
"DATEDIFF",
"DATE_SUB",
"DATESUB",
"DAY",
"DD",
"DDL",
"DEDUPLICATE",
"DEFAULT",
"DELAY",
"DELETE",
"DESC",
"DESCENDING",
"DESCRIBE",
"DETACH",
"DETACHED",
"DICTIONARIES",
"DICTIONARY",
"DISK",
"DISTINCT",
"DIV",
"DOUBLE_SHA1_HASH",
"DROP",
"ELSE",
"EMPTY",
"ENABLED",
"END",
"ENFORCED",
"ENGINE",
"EPHEMERAL",
"EQUALS",
"ESTIMATE",
"EVENT",
"EVENTS",
"EXCEPT",
"EXCHANGE",
"EXISTS",
"EXPLAIN",
"EXPRESSION",
"EXTERNAL",
"EXTRACT",
"FALSE",
"FETCH",
"FILE",
"FILESYSTEM",
"FILL",
"FILTER",
"FINAL",
"FIRST",
"FOLLOWING",
"FOR",
"FOREIGN",
"FORMAT",
"FREEZE",
"FROM",
"FULL",
"FULLTEXT",
"FUNCTION",
"GLOBAL",
"GRANT",
"GRANTEES",
"GRANTS",
"GRANULARITY",
"GREATER",
"GREATEROREQUALS",
"GROUP",
"GROUPING",
"GROUPS",
"HASH",
"HAVING",
"HDFS",
"HH",
"HIERARCHICAL",
"HOST",
"HOUR",
"ID",
"IDENTIFIED",
"IF",
"ILIKE",
"IN",
"INDEX",
"INFILE",
"INHERIT",
"INJECTIVE",
"INNER",
"INSERT",
"INTERPOLATE",
"INTERSECT",
"INTERVAL",
"INTO",
"INVISIBLE",
"IP",
"IS",
"IS_OBJECT_ID",
"JOIN",
"KEY",
"KEYED",
"KILL",
"LAMBDA",
"LARGE",
"LAST",
"LAYOUT",
"LEADING",
"LEFT",
"LESS",
"LESSOREQUALS",
"LEVEL",
"LIFETIME",
"LIKE",
"LIMIT",
"LIMITS",
"LINEAR",
"LIST",
"LITERAL",
"LIVE",
"LOCAL",
"LTRIM",
"MATCH",
"MATERIALIZE",
"MATERIALIZED",
"MAX",
"MCS",
"MEMORY",
"MI",
"MICROSECOND",
"MILLISECOND",
"MIN",
"MINUS",
"MINUTE",
"MM",
"MOD",
"MODIFY",
"MONTH",
"MOVE",
"MS",
"MULTIIF",
"MUTATION",
"NAME",
"NAMED",
"NANOSECOND",
"NEXT",
"NO",
"NONE",
"NOT",
"NOTEQUALS",
"NOTIN",
"NS",
"NULL",
"NULLS",
"OBJECT",
"OFFSET",
"ON",
"ONLY",
"OPTIMIZE",
"OPTION",
"OR",
"ORDER",
"OUTER",
"OUTFILE",
"OVER",
"OVERRIDE",
"PART",
"PARTIAL",
"PARTITION",
"PARTITIONS",
"PART_MOVE_TO_SHARD",
"PERMANENTLY",
"PERMISSIVE",
"PIPELINE",
"PLAN",
"PLUS",
"POLICY",
"POPULATE",
"POSITION",
"PRECEDING",
"PRECISION",
"PREWHERE",
"PRIMARY",
"PRIVILEGES",
"PROCESSLIST",
"PROFILE",
"PROJECTION",
"QQ",
"QUARTER",
"QUERY",
"QUOTA",
"RANDOMIZED",
"RANGE",
"READONLY",
"REALM",
"RECOMPRESS",
"REFERENCES",
"REFRESH",
"REGEXP",
"REGEXPQUOTEMETA",
"REMOVE",
"RENAME",
"REPLACE",
"REPLACEREGEXPALL",
"REPLACEREGEXPONE",
"RESET",
"RESTORE",
"RESTRICT",
"RESTRICTIVE",
"RESUME",
"REVOKE",
"RIGHT",
"ROLE",
"ROLES",
"ROLLBACK",
"ROLLUP",
"ROW",
"ROWS",
"RTRIM",
"S3",
"SALT",
"SAMPLE",
"SECOND",
"SELECT",
"SEMI",
"SERVER",
"SET",
"SETS",
"SETTING",
"SETTINGS",
"SHA256_HASH",
"SHARD",
"SHOW",
"SIGNED",
"SIMPLE",
"SINGLEVALUEORNULL",
"SNAPSHOT",
"SOURCE",
"SPATIAL",
"SS",
"STDOUT",
"STEP",
"STORAGE",
"STRICT",
"STRICTLY_ASCENDING",
"SUBPARTITION",
"SUBPARTITIONS",
"SUBSTRING",
"SUSPEND",
"SYNC",
"SYNTAX",
"SYSTEM",
"TABLE",
"TABLES",
"TEMPORARY",
"TEST",
"THAN",
"THEN",
"TIES",
"TIMESTAMP",
"TIMESTAMP_ADD",
"TIMESTAMPADD",
"TIMESTAMP_DIFF",
"TIMESTAMPDIFF",
"TIMESTAMP_SUB",
"TIMESTAMPSUB",
"TO",
"TODATE",
"TODATETIME",
"TOP",
"TOTALS",
"TRACKING",
"TRAILING",
"TRANSACTION",
"TREE",
"TRIGGER",
"TRIM",
"TRIMBOTH",
"TRIMLEFT",
"TRIMRIGHT",
"TRUE",
"TRUNCATE",
"TTL",
"TUPLE",
"TYPE",
"UNBOUNDED",
"UNFREEZE",
"UNION",
"UNIQUE",
"UNSIGNED",
"UNTUPLE",
"UPDATE",
"URL",
"USE",
"USER",
"USING",
"UUID",
"VALUES",
"VARYING",
"VIEW",
"VIEWIFPERMITTED",
"VISIBLE",
"VOLUME",
"WATCH",
"WATERMARK",
"WEEK",
"WHEN",
"WHERE",
"WINDOW",
"WITH",
"WK",
"WRITABLE",
"YEAR",
"YYYY",
"ZKPATH"
"]+$"
};
auto & global_keywords = getAllKeyWords();
std::copy(global_keywords.begin(), global_keywords.end(), std::inserter(instance, instance.begin()));
return true;
};
static bool initialized = initialize();
(void) initialized;
return instance;
};
/// We want to keep some words inside quotes. For example we want to keep HOUR inside:
@ -1312,7 +952,7 @@ void obfuscateQueries(
std::string whole_token_uppercase(whole_token);
Poco::toUpperInPlace(whole_token_uppercase);
if (keywords.contains(whole_token_uppercase)
if (getObfuscateKeywords().contains(whole_token_uppercase)
|| known_identifier_func(whole_token))
{
/// Keep keywords as is.

View File

@ -9,6 +9,7 @@ test_shard_localhost: OK
default_path_test: OK
default: OK
uniqCombined64ForEach: OK
CHANGEABLE_IN_READONLY: OK
system: OK
aggregate_function_combinators: OK
primary_key_bytes_in_memory_allocated: OK

View File

@ -99,6 +99,8 @@ client_compwords_positive=(
default
# system.aggregate_function_combinators
uniqCombined64ForEach
# system.keywords
CHANGEABLE_IN_READONLY
# FIXME: one may add a separate case for suggestion_limit
# system.databases