diff --git a/utils/grammar/ClickHouseLexer.g4 b/utils/grammar/ClickHouseLexer.g4 new file mode 100644 index 00000000000..766e30d2850 --- /dev/null +++ b/utils/grammar/ClickHouseLexer.g4 @@ -0,0 +1,232 @@ +lexer grammar ClickHouseLexer; // token vocabulary imported by ClickHouseParser through tokenVocab + +LINE_COMMENT + : '--' ~[\r\n]* -> channel(HIDDEN) // '--' up to the end of the line, routed to the hidden channel + ; + + // TOKENS, KEYWORDS (spelled with the one-letter fragments such as A : [aA], so they match in any letter case) + +K_ADD : A D D; +K_AFTER : A F T E R; +K_ALL : A L L; +K_ALIAS : A L I A S; +K_ALTER : A L T E R; +K_AND : A N D; +K_ANY : A N Y; +K_ARRAY : A R R A Y; +K_AS : A S; +K_ASCENDING : A S C E N D I N G; +K_ASC : A S C; +K_ASYNC : A S Y N C; +K_ATTACH : A T T A C H; +K_BETWEEN : B E T W E E N; +K_BY : B Y; +K_CASE : C A S E; +K_CAST : C A S T; +K_CHECK : C H E C K; +K_CLUSTER : C L U S T E R; +K_COLUMN : C O L U M N; +K_COLLATE : C O L L A T E; +K_CREATE : C R E A T E; +K_CROSS : C R O S S; +K_DAY : D A Y; +K_DESCRIBE : D E S C R I B E; +K_DESCENDING : D E S C E N D I N G; +K_DESC : D E S C; +K_DATABASE : D A T A B A S E; +K_DATABASES : D A T A B A S E S; +K_DEFAULT : D E F A U L T; +K_DETACH : D E T A C H; +K_DISTINCT : D I S T I N C T; +K_DROP : D R O P; +K_ELSE : E L S E; +K_END : E N D; +K_ENGINE : E N G I N E; +K_EXISTS : E X I S T S; +K_FETCH : F E T C H; +K_FINAL : F I N A L; +K_FIRST : F I R S T; +K_FROM : F R O M; +K_FREEZE : F R E E Z E; +K_FORMAT : F O R M A T; +K_FULL : F U L L; +K_GLOBAL : G L O B A L; +K_GROUP : G R O U P; +K_HAVING : H A V I N G; +K_HOUR : H O U R; +K_ID : I D; +K_IF : I F; +K_INNER : I N N E R; +K_INSERT : I N S E R T; +K_INTERVAL : I N T E R V A L; +K_INTO : I N T O; +K_IN : I N; +K_IS : I S; +K_JOIN : J O I N; +K_KILL: K I L L; +K_LAST : L A S T; +K_LEFT : L E F T; +K_LIKE : L I K E; +K_LIMIT : L I M I T; +K_MAIN : M A I N; // not a clickhouse reserved word +K_MATERIALIZED : M A T E R I A L I Z E D; +K_MINUTE : M I N U T E; +K_MODIFY : M O D I F Y; +K_MONTH : M O N T H; +K_NOT : N O T; +K_NULL : N U L L; +K_NULLS : N U L L S; +K_OFFSET : O F F S E T; +K_ON : O N; +K_OPTIMIZE : O P T I M I Z E; +K_ORDER : O R D E R; +K_OR : 
O R; +K_OUTFILE : O U T F I L E; +K_PARTITION : P A R T I T I O N; +K_POPULATE : P O P U L A T E; +K_PREWHERE : P R E W H E R E; +K_PROCESSLIST : P R O C E S S L I S T; +K_QUERY : Q U E R Y; +K_RENAME : R E N A M E; +K_RETURN : R E T U R N; // not a clickhouse reserved word +K_RIGHT : R I G H T; +K_SAMPLE : S A M P L E; +K_SECOND : S E C O N D; +K_SELECT : S E L E C T; +K_SET : S E T; +K_SETTINGS : S E T T I N G S; +K_SHOW : S H O W; +K_SYNC : S Y N C; +K_TABLE : T A B L E; +K_TABLES : T A B L E S; +K_TEMPORARY : T E M P O R A R Y; +K_TEST : T E S T; +K_THEN : T H E N; +K_TOTALS : T O T A L S; +K_TO : T O; +K_OUTER: O U T E R; +K_VALUES : V A L U E S; +K_VIEW : V I E W; +K_UNION : U N I O N; +K_USE : U S E; +K_USING : U S I N G; +K_WEEK : W E E K; +K_WHEN : W H E N; +K_WHERE : W H E R E; +K_WITH : W I T H; +K_YEAR : Y E A R; + +COLON : ':' ; // punctuation and operator tokens start here +COMMA : ',' ; +SEMI : ';' ; +LPAREN : '(' ; +RPAREN : ')' ; +RARROW : '->' ; // lambda arrow, consumed by the parser's ExprLambda alternative +LT : '<' ; +GT : '>' ; +QUESTION : '?' ; // together with COLON forms the parser's ExprTernary alternative +STAR : '*' ; +PLUS : '+' ; +CONCAT : '||' ; +OR : '|' ; +DOLLAR : '$' ; +DOT : '.' 
; +PERCENT : '%' ; +MINUS : '-' ; +DIVIDE : '/' ; +EQUALS : '==' ; +ASSIGN : '=' ; // also accepted as a comparison operator by the parser's ExprLogical alternative +NOT_EQUALS : '!=' ; +NOT_EQUALS2 : '<>' ; +LE : '<=' ; +GE : '>=' ; +LBRAKET : '[' ; +RBRAKET : ']' ; +LCURLY : '{' ; +RCURLY : '}' ; + + +T_ARRAY : 'Array' ; // type names are exact-case literals. NOTE(review): 'Array' is also matched by the earlier case-insensitive K_ARRAY rule; ANTLR breaks equal-length ties by rule order, so T_ARRAY looks unreachable - confirm +T_TUPLE : 'Tuple' ; +T_NULLABLE : 'Nullable' ; +T_FLOAT32 : 'Float32' ; +T_FLOAT64 : 'Float64' ; +T_UINT8 : 'UInt8' ; +T_UINT16 : 'UInt16' ; +T_UINT32 : 'UInt32' ; +T_UINT64 : 'UInt64' ; +T_INT8 : 'Int8' ; +T_INT16 : 'Int16' ; +T_INT32 : 'Int32' ; +T_INT64 : 'Int64' ; +T_ENUM8 : 'Enum8' ; +T_ENUM16 : 'Enum16' ; +T_UUID : 'UUID' ; +T_DATE : 'Date' ; +T_DATETIME : 'DateTime' ; +T_STRING : 'String' ; +T_FIXEDSTRING : 'FixedString' ; +T_NULL : 'Null' ; // NOTE(review): same tie as T_ARRAY - the earlier K_NULL rule also matches 'Null'; confirm this token can be produced +T_INTERVAL_YEAR : 'IntervalYear' ; +T_INTERVAL_MONTH : 'IntervalMonth' ; +T_INTERVAL_WEEK : 'IntervalWeek' ; +T_INTERVAL_DAY : 'IntervalDay' ; +T_INTERVAL_HOUR : 'IntervalHour' ; +T_INTERVAL_MINUTE : 'IntervalMinute' ; +T_INTERVAL_SECOND : 'IntervalSecond' ; +T_AGGREGATE_FUNCTION : 'AggregateFunction' ; +// lambda type has unknown name. + +IDENTIFIER + : [a-zA-Z_] [a-zA-Z_0-9]* // ASCII letter or underscore, then letters, digits or underscores + ; + +NUMERIC_LITERAL + : DIGIT+ ( '.' DIGIT* )? ( E [-+]? DIGIT+ )? // digits, optional fraction, optional exponent + | '.' DIGIT+ ( E [-+]? DIGIT+ )? // fraction without a leading integer part + ; + +STRING_LITERAL + : '\'' ( ~'\'' | '\\\'' )* '\'' // single-quoted; a backslash-escaped quote is allowed inside + ; + +QUOTED_LITERAL + : '`' ( ~'`' )* '`' // backtick-quoted text, accepted by the parser's identifier rule + ; + +SPACES + : [ \u000B\t\r\n] -> channel(HIDDEN) // whitespace goes to the hidden channel + ; + +UNEXPECTED_CHAR + : . 
+ ; + +fragment DIGIT : [0-9]; + +fragment A : [aA]; +fragment B : [bB]; +fragment C : [cC]; +fragment D : [dD]; +fragment E : [eE]; +fragment F : [fF]; +fragment G : [gG]; +fragment H : [hH]; +fragment I : [iI]; +fragment J : [jJ]; +fragment K : [kK]; +fragment L : [lL]; +fragment M : [mM]; +fragment N : [nN]; +fragment O : [oO]; +fragment P : [pP]; +fragment Q : [qQ]; +fragment R : [rR]; +fragment S : [sS]; +fragment T : [tT]; +fragment U : [uU]; +fragment V : [vV]; +fragment W : [wW]; +fragment X : [xX]; +fragment Y : [yY]; +fragment Z : [zZ]; diff --git a/utils/grammar/ClickHouseParser.g4 b/utils/grammar/ClickHouseParser.g4 new file mode 100644 index 00000000000..fa00d29d704 --- /dev/null +++ b/utils/grammar/ClickHouseParser.g4 @@ -0,0 +1,584 @@ +parser grammar ClickHouseParser; + +options { + tokenVocab=ClickHouseLexer; +} + +// this grammar was written from the parser sources; rule names roughly correspond to the parsers in the cpp code. +// known discrepancies +// 1. parentheses do not have to follow the function name immediately. +// 2. multi-word tokens are split into separate words +// 3. for the INSERT query the value-parsing part is not written. +// 4. the rule for expr was rewritten to reduce the AST depth and reach the expr level directly - al + +parse + : ( query | error ) EOF // entry rule: exactly one query (or one unexpected character) followed by end of input + ; + +query + : show_tables_query + | select_query + | insert_query + | create_query + | rename_query + | drop_query + | alter_query + | use_query + | set_query + | optimize_query + | table_properties_query + | show_processlist_query + | check_query + | kill_query_query + ; + +// 1. QUERIES + +select_query + : select_query_main ( K_UNION K_ALL select_query_main ) * // one or more SELECT bodies joined by UNION ALL + query_outfile_step? + select_format_step? + ; + +select_query_main + : select_with_step? // WITH clause is optional; when it was mandatory a plain SELECT could not match this rule + select_select_step select_from_step? + K_FINAL? select_sample_step? + select_array_join_step? select_join_step? + select_prewhere_step? select_where_step? + select_groupby_step? select_having_step? + select_orderby_step? + select_limitby_step? 
select_limit_step? + select_settings_step? + ; + +select_with_step + : K_WITH select_expr_list + ; + +select_select_step + : K_SELECT K_DISTINCT? select_expr_list + ; + +select_from_step + : K_FROM ( full_table_name + | table_function + | subquery + ) select_alias? + ; + +select_array_join_step + : K_LEFT? K_ARRAY K_JOIN not_empty_expression_list + ; + +select_sample_step + : K_SAMPLE sample_ratio (K_OFFSET sample_ratio ) ? + ; + +sample_ratio + : NUMERIC_LITERAL ( DIVIDE NUMERIC_LITERAL ) ? // a number, or a fraction written as number / number + ; + +select_join_step + : K_GLOBAL? + ( K_ANY | K_ALL ) ( K_INNER | K_LEFT K_OUTER? | K_RIGHT K_OUTER? | K_FULL K_OUTER? ) K_JOIN select_join_right_part + ( K_USING LPAREN not_empty_expression_list RPAREN + | K_USING not_empty_expression_list + // | K_ON expr - not actually available. + ) + | K_GLOBAL? K_CROSS K_JOIN select_join_right_part + ; + +select_join_right_part + : identifier + | subquery + ; + +select_prewhere_step + : K_PREWHERE expression_with_optional_alias + ; + +select_where_step + : K_WHERE expression_with_optional_alias + ; + +select_groupby_step + : K_GROUP K_BY not_empty_expression_list ( K_WITH K_TOTALS ) ? + ; + +select_having_step + : K_HAVING expression_with_optional_alias + ; + +select_orderby_step + : K_ORDER K_BY order_by_expression_list + ; + +select_limit_step + : K_LIMIT NUMERIC_LITERAL ( COMMA NUMERIC_LITERAL )? + ; + +select_limitby_step + : K_LIMIT NUMERIC_LITERAL K_BY not_empty_expression_list // LIMIT n BY ...; select_query_main places it before the plain LIMIT step + ; + +select_settings_step + : K_SETTINGS assignment_list + ; + +select_format_step + : K_FORMAT identifier + ; + +insert_query + : K_INSERT K_INTO full_table_name + ( K_ID ASSIGN STRING_LITERAL )? // wtf? + ( LPAREN column_name_list RPAREN )? + ( K_VALUES LPAREN literal (COMMA literal )* RPAREN(COMMA LPAREN literal (COMMA literal )* RPAREN)* // ClickHouse does not parse any further here, but I wrote out the parenthesised tuples + | K_FORMAT format_name // ClickHouse does not parse any further here; it only consumes all spaces or a single line break. 
pushMode() + | select_query ) + ; + +create_query + : ( K_CREATE | K_ATTACH ) K_TEMPORARY? // CREATE and ATTACH share one rule + ( K_DATABASE ( K_IF K_NOT K_EXISTS ) ? database_name + | K_TABLE ( K_IF K_NOT K_EXISTS ) ? full_table_name ( K_ON K_CLUSTER cluster_name ) ? + ( LPAREN column_declaration_list RPAREN engine ( K_AS select_query ) ? // for a VIEW there are both columns and a select. + | engine K_AS ( select_query + | full_table_name engine? // wtf + ) + ) + | K_MATERIALIZED? K_VIEW ( K_IF K_NOT K_EXISTS ) ? full_table_name + ( LPAREN column_declaration_list RPAREN ) ? engine? K_POPULATE? K_AS select_query + ) + ; + +rename_query + : K_RENAME K_TABLE full_table_name K_TO full_table_name ( COMMA full_table_name K_TO full_table_name )* ( K_ON K_CLUSTER cluster_name ) ? + ; + +drop_query + : ( K_DROP | K_DETACH ) // DROP and DETACH share one rule + ( K_DATABASE ( K_IF K_EXISTS ) ? database_name ( K_ON K_CLUSTER cluster_name ) ? + | K_TABLE ( K_IF K_EXISTS ) ? full_table_name ( K_ON K_CLUSTER cluster_name ) ? + ) + ; + +alter_query + : K_ALTER K_TABLE full_table_name ( K_ON K_CLUSTER cluster_name ) ? + alter_query_element ( COMMA alter_query_element ) * // one or more comma-separated alterations + ; + +alter_query_element + : K_ADD K_COLUMN compound_name_type_pair ( K_AFTER column_name ) ? 
+ | K_DROP K_COLUMN column_name + | K_MODIFY K_COLUMN compound_name_type_pair + | K_ATTACH K_PARTITION partition_name + | K_DETACH K_PARTITION partition_name + | K_DROP K_PARTITION partition_name + | K_FETCH K_PARTITION partition_name K_FROM STRING_LITERAL + | K_FREEZE K_PARTITION partition_name + ; + +clickhouse_type // a column type: a simple type or a parameterised wrapper around other types + : simple_type + | T_AGGREGATE_FUNCTION LPAREN function_name ( COMMA clickhouse_type ) * RPAREN + | T_ARRAY LPAREN clickhouse_type RPAREN + | T_TUPLE LPAREN clickhouse_type ( COMMA clickhouse_type ) * RPAREN + | T_NULLABLE LPAREN clickhouse_type RPAREN // must end with RPAREN like the Array and Tuple alternatives + ; + +simple_type + : T_UINT8 + | T_UINT16 + | T_UINT32 + | T_UINT64 + | T_INT8 + | T_INT16 + | T_INT32 + | T_INT64 + | T_FLOAT32 + | T_FLOAT64 + | T_ENUM8 LPAREN enum_entry ( COMMA enum_entry ) * RPAREN // comma-separated 'name' = number entries in parentheses + | T_ENUM16 LPAREN enum_entry ( COMMA enum_entry ) * RPAREN + | T_UUID + | T_DATE + | T_DATETIME + | T_STRING + | T_INTERVAL_YEAR + | T_INTERVAL_MONTH + | T_INTERVAL_WEEK + | T_INTERVAL_DAY + | T_INTERVAL_HOUR + | T_INTERVAL_MINUTE + | T_INTERVAL_SECOND + | T_NULL + | T_FIXEDSTRING LPAREN NUMERIC_LITERAL RPAREN // a single numeric parameter in parentheses + ; + +enum_entry + : STRING_LITERAL ASSIGN NUMERIC_LITERAL + ; + +use_query + : K_USE database_name + ; + +set_query + : K_SET K_GLOBAL? assignment_list + ; + +assignment_list + : assignment ( COMMA assignment ) * + ; + +assignment + : identifier ASSIGN literal + ; + +kill_query_query + : K_KILL K_QUERY K_WHERE expression_with_optional_alias ( K_SYNC | K_ASYNC | K_TEST ) + ; + +optimize_query + : K_OPTIMIZE K_TABLE full_table_name ( K_PARTITION STRING_LITERAL ) ? K_FINAL? + ; + +table_properties_query + : ( K_EXISTS | ( K_DESCRIBE | K_DESC ) | K_SHOW K_CREATE ) K_TABLE full_table_name query_outfile_step? ( K_FORMAT format_name ) ? + ; + +show_tables_query + : K_SHOW ( K_DATABASES + | K_TABLES ( K_FROM database_name ) ? ( K_NOT? K_LIKE STRING_LITERAL ) ? ) + query_outfile_step? + ( K_FORMAT format_name ) ? + ; + +show_processlist_query + : K_SHOW K_PROCESSLIST query_outfile_step? 
( K_FORMAT format_name ) ? + ; + +check_query + : K_CHECK K_TABLE full_table_name + ; + +// 2. QUERY ELEMENTS + +full_table_name + : ( database_name DOT ) ? table_name // optional database qualifier, then the table name + ; + +partition_name + : identifier | STRING_LITERAL + ; + +cluster_name + : identifier | STRING_LITERAL + ; + +database_name + : identifier + ; + +table_name + : identifier + ; + +format_name + : identifier + ; + +query_outfile_step + : K_INTO K_OUTFILE STRING_LITERAL + ; + +engine + : K_ENGINE ASSIGN identifier_with_optional_parameters + ; + +identifier_with_optional_parameters + : identifier_with_parameters + | identifier + ; + +identifier_with_parameters + : function + | nested_table + ; + +order_by_expression_list + : order_by_element ( COMMA order_by_element ) * + ; + +order_by_element + : expression_with_optional_alias ( K_DESC | K_DESCENDING | K_ASC | K_ASCENDING ) ? ( K_NULLS ( K_FIRST | K_LAST ) ) ? ( K_COLLATE STRING_LITERAL ) ? + ; + +nested_table + : identifier LPAREN name_type_pair_list RPAREN + ; + +name_type_pair_list + : name_type_pair ( COMMA name_type_pair ) * + ; + +name_type_pair + : identifier column_type + ; + +compound_name_type_pair + : compound_identifier column_type + ; + +column_declaration_list + : column_declaration ( COMMA column_declaration ) * + ; + +column_declaration + : column_name + ( ( K_DEFAULT | K_MATERIALIZED | K_ALIAS ) expr + | column_type // alternative to the expression form above: this rule accepts one or the other, not both + ) + ; + +column_name + : identifier + ; + +column_type + : clickhouse_type + ; + +column_name_list + : column_name ( COMMA column_name ) * + ; + +select_expr_list + : select_expr ( COMMA select_expr) * + ; + +select_expr + : expr select_alias? + ; + +select_alias + : K_AS? alias_name // the AS keyword is optional here + ; + +alias + : K_AS alias_name // the AS keyword is mandatory here + ; + +alias_name + : identifier + ; + +table_function + : function + ; + + +subquery + : LPAREN select_query_main RPAREN // wraps select_query_main, so the UNION ALL / INTO OUTFILE / FORMAT parts of select_query are not accepted inside + ; + +expression_with_optional_alias + : expr alias? + ; + +// EXPRESSIONS + +expr + : LPAREN expr RPAREN # ExprParen + | function # ExprFunction + | K_CASE expr? 
( K_WHEN expr K_THEN expr ) ( K_WHEN expr K_THEN expr ) * K_ELSE expr K_END # ExprCase + | expr DOT expr # ExprTupleElement + | expr LBRAKET expr RBRAKET # ExprArrayElement + | MINUS expr # ExprUnaryMinus + | K_CAST LPAREN expr K_AS clickhouse_type RPAREN # ExprCast + | expr ( STAR | DIVIDE | PERCENT ) expr # ExprMul + | expr ( PLUS | MINUS ) expr # ExprAdd + | expr CONCAT expr # ExprConcat + | expr K_BETWEEN expr K_AND expr # ExprBetween + | expr ( EQUALS | ASSIGN | NOT_EQUALS | NOT_EQUALS2 | LE | GE | LT | GT | K_LIKE | K_NOT K_LIKE ) expr # ExprLogical + | expr ( K_IN | K_NOT K_IN | K_GLOBAL K_IN | K_GLOBAL K_NOT K_IN ) expr # ExprIn + | expr ( K_IS K_NULL | K_IS K_NOT K_NULL ) # ExprIsNull + | K_INTERVAL expr interval_unit # ExprInterval + | K_NOT expr # ExprNot + | expr K_AND expr # ExprAnd + | expr K_OR expr # ExprOr + | expr QUESTION expr COLON expr # ExprTernary + | ( LPAREN identifier ( COMMA identifier )* RPAREN | identifier ( COMMA identifier )* ) RARROW expr # ExprLambda + | subquery # ExprSubquery + | LPAREN not_empty_expression_list RPAREN # ExprList + | array # ExprArray + | literal # ExprLiteral + | compound_identifier # ExprId + | STAR # ExprStar + | expr alias # ExprWithAlias + ; + +interval_unit + : K_YEAR + | K_MONTH + | K_WEEK + | K_DAY + | K_HOUR + | K_MINUTE + | K_SECOND + ; +expression_list + : ( not_empty_expression_list )? // may be empty, unlike not_empty_expression_list + ; + +not_empty_expression_list + : expr ( COMMA expr )* + ; + +array + : LBRAKET expression_list RBRAKET + ; + +function + : function_name function_parameters? function_arguments // when two parenthesised lists follow the name, the first is function_parameters and the second function_arguments + ; + +function_parameters + : LPAREN ( expr ( COMMA expr )* )? RPAREN + ; +function_arguments + : LPAREN ( expr ( COMMA expr )* )? RPAREN + ; + +function_name + : identifier + ; + +identifier + : QUOTED_LITERAL + | IDENTIFIER + // here we allow keywords to act as column or function names. 
+ | simple_type + | keyword + ; + +keyword // keyword tokens that the identifier rule also accepts as names + : K_ADD + | K_AFTER + | K_ALL + | K_ALIAS + | K_ALTER + | K_AND + | K_ANY + | K_ARRAY + | K_AS + | K_ASCENDING + | K_ASC + | K_ASYNC + | K_ATTACH + | K_BETWEEN + | K_BY + | K_CASE + | K_CHECK + | K_COLUMN + | K_COLLATE + | K_CREATE + | K_CROSS + | K_DESCRIBE + | K_DESCENDING + | K_DESC + | K_DATABASE + | K_DATABASES + | K_DEFAULT + | K_DETACH + | K_DISTINCT + | K_DROP + | K_ENGINE + | K_ELSE + | K_END + | K_EXISTS + | K_FINAL + | K_FIRST + | K_FROM + | K_FORMAT + | K_FULL + | K_GLOBAL + | K_GROUP + | K_HAVING + | K_ID + | K_IF + | K_INNER + | K_INSERT + | K_INTO + | K_IN + | K_IS + | K_JOIN + | K_KILL + | K_LAST + | K_LEFT + | K_LIKE + | K_LIMIT + | K_MAIN + | K_MATERIALIZED + | K_MODIFY + | K_NOT + | K_NULL + | K_NULLS + | K_OFFSET + | K_ON + | K_OPTIMIZE + | K_ORDER + | K_OR + | K_OUTFILE + | K_PARTITION + | K_POPULATE + | K_PREWHERE + | K_PROCESSLIST + | K_QUERY + | K_RENAME + | K_RETURN + | K_RIGHT + | K_SAMPLE + | K_SELECT + | K_SET + | K_SETTINGS + | K_SHOW + | K_SYNC + | K_TABLE + | K_TABLES + | K_TEMPORARY + | K_TEST + | K_THEN + | K_TOTALS + | K_TO + | K_OUTER + | K_VALUES + | K_VIEW + | K_UNION + | K_USE + | K_USING + | K_WHEN + | K_WHERE + | K_WITH + ; + +compound_identifier // a name with at most one dot-separated qualifier +: identifier DOT identifier +| identifier +; + + +literal + : K_NULL + | NUMERIC_LITERAL + | STRING_LITERAL + ; + +error + : UNEXPECTED_CHAR // the embedded action below (Java-style syntax) throws as soon as this token is matched + { + throw new RuntimeException("UNEXPECTED_CHAR=" + $UNEXPECTED_CHAR.text); + } + ; + diff --git a/utils/grammar/README.md b/utils/grammar/README.md new file mode 100644 index 00000000000..b4f34054b18 --- /dev/null +++ b/utils/grammar/README.md @@ -0,0 +1,7 @@ +=== ClickHouse grammar for ANTLR4 + +Authors: Yuriy Galitskiy (orantius, https://github.com/duremar), Sergey Serebryanik (serebrserg, https://github.com/serebrserg), Efim Pyshnograev (graev). + +Initially developed for Yandex.Metrica product and published under Apache 2.0 license with permission from Yandex. 
It is also used in the DataGrip product. + +It is not used in ClickHouse directly and is not synchronized with the ClickHouse C++ code. It is neither supported nor tested. Any help is welcome.