Added ANTLR4 grammar #7595

This commit is contained in:
Alexey Milovidov 2019-11-03 14:10:03 +03:00
parent d08bfbd3e1
commit c23a1cb9a5
3 changed files with 823 additions and 0 deletions

View File

@ -0,0 +1,232 @@
lexer grammar ClickHouseLexer;
// Single-line comment: "--" followed by everything up to, but not including,
// the next CR or LF. Sent to the HIDDEN channel so the parser never sees it.
LINE_COMMENT
    : '--' ~[\r\n]* -> channel(HIDDEN)
    ;
// TOKENS, KEYWORDS
// Every keyword token is named K_<WORD> and is spelled letter by letter using
// the single-letter fragments declared at the end of this grammar
// (A : [aA], B : [bB], ...), which makes each keyword case-insensitive.
// All of these rules are declared before IDENTIFIER, so a word that matches
// both a keyword and IDENTIFIER with the same length is emitted as the keyword.
K_ADD : A D D;
K_AFTER : A F T E R;
K_ALL : A L L;
K_ALIAS : A L I A S;
K_ALTER : A L T E R;
K_AND : A N D;
K_ANY : A N Y;
K_ARRAY : A R R A Y;
K_AS : A S;
K_ASCENDING : A S C E N D I N G;
K_ASC : A S C;
K_ASYNC : A S Y N C;
K_ATTACH : A T T A C H;
K_BETWEEN : B E T W E E N;
K_BY : B Y;
K_CASE : C A S E;
K_CAST : C A S T;
K_CHECK : C H E C K;
K_CLUSTER : C L U S T E R;
K_COLUMN : C O L U M N;
K_COLLATE : C O L L A T E;
K_CREATE : C R E A T E;
K_CROSS : C R O S S;
K_DAY : D A Y;
K_DESCRIBE : D E S C R I B E;
K_DESCENDING : D E S C E N D I N G;
K_DESC : D E S C;
K_DATABASE : D A T A B A S E;
K_DATABASES : D A T A B A S E S;
K_DEFAULT : D E F A U L T;
K_DETACH : D E T A C H;
K_DISTINCT : D I S T I N C T;
K_DROP : D R O P;
K_ELSE : E L S E;
K_END : E N D;
K_ENGINE : E N G I N E;
K_EXISTS : E X I S T S;
K_FETCH : F E T C H;
K_FINAL : F I N A L;
K_FIRST : F I R S T;
K_FROM : F R O M;
K_FREEZE : F R E E Z E;
K_FORMAT : F O R M A T;
K_FULL : F U L L;
K_GLOBAL : G L O B A L;
K_GROUP : G R O U P;
K_HAVING : H A V I N G;
K_HOUR : H O U R;
K_ID : I D;
K_IF : I F;
K_INNER : I N N E R;
K_INSERT : I N S E R T;
K_INTERVAL : I N T E R V A L;
K_INTO : I N T O;
K_IN : I N;
K_IS : I S;
K_JOIN : J O I N;
K_KILL: K I L L;
K_LAST : L A S T;
K_LEFT : L E F T;
K_LIKE : L I K E;
K_LIMIT : L I M I T;
K_MAIN : M A I N; // not a ClickHouse reserved word
K_MATERIALIZED : M A T E R I A L I Z E D;
K_MINUTE : M I N U T E;
K_MODIFY : M O D I F Y;
K_MONTH : M O N T H;
K_NOT : N O T;
K_NULL : N U L L;
K_NULLS : N U L L S;
K_OFFSET : O F F S E T;
K_ON : O N;
K_OPTIMIZE : O P T I M I Z E;
K_ORDER : O R D E R;
K_OR : O R;
K_OUTFILE : O U T F I L E;
K_PARTITION : P A R T I T I O N;
K_POPULATE : P O P U L A T E;
K_PREWHERE : P R E W H E R E;
K_PROCESSLIST : P R O C E S S L I S T;
K_QUERY : Q U E R Y;
K_RENAME : R E N A M E;
K_RETURN : R E T U R N; // not a ClickHouse reserved word
K_RIGHT : R I G H T;
K_SAMPLE : S A M P L E;
K_SECOND : S E C O N D;
K_SELECT : S E L E C T;
K_SET : S E T;
K_SETTINGS : S E T T I N G S;
K_SHOW : S H O W;
K_SYNC : S Y N C;
K_TABLE : T A B L E;
K_TABLES : T A B L E S;
K_TEMPORARY : T E M P O R A R Y;
K_TEST : T E S T;
K_THEN : T H E N;
K_TOTALS : T O T A L S;
K_TO : T O;
K_OUTER: O U T E R;
K_VALUES : V A L U E S;
K_VIEW : V I E W;
K_UNION : U N I O N;
K_USE : U S E;
K_USING : U S I N G;
K_WEEK : W E E K;
K_WHEN : W H E N;
K_WHERE : W H E R E;
K_WITH : W I T H;
K_YEAR : Y E A R;
// Punctuation and operator tokens. Each one is a fixed literal.
// Several literals share a prefix ('-' / '->', '|' / '||', '=' / '==',
// '<' / '<=' / '<>', '>' / '>='); the lexer emits the longest one that matches.
COLON : ':' ;
COMMA : ',' ;
SEMI : ';' ;
LPAREN : '(' ;
RPAREN : ')' ;
RARROW : '->' ;
LT : '<' ;
GT : '>' ;
QUESTION : '?' ;
STAR : '*' ;
PLUS : '+' ;
CONCAT : '||' ;
OR : '|' ;
DOLLAR : '$' ;
DOT : '.' ;
PERCENT : '%' ;
MINUS : '-' ;
DIVIDE : '/' ;
EQUALS : '==' ;
ASSIGN : '=' ;
NOT_EQUALS : '!=' ;
NOT_EQUALS2 : '<>' ;
LE : '<=' ;
GE : '>=' ;
LBRAKET : '[' ;
RBRAKET : ']' ;
LCURLY : '{' ;
RCURLY : '}' ;
// Data type name tokens. Unlike the K_* keywords these are plain string
// literals and therefore case-sensitive. They are declared before IDENTIFIER,
// so an exact type name is emitted as its T_* token rather than IDENTIFIER.
// NOTE(review): K_ARRAY and K_NULL are case-insensitive and declared earlier in
// this grammar. ANTLR resolves equal-length matches in favour of the rule
// declared first, so the inputs "Array" and "Null" appear to always be emitted
// as K_ARRAY / K_NULL and never as T_ARRAY / T_NULL - confirm.
T_ARRAY : 'Array' ;
T_TUPLE : 'Tuple' ;
T_NULLABLE : 'Nullable' ;
T_FLOAT32 : 'Float32' ;
T_FLOAT64 : 'Float64' ;
T_UINT8 : 'UInt8' ;
T_UINT16 : 'UInt16' ;
T_UINT32 : 'UInt32' ;
T_UINT64 : 'UInt64' ;
T_INT8 : 'Int8' ;
T_INT16 : 'Int16' ;
T_INT32 : 'Int32' ;
T_INT64 : 'Int64' ;
T_ENUM8 : 'Enum8' ;
T_ENUM16 : 'Enum16' ;
T_UUID : 'UUID' ;
T_DATE : 'Date' ;
T_DATETIME : 'DateTime' ;
T_STRING : 'String' ;
T_FIXEDSTRING : 'FixedString' ;
T_NULL : 'Null' ;
T_INTERVAL_YEAR : 'IntervalYear' ;
T_INTERVAL_MONTH : 'IntervalMonth' ;
T_INTERVAL_WEEK : 'IntervalWeek' ;
T_INTERVAL_DAY : 'IntervalDay' ;
T_INTERVAL_HOUR : 'IntervalHour' ;
T_INTERVAL_MINUTE : 'IntervalMinute' ;
T_INTERVAL_SECOND : 'IntervalSecond' ;
T_AGGREGATE_FUNCTION : 'AggregateFunction' ;
// lambda type has unknown name.
// Unquoted identifier: an ASCII letter or underscore, followed by any number
// of ASCII letters, digits or underscores.
IDENTIFIER
    : [a-zA-Z_] [a-zA-Z_0-9]*
    ;
// Unsigned decimal number with optional fraction and optional exponent.
// Two shapes are accepted:
//   digits [ '.' [digits] ] [exponent]     e.g. 1   1.   1.5   1e10
//   '.' digits [exponent]                  e.g. .5   .5e-3
// The exponent marker uses the case-insensitive fragment E.
NUMERIC_LITERAL
    : DIGIT+ ( '.' DIGIT* )? ( E [-+]? DIGIT+ )?
    | '.' DIGIT+ ( E [-+]? DIGIT+ )?
    ;
// Single-quoted string literal with backslash escapes.
// The body is a sequence of either
//   - any character except a single quote or a backslash, or
//   - a backslash followed by any one character (an escape such as \' or \\).
// Excluding the backslash from the first alternative matters: if a lone
// backslash could also match as an ordinary character, the lexer could read
// the second backslash of an escaped backslash as the start of \' and run past
// the real closing quote (for example in 'a\\' followed later by another quote).
STRING_LITERAL
    : '\'' ( ~['\\] | '\\' . )* '\''
    ;
// Back-quoted name: everything between two backticks. No escape sequence is
// recognised, so the body cannot itself contain a backtick.
QUOTED_LITERAL
    : '`' ( ~'`' )* '`'
    ;
// One whitespace character (space, vertical tab, tab, CR or LF) per token,
// sent to the HIDDEN channel so the parser ignores it.
SPACES
    : [ \u000B\t\r\n] -> channel(HIDDEN)
    ;
// Catch-all: any single character that no earlier rule matched. It is declared
// after all other token rules, so it only wins when nothing else matches.
UNEXPECTED_CHAR
    : .
    ;
// Fragments: building blocks for the rules above; they never produce tokens.
// DIGIT is one ASCII decimal digit. Each single-letter fragment matches that
// letter in either case, which is what makes the K_* keywords case-insensitive.
fragment DIGIT : [0-9];
fragment A : [aA];
fragment B : [bB];
fragment C : [cC];
fragment D : [dD];
fragment E : [eE];
fragment F : [fF];
fragment G : [gG];
fragment H : [hH];
fragment I : [iI];
fragment J : [jJ];
fragment K : [kK];
fragment L : [lL];
fragment M : [mM];
fragment N : [nN];
fragment O : [oO];
fragment P : [pP];
fragment Q : [qQ];
fragment R : [rR];
fragment S : [sS];
fragment T : [tT];
fragment U : [uU];
fragment V : [vV];
fragment W : [wW];
fragment X : [xX];
fragment Y : [yY];
fragment Z : [zZ];

View File

@ -0,0 +1,584 @@
parser grammar ClickHouseParser;
options {
tokenVocab=ClickHouseLexer;
}
// This grammar was written from the sources of the parsers; rule names roughly correspond to the parsers in the C++ code.
// Known discrepancies:
// 1. Parentheses do not have to follow the function name immediately.
// 2. Multi-word tokens are split into separate words.
// 3. For the INSERT query, the part that parses the values is not written.
// 4. The rule for expr is rewritten to reduce the AST depth and go straight to the expr level (the original comment is cut off here, ending in "expr - al").
// Entry rule: a single query, or a single unexpected character handled by
// `error`, followed by end of input.
parse
    : ( query | error ) EOF
    ;
// Dispatches to one of the supported statement kinds.
query
    : show_tables_query
    | select_query
    | insert_query
    | create_query
    | rename_query
    | drop_query
    | alter_query
    | use_query
    | set_query
    | optimize_query
    | table_properties_query
    | show_processlist_query
    | check_query
    | kill_query_query
    ;
// 1. QUERIES
// One or more SELECT bodies chained with UNION ALL, optionally followed by
// INTO OUTFILE and then FORMAT.
select_query
    : select_query_main ( K_UNION K_ALL select_query_main )*
      query_outfile_step?
      select_format_step?
    ;
// A single SELECT body. Clauses must appear in exactly this order; every
// clause except the SELECT list itself is optional.
// The leading WITH clause is optional too: when it was mandatory, a plain
// "SELECT ..." without WITH could not be parsed at all.
select_query_main
    : select_with_step?
      select_select_step select_from_step?
      K_FINAL? select_sample_step?
      select_array_join_step? select_join_step?
      select_prewhere_step? select_where_step?
      select_groupby_step? select_having_step?
      select_orderby_step?
      select_limitby_step? select_limit_step?
      select_settings_step?
    ;
// WITH clause: WITH followed by a list of select expressions.
select_with_step
    : K_WITH select_expr_list
    ;

// SELECT [DISTINCT] followed by the list of select expressions.
select_select_step
    : K_SELECT K_DISTINCT? select_expr_list
    ;

// FROM clause: a table name, a table function or a parenthesised subquery,
// optionally followed by an alias (the AS keyword may be omitted).
select_from_step
    : K_FROM
      ( full_table_name
      | table_function
      | subquery
      )
      select_alias?
    ;

// [LEFT] ARRAY JOIN followed by a non-empty expression list.
select_array_join_step
    : K_LEFT? K_ARRAY K_JOIN not_empty_expression_list
    ;

// SAMPLE ratio [OFFSET ratio].
select_sample_step
    : K_SAMPLE sample_ratio ( K_OFFSET sample_ratio )?
    ;

// A number, or a fraction written as number / number.
sample_ratio
    : NUMERIC_LITERAL ( DIVIDE NUMERIC_LITERAL )?
    ;
// JOIN clause. Two shapes:
//   [GLOBAL] (ANY|ALL) (INNER | LEFT [OUTER] | RIGHT [OUTER] | FULL [OUTER]) JOIN right USING ...
//   [GLOBAL] CROSS JOIN right
// In the first shape the strictness (ANY/ALL) and the USING list are both
// required; the USING list may be parenthesised or bare.
select_join_step
    : K_GLOBAL?
      ( K_ANY | K_ALL )
      ( K_INNER | K_LEFT K_OUTER? | K_RIGHT K_OUTER? | K_FULL K_OUTER? )
      K_JOIN select_join_right_part
      ( K_USING LPAREN not_empty_expression_list RPAREN
      | K_USING not_empty_expression_list
      // | K_ON expr   -- does not actually exist.
      )
    | K_GLOBAL? K_CROSS K_JOIN select_join_right_part
    ;

// Right-hand side of a JOIN: a name or a parenthesised subquery.
select_join_right_part
    : identifier
    | subquery
    ;
// PREWHERE condition.
select_prewhere_step
    : K_PREWHERE expression_with_optional_alias
    ;

// WHERE condition.
select_where_step
    : K_WHERE expression_with_optional_alias
    ;

// GROUP BY list, optionally followed by WITH TOTALS.
select_groupby_step
    : K_GROUP K_BY not_empty_expression_list ( K_WITH K_TOTALS )?
    ;

// HAVING condition.
select_having_step
    : K_HAVING expression_with_optional_alias
    ;

// ORDER BY list.
select_orderby_step
    : K_ORDER K_BY order_by_expression_list
    ;

// LIMIT n  or  LIMIT n, m  (both operands are numeric literals).
select_limit_step
    : K_LIMIT NUMERIC_LITERAL ( COMMA NUMERIC_LITERAL )?
    ;

// LIMIT n BY expression list.
select_limitby_step
    : K_LIMIT NUMERIC_LITERAL K_BY not_empty_expression_list
    ;

// SETTINGS name = value, ...
select_settings_step
    : K_SETTINGS assignment_list
    ;

// FORMAT name.
select_format_step
    : K_FORMAT identifier
    ;
// INSERT INTO table [ID = 'string'] [(columns)] followed by one of:
//   VALUES (literal, ...) [, (literal, ...)]...
//   FORMAT name
//   a SELECT query
insert_query
    : K_INSERT K_INTO full_table_name
      ( K_ID ASSIGN STRING_LITERAL )?   // original author's note: "wtf?" (purpose of this clause is unclear)
      ( LPAREN column_name_list RPAREN )?
      ( K_VALUES
            LPAREN literal ( COMMA literal )* RPAREN
            ( COMMA LPAREN literal ( COMMA literal )* RPAREN )*
            // ClickHouse does not parse any further at this point; the parenthesised rows were written out here anyway.
      | K_FORMAT format_name
            // ClickHouse does not parse any further at this point; it only consumes all whitespace or a single line break. pushMode()
      | select_query
      )
    ;
// (CREATE | ATTACH) [TEMPORARY] followed by one of:
//   DATABASE [IF NOT EXISTS] name
//   TABLE [IF NOT EXISTS] table [ON CLUSTER cluster] with either
//       (column declarations) engine [AS select]
//       engine AS ( select | other_table [engine] )
//   [MATERIALIZED] VIEW [IF NOT EXISTS] table [(column declarations)] [engine] [POPULATE] AS select
create_query
    : ( K_CREATE | K_ATTACH ) K_TEMPORARY?
      ( K_DATABASE ( K_IF K_NOT K_EXISTS )? database_name
      | K_TABLE ( K_IF K_NOT K_EXISTS )? full_table_name ( K_ON K_CLUSTER cluster_name )?
            ( LPAREN column_declaration_list RPAREN engine ( K_AS select_query )?   // for a VIEW there are both columns and a select.
            | engine K_AS
                  ( select_query
                  | full_table_name engine?   // original author's note: "wtf"
                  )
            )
      | K_MATERIALIZED? K_VIEW ( K_IF K_NOT K_EXISTS )? full_table_name
            ( LPAREN column_declaration_list RPAREN )? engine? K_POPULATE? K_AS select_query
      )
    ;
// RENAME TABLE a TO b [, c TO d]... [ON CLUSTER cluster]
rename_query
    : K_RENAME K_TABLE
      full_table_name K_TO full_table_name
      ( COMMA full_table_name K_TO full_table_name )*
      ( K_ON K_CLUSTER cluster_name )?
    ;

// (DROP | DETACH) (DATABASE | TABLE) [IF EXISTS] name [ON CLUSTER cluster]
drop_query
    : ( K_DROP | K_DETACH )
      ( K_DATABASE ( K_IF K_EXISTS )? database_name   ( K_ON K_CLUSTER cluster_name )?
      | K_TABLE    ( K_IF K_EXISTS )? full_table_name ( K_ON K_CLUSTER cluster_name )?
      )
    ;
// ALTER TABLE table [ON CLUSTER cluster] followed by one or more
// comma-separated alter elements.
alter_query
    : K_ALTER K_TABLE full_table_name ( K_ON K_CLUSTER cluster_name )?
      alter_query_element ( COMMA alter_query_element )*
    ;

// A single ALTER action on a column or on a partition.
alter_query_element
    : K_ADD    K_COLUMN    compound_name_type_pair ( K_AFTER column_name )?
    | K_DROP   K_COLUMN    column_name
    | K_MODIFY K_COLUMN    compound_name_type_pair
    | K_ATTACH K_PARTITION partition_name
    | K_DETACH K_PARTITION partition_name
    | K_DROP   K_PARTITION partition_name
    | K_FETCH  K_PARTITION partition_name K_FROM STRING_LITERAL
    | K_FREEZE K_PARTITION partition_name
    ;
// A data type: a simple type or one of the parametric types
// AggregateFunction(name, type...), Array(type), Tuple(type, ...), Nullable(type).
//
// Fixes relative to the previous version:
//  - Nullable(...) now closes with RPAREN; it used to require a second LPAREN,
//    so no Nullable type could ever be parsed.
//  - Array(...) also accepts K_ARRAY. The lexer declares the case-insensitive
//    K_ARRAY before T_ARRAY, so the text "Array" is always tokenised as K_ARRAY
//    and the T_ARRAY alternative alone could never match. T_ARRAY is kept for
//    compatibility with any lexer that does emit it.
clickhouse_type
    : simple_type
    | T_AGGREGATE_FUNCTION LPAREN function_name ( COMMA clickhouse_type )* RPAREN
    | ( T_ARRAY | K_ARRAY ) LPAREN clickhouse_type RPAREN
    | T_TUPLE LPAREN clickhouse_type ( COMMA clickhouse_type )* RPAREN
    | T_NULLABLE LPAREN clickhouse_type RPAREN
    ;
// Non-composite data types. Enum8/Enum16 take a parenthesised list of
// 'name' = number entries; FixedString takes a parenthesised length.
//
// Fix: Enum8(...), Enum16(...) and FixedString(...) now close with RPAREN.
// They previously required a second LPAREN in the closing position, which
// made these three types impossible to parse.
//
// NOTE(review): the lexer declares the case-insensitive K_NULL before T_NULL,
// so the text "Null" appears to always be tokenised as K_NULL and the T_NULL
// alternative below looks unreachable - confirm before relying on it.
simple_type
    : T_UINT8
    | T_UINT16
    | T_UINT32
    | T_UINT64
    | T_INT8
    | T_INT16
    | T_INT32
    | T_INT64
    | T_FLOAT32
    | T_FLOAT64
    | T_ENUM8 LPAREN enum_entry ( COMMA enum_entry )* RPAREN
    | T_ENUM16 LPAREN enum_entry ( COMMA enum_entry )* RPAREN
    | T_UUID
    | T_DATE
    | T_DATETIME
    | T_STRING
    | T_INTERVAL_YEAR
    | T_INTERVAL_MONTH
    | T_INTERVAL_WEEK
    | T_INTERVAL_DAY
    | T_INTERVAL_HOUR
    | T_INTERVAL_MINUTE
    | T_INTERVAL_SECOND
    | T_NULL
    | T_FIXEDSTRING LPAREN NUMERIC_LITERAL RPAREN
    ;
// One enum member: 'name' = number.
enum_entry
    : STRING_LITERAL ASSIGN NUMERIC_LITERAL
    ;

// USE database
use_query
    : K_USE database_name
    ;

// SET [GLOBAL] name = value, ...
set_query
    : K_SET K_GLOBAL? assignment_list
    ;

// Comma-separated list of assignments (at least one).
assignment_list
    : assignment ( COMMA assignment )*
    ;

// name = literal
assignment
    : identifier ASSIGN literal
    ;
// KILL QUERY WHERE condition [SYNC | ASYNC | TEST]
// The trailing mode keyword is optional: ClickHouse accepts the statement
// without it, whereas this rule previously rejected that form.
kill_query_query
    : K_KILL K_QUERY K_WHERE expression_with_optional_alias ( K_SYNC | K_ASYNC | K_TEST )?
    ;
// OPTIMIZE TABLE table [PARTITION 'string'] [FINAL]
optimize_query
    : K_OPTIMIZE K_TABLE full_table_name
      ( K_PARTITION STRING_LITERAL )?
      K_FINAL?
    ;

// (EXISTS | DESCRIBE | DESC | SHOW CREATE) TABLE table [INTO OUTFILE ...] [FORMAT name]
table_properties_query
    : ( K_EXISTS
      | ( K_DESCRIBE | K_DESC )
      | K_SHOW K_CREATE
      )
      K_TABLE full_table_name
      query_outfile_step?
      ( K_FORMAT format_name )?
    ;

// SHOW DATABASES, or SHOW TABLES [FROM database] [[NOT] LIKE 'pattern'],
// each optionally followed by INTO OUTFILE and FORMAT.
show_tables_query
    : K_SHOW
      ( K_DATABASES
      | K_TABLES ( K_FROM database_name )? ( K_NOT? K_LIKE STRING_LITERAL )?
      )
      query_outfile_step?
      ( K_FORMAT format_name )?
    ;

// SHOW PROCESSLIST [INTO OUTFILE ...] [FORMAT name]
show_processlist_query
    : K_SHOW K_PROCESSLIST
      query_outfile_step?
      ( K_FORMAT format_name )?
    ;

// CHECK TABLE table
check_query
    : K_CHECK K_TABLE full_table_name
    ;
// 2. QUERY ELEMENTS
// [database.]table
full_table_name
    : ( database_name DOT )? table_name
    ;

// Partition reference: a name or a string literal.
partition_name
    : identifier
    | STRING_LITERAL
    ;

// Cluster reference: a name or a string literal.
cluster_name
    : identifier
    | STRING_LITERAL
    ;

// The next three rules are plain identifiers; they exist to give the
// identifier a role-specific node in the parse tree.
database_name
    : identifier
    ;

table_name
    : identifier
    ;

format_name
    : identifier
    ;

// INTO OUTFILE 'path'
query_outfile_step
    : K_INTO K_OUTFILE STRING_LITERAL
    ;

// ENGINE = name or ENGINE = name(...)
engine
    : K_ENGINE ASSIGN identifier_with_optional_parameters
    ;
// A name with a parenthesised parameter part, or a bare name.
// The parameterised form is tried first.
identifier_with_optional_parameters
    : identifier_with_parameters
    | identifier
    ;

// A name followed by parentheses: either a function-call shape or a nested
// table (name followed by a list of name/type pairs).
identifier_with_parameters
    : function
    | nested_table
    ;

// Comma-separated ORDER BY elements (at least one).
order_by_expression_list
    : order_by_element ( COMMA order_by_element )*
    ;

// expression [DESC | DESCENDING | ASC | ASCENDING] [NULLS (FIRST | LAST)] [COLLATE 'string']
order_by_element
    : expression_with_optional_alias
      ( K_DESC | K_DESCENDING | K_ASC | K_ASCENDING )?
      ( K_NULLS ( K_FIRST | K_LAST ) )?
      ( K_COLLATE STRING_LITERAL )?
    ;

// name ( name type, ... )
nested_table
    : identifier LPAREN name_type_pair_list RPAREN
    ;

// Comma-separated name/type pairs (at least one).
name_type_pair_list
    : name_type_pair ( COMMA name_type_pair )*
    ;

// name type
name_type_pair
    : identifier column_type
    ;

// Same as name_type_pair, but the name may be dotted (see compound_identifier).
compound_name_type_pair
    : compound_identifier column_type
    ;
// Comma-separated column declarations (at least one).
column_declaration_list
    : column_declaration ( COMMA column_declaration )*
    ;

// A column name followed by either a (DEFAULT | MATERIALIZED | ALIAS)
// expression or a type - the rule accepts one or the other, not both.
column_declaration
    : column_name
      ( ( K_DEFAULT | K_MATERIALIZED | K_ALIAS ) expr
      | column_type
      )
    ;

column_name
    : identifier
    ;

column_type
    : clickhouse_type
    ;

// Comma-separated column names (at least one).
column_name_list
    : column_name ( COMMA column_name )*
    ;

// Comma-separated select expressions (at least one).
select_expr_list
    : select_expr ( COMMA select_expr )*
    ;

// An expression with an optional alias; the AS keyword may be omitted here.
select_expr
    : expr select_alias?
    ;

// [AS] name
select_alias
    : K_AS? alias_name
    ;

// AS name (the AS keyword is required).
alias
    : K_AS alias_name
    ;

alias_name
    : identifier
    ;

// A table function has the same shape as an ordinary function call.
table_function
    : function
    ;

// ( select body ) - note that this takes select_query_main, not select_query.
subquery
    : LPAREN select_query_main RPAREN
    ;

// An expression with an optional alias; the AS keyword is required here.
expression_with_optional_alias
    : expr alias?
    ;
// EXPRESSIONS
// Single left-recursive expression rule covering all operators and primaries.
// ANTLR4 derives operator precedence for left-recursive rules from the order
// of the alternatives (earlier alternative = binds tighter), so the order
// below is significant and must not be changed casually.
// Every alternative carries a "# Label"; ANTLR generates a separate parse-tree
// context class and listener/visitor method per label, so the labels are part
// of the interface seen by code that consumes the tree.
// Notes on individual alternatives, as written:
//  - ExprCase requires at least one WHEN ... THEN and a mandatory ELSE.
//  - ExprLogical treats both '==' (EQUALS) and '=' (ASSIGN) as comparison.
//  - ExprLambda accepts both "(a, b) -> e" and the bare "a, b -> e" form.
//  - ExprWithAlias (expr AS name) is the last, i.e. lowest-priority, alternative.
expr
: LPAREN expr RPAREN # ExprParen
| function # ExprFunction
| K_CASE expr? ( K_WHEN expr K_THEN expr ) ( K_WHEN expr K_THEN expr ) * K_ELSE expr K_END # ExprCase
| expr DOT expr # ExprTupleElement
| expr LBRAKET expr RBRAKET # ExprArrayElement
| MINUS expr # ExprUnaryMinus
| K_CAST LPAREN expr K_AS clickhouse_type RPAREN # ExprCast
| expr ( STAR | DIVIDE | PERCENT ) expr # ExprMul
| expr ( PLUS | MINUS ) expr # ExprAdd
| expr CONCAT expr # ExprConcat
| expr K_BETWEEN expr K_AND expr # ExprBetween
| expr ( EQUALS | ASSIGN | NOT_EQUALS | NOT_EQUALS2 | LE | GE | LT | GT | K_LIKE | K_NOT K_LIKE ) expr # ExprLogical
| expr ( K_IN | K_NOT K_IN | K_GLOBAL K_IN | K_GLOBAL K_NOT K_IN ) expr # ExprIn
| expr ( K_IS K_NULL | K_IS K_NOT K_NULL ) # ExprIsNull
| K_INTERVAL expr interval_unit # ExprInterval
| K_NOT expr # ExprNot
| expr K_AND expr # ExprAnd
| expr K_OR expr # ExprOr
| expr QUESTION expr COLON expr # ExprTernary
| ( LPAREN identifier ( COMMA identifier )* RPAREN | identifier ( COMMA identifier )* ) RARROW expr # ExprLambda
| subquery # ExprSubquery
| LPAREN not_empty_expression_list RPAREN # ExprList
| array # ExprArray
| literal # ExprLiteral
| compound_identifier # ExprId
| STAR # ExprStar
| expr alias # ExprWithAlias
;
// Unit keyword used after INTERVAL <expr>.
interval_unit
    : K_YEAR
    | K_MONTH
    | K_WEEK
    | K_DAY
    | K_HOUR
    | K_MINUTE
    | K_SECOND
    ;

// Possibly empty comma-separated expression list.
expression_list
    : ( not_empty_expression_list )?
    ;

// Comma-separated expression list with at least one element.
not_empty_expression_list
    : expr ( COMMA expr )*
    ;

// Array literal: [ e1, e2, ... ] (may be empty).
array
    : LBRAKET expression_list RBRAKET
    ;

// name [ (parameters) ] (arguments)
// When only one parenthesised group is present it is the argument list.
function
    : function_name function_parameters? function_arguments
    ;

// Parenthesised, possibly empty, comma-separated expressions.
function_parameters
    : LPAREN ( expr ( COMMA expr )* )? RPAREN
    ;

// Parenthesised, possibly empty, comma-separated expressions.
function_arguments
    : LPAREN ( expr ( COMMA expr )* )? RPAREN
    ;

function_name
    : identifier
    ;
// Anything usable as a name: a back-quoted name, a plain identifier, a simple
// type name, or one of the words listed in `keyword`.
identifier
    : QUOTED_LITERAL
    | IDENTIFIER
    // Here we allow keywords (and type names) to act as column or function names.
    | simple_type
    | keyword
    ;
// Keyword tokens that may also be used as identifiers (see `identifier`).
// Alternatives are kept in their original order, several per line.
// NOTE(review): these lexer keywords are NOT listed here and so cannot be used
// as identifiers: K_CAST, K_CLUSTER, K_DAY, K_FETCH, K_FREEZE, K_HOUR,
// K_INTERVAL, K_MINUTE, K_MONTH, K_SECOND, K_WEEK, K_YEAR. Presumably at least
// some are omitted on purpose (for example, K_CAST as an identifier would let
// "CAST(x AS T)" match the function-call alternative of expr) - confirm before
// adding any of them.
keyword
    : K_ADD | K_AFTER | K_ALL | K_ALIAS | K_ALTER | K_AND | K_ANY | K_ARRAY
    | K_AS | K_ASCENDING | K_ASC | K_ASYNC | K_ATTACH
    | K_BETWEEN | K_BY
    | K_CASE | K_CHECK | K_COLUMN | K_COLLATE | K_CREATE | K_CROSS
    | K_DESCRIBE | K_DESCENDING | K_DESC | K_DATABASE | K_DATABASES | K_DEFAULT | K_DETACH | K_DISTINCT | K_DROP
    | K_ENGINE | K_ELSE | K_END | K_EXISTS
    | K_FINAL | K_FIRST | K_FROM | K_FORMAT | K_FULL
    | K_GLOBAL | K_GROUP
    | K_HAVING
    | K_ID | K_IF | K_INNER | K_INSERT | K_INTO | K_IN | K_IS
    | K_JOIN
    | K_KILL
    | K_LAST | K_LEFT | K_LIKE | K_LIMIT
    | K_MAIN | K_MATERIALIZED | K_MODIFY
    | K_NOT | K_NULL | K_NULLS
    | K_OFFSET | K_ON | K_OPTIMIZE | K_ORDER | K_OR | K_OUTFILE
    | K_PARTITION | K_POPULATE | K_PREWHERE | K_PROCESSLIST
    | K_QUERY
    | K_RENAME | K_RETURN | K_RIGHT
    | K_SAMPLE | K_SELECT | K_SET | K_SETTINGS | K_SHOW | K_SYNC
    | K_TABLE | K_TABLES | K_TEMPORARY | K_TEST | K_THEN | K_TOTALS | K_TO
    | K_OUTER
    | K_VALUES | K_VIEW
    | K_UNION | K_USE | K_USING
    | K_WHEN | K_WHERE | K_WITH
    ;
// A name with at most one dot: "a.b" or "a". The dotted form is tried first.
compound_identifier
    : identifier DOT identifier
    | identifier
    ;

// Literal value: NULL, a number, or a single-quoted string.
literal
    : K_NULL
    | NUMERIC_LITERAL
    | STRING_LITERAL
    ;
// Matches a single character the lexer could not classify and aborts parsing
// by throwing from an embedded action. The action is written in Java syntax,
// so this rule ties the parser grammar to the Java target.
error
    : UNEXPECTED_CHAR
      {
          throw new RuntimeException("UNEXPECTED_CHAR=" + $UNEXPECTED_CHAR.text);
      }
    ;

7
utils/grammar/README.md Normal file
View File

@ -0,0 +1,7 @@
## ClickHouse grammar for ANTLR4
Authors: Yuriy Galitskiy (orantius, https://github.com/duremar), Sergey Serebryanik (serebrserg, https://github.com/serebrserg), Efim Pyshnograev (graev).
Initially developed for the Yandex.Metrica product and published under the Apache 2.0 license with permission from Yandex. It is also used in the DataGrip product.
It is not used in ClickHouse directly and is not synchronized with the ClickHouse C++ code. It is neither supported nor tested. Any help is welcome.