#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Parser for block-structured SQL test files and helpers to compare results.

A test file is split into blocks by empty lines.  Each block is either a
comment block, a control block (``halt`` / ``hash-threshold``), a
``statement`` block or a ``query`` block (request, optional ``----``
separator and expected result).  ``QueryResult`` holds an expected or actual
result and knows how to compare two of them.
"""

import logging
import os
from itertools import chain
from enum import Enum
from hashlib import md5
from functools import reduce

import sqlglot
from sqlglot.expressions import PrimaryKeyColumnConstraint, ColumnDef

from exceptions import (
    Error,
    ProgramError,
    ErrorWithParent,
    DataResultDiffer,
    QueryExecutionError,
)


logger = logging.getLogger("parser")
logger.setLevel(logging.DEBUG)

CONDITION_SKIP = "skipif"
CONDITION_ONLY = "onlyif"


# TODO replace assertions with raise exception
class TestFileFormatException(Error):
    """Raised when a test file block is used in a way its format forbids."""


class FileAndPos:
    """A (file, position) pair rendered as ``file:pos``."""

    def __init__(self, file=None, pos=None):
        self.file = file
        self.pos = pos

    def __str__(self):
        return f"{self.file}:{self.pos}"


def check_conditions(conditions, dbms_name):
    """Decide whether a block guarded by ``conditions`` applies to ``dbms_name``.

    Args:
        conditions: iterable of two-item records ``(key, value)`` where ``key``
            is ``CONDITION_SKIP`` or ``CONDITION_ONLY`` and ``value`` is a DBMS
            name.
        dbms_name: name of the DBMS the test is being run against.

    Returns:
        ``False`` if ``dbms_name`` is listed in any ``skipif`` record, or if
        there are ``onlyif`` records and none of them lists ``dbms_name``;
        ``True`` otherwise.

    Raises:
        ValueError: if a record does not consist of exactly two items.
    """
    rules = {}
    for rec in conditions:
        key, val = rec
        # Group values by key.  The membership test must be done against
        # `rules` (not `conditions`), otherwise every record resets the list
        # and only the last value for each key is remembered.
        rules.setdefault(key, []).append(val)

    if dbms_name in rules.get(CONDITION_SKIP, ()):
        return False

    if CONDITION_ONLY in rules and dbms_name not in rules[CONDITION_ONLY]:
        return False

    return True


class BlockType(Enum):
    """Kinds of blocks a test file consists of."""

    comments = 1
    control = 2
    statement = 3
    query = 4


COMMENT_TOKENS = ["#"]
RESULT_SEPARATION_LINE = "----"
CONTROL_TOKENS = ["halt", "hash-threshold"]

CONDITIONS_TOKENS = [CONDITION_SKIP, CONDITION_ONLY]
STATEMENT_TOKEN = "statement"
QUERY_TOKEN = "query"


# Maps a Python value type to the one-letter column type code.
ACCEPTABLE_TYPES = {str: "T", int: "I", float: "R"}


def _is_comment_line(tokens):
    """Return True if the first token starts with a comment character."""
    return bool(tokens) and tokens[0][0] in COMMENT_TOKENS


def _is_separation_line(tokens):
    """Return True if the line is the request/result separator (``----``)."""
    return bool(tokens) and tokens[0] == RESULT_SEPARATION_LINE


def _is_control_line(tokens):
    """Return True if the line starts with a control keyword."""
    return bool(tokens) and tokens[0] in CONTROL_TOKENS


def _is_conditional_line(tokens):
    """Return True if the line starts with ``skipif`` or ``onlyif``."""
    return bool(tokens) and tokens[0] in CONDITIONS_TOKENS


def _is_statement_line(tokens):
    """Return True if the line starts a ``statement`` record."""
    return bool(tokens) and tokens[0] == STATEMENT_TOKEN


def _is_query_line(tokens):
    """Return True if the line starts a ``query`` record."""
    return bool(tokens) and tokens[0] == QUERY_TOKEN


class FileBlockBase:
    """Base class for a block of lines ``[start, end)`` of a parsed test file."""

    def __init__(self, parser, start, end):
        self._parser = parser
        self._start = start
        self._end = end

    def get_block_type(self):
        """Return the ``BlockType`` of the block; overridden by subclasses."""

    def get_pos(self):
        """Return the 1-based number of the first line of the block."""
        return self._start + 1

    @staticmethod
    def __parse_request(test_file, start, end):
        """Collect the request text that starts at line ``start``.

        The request ends at the first line without tokens, at the separation
        line, or at ``end``, whichever comes first.

        Returns:
            tuple ``(request, request_end)``: the request tokens joined with
            single spaces and the index of the first line after the request.
        """
        request_end = start
        while request_end < end:
            tokens = test_file.get_tokens(request_end)
            if not tokens or _is_separation_line(tokens):
                break
            request_end += 1
        request = test_file.get_tokens_from_lines(start, request_end)
        logger.debug("slice request %s:%s end %s", start, request_end, end)
        return " ".join(request), request_end

    @staticmethod
    def __parse_result(test_file, start, end):
        """Collect result rows that start at line ``start``.

        The result ends at the first line without tokens or at ``end``.

        Returns:
            tuple ``(result, result_end)``: a list of token lists (one per
            line) and the index of the first line after the result.
        """
        result_end = start
        while result_end < end:
            tokens = test_file.get_tokens(result_end)
            if not tokens:
                break
            result_end += 1
        logger.debug("slice result %s:%s end %s", start, result_end, end)
        result = test_file.get_tokens(start, result_end)
        return result, result_end

    @staticmethod
    def convert_request(sql):
        """Rewrite a request written for sqlite so that ClickHouse accepts it.

        * ``CREATE TABLE`` requests are transpiled and get
          ``ENGINE = MergeTree() ORDER BY <key>`` appended, where ``<key>`` is
          the column carrying a primary key constraint or ``tuple()``.
        * Requests containing ``SELECT``, ``CAST`` and ``NULL`` get the target
          type of the first found non-nested ``CAST(NULL AS ...)`` wrapped into
          ``NULLABLE``.
        * Any other request, or one that cannot be parsed, is returned as is.
        """
        if sql.startswith("CREATE TABLE"):
            result = sqlglot.transpile(sql, read="sqlite", write="clickhouse")[0]
            pk_token = sqlglot.parse_one(result, read="clickhouse").find(
                PrimaryKeyColumnConstraint
            )
            pk_string = "tuple()"
            if pk_token is not None:
                pk_string = str(pk_token.find_ancestor(ColumnDef).args["this"])

            result += " ENGINE = MergeTree() ORDER BY " + pk_string
            return result

        if "SELECT" in sql and "CAST" in sql and "NULL" in sql:
            # convert `CAST (NULL as INTEGER)` to `CAST (NULL as Nullable(Int32))`
            try:
                ast = sqlglot.parse_one(sql, read="sqlite")
            except sqlglot.errors.ParseError as err:
                logger.info("cannot parse %s , error is %s", sql, err)
                return sql
            cast = ast.find(sqlglot.expressions.Cast)
            if (
                cast is not None
                and cast.name == "NULL"
                and ("nested" not in cast.to.args or not cast.to.args["nested"])
            ):
                cast.args["to"] = sqlglot.expressions.DataType.build(
                    "NULLABLE", expressions=[cast.to]
                )
                return ast.sql("clickhouse")

        return sql

    @staticmethod
    def parse_block(parser, start, end):
        """Parse lines ``[start, end)`` of ``parser`` into a block object.

        Comment lines are skipped, ``skipif``/``onlyif`` lines are collected
        as conditions, control lines turn the block into a control block and a
        ``statement`` / ``query`` line consumes the rest of the block as the
        request (and, for queries, the expected result after ``----``).
        Requests are passed through ``convert_request`` when
        ``parser.dbms_name`` is ``"ClickHouse"``.

        Returns:
            ``FileBlockComments``, ``FileBlockControl``, ``FileBlockStatement``
            or ``FileBlockQuery`` depending on the content.

        Raises:
            AssertionError: if the block layout is not the expected one.
        """
        file_pos = FileAndPos(parser.get_test_name(), start + 1)
        logger.debug("%s start %s end %s", file_pos, start, end)

        block_type = BlockType.comments
        conditions = []
        controls = []
        statement = None
        query = None
        request = []
        result_line = None
        result = []

        line = start
        while line < end:
            tokens = parser.get_tokens(line)

            if _is_comment_line(tokens):
                pass
            elif _is_conditional_line(tokens):
                conditions.append(tokens)
            elif _is_control_line(tokens):
                assert block_type in (BlockType.comments, BlockType.control)
                block_type = BlockType.control
                controls.append(tokens)
            elif _is_statement_line(tokens):
                assert block_type in (BlockType.comments,)
                block_type = BlockType.statement
                statement = tokens
                request, last_line = FileBlockBase.__parse_request(
                    parser, line + 1, end
                )
                if parser.dbms_name == "ClickHouse":
                    request = FileBlockBase.convert_request(request)
                # a statement has no result part, the request must fill the block
                assert last_line == end
                line = last_line
            elif _is_query_line(tokens):
                assert block_type in (BlockType.comments,)
                block_type = BlockType.query
                query = tokens
                request, last_line = FileBlockBase.__parse_request(
                    parser, line + 1, end
                )
                if parser.dbms_name == "ClickHouse":
                    request = FileBlockBase.convert_request(request)
                result_line = last_line
                line = last_line
                if line == end:
                    # the query has no recorded result yet
                    break
                tokens = parser.get_tokens(line)
                assert _is_separation_line(tokens), f"last_line {last_line}, end {end}"
                result, last_line = FileBlockBase.__parse_result(parser, line + 1, end)
                assert last_line == end
                line = last_line
            line += 1

        if block_type == BlockType.comments:
            return FileBlockComments(parser, start, end)

        if block_type == BlockType.control:
            return FileBlockControl(parser, start, end, conditions, controls)

        if block_type == BlockType.statement:
            return FileBlockStatement(
                parser, start, end, conditions, statement, request
            )

        if block_type == BlockType.query:
            block = FileBlockQuery(
                parser, start, end, conditions, query, request, result_line
            )
            block.with_result(result)
            return block

    def dump_to(self, output):
        """Write the original lines of the block plus a newline to ``output``.

        Does nothing when ``output`` is ``None``.
        """
        if output is None:
            return
        for line in range(self._start, self._end):
            output.write(self._parser.get_line(line))
        output.write("\n")


class FileBlockComments(FileBlockBase):
    """A block that contains nothing but comments."""

    def __init__(self, parser, start, end):
        super().__init__(parser, start, end)

    def get_block_type(self):
        return BlockType.comments


class FileBlockControl(FileBlockBase):
    """A block with control lines (``halt``, ``hash-threshold``)."""

    def __init__(self, parser, start, end, conditions, control):
        super().__init__(parser, start, end)
        self.conditions = conditions
        self.control = control

    def get_block_type(self):
        return BlockType.control

    def get_conditions(self):
        return self.conditions


class FileBlockStatement(FileBlockBase):
    """A ``statement`` block: header tokens, conditions and the request."""

    def __init__(self, parser, start, end, conditions, statement, request):
        super().__init__(parser, start, end)
        self.conditions = conditions
        self.statement = statement
        self.request = request

    def get_block_type(self):
        return BlockType.statement

    def get_request(self):
        return self.request

    def get_conditions(self):
        return self.conditions

    def get_statement(self):
        return self.statement

    def expected_error(self):
        """Return True if the second header token is ``error``."""
        return self.statement[1] == "error"


class FileBlockQuery(FileBlockBase):
    """A ``query`` block: header tokens, conditions, request and result."""

    def __init__(self, parser, start, end, conditions, query, request, result_line):
        super().__init__(parser, start, end)
        self.conditions = conditions
        self.query = query
        self.request = request
        self.result = None
        # index of the first line after the request (where `----` would be)
        self.result_line = result_line

    def get_block_type(self):
        return BlockType.query

    def get_request(self):
        return self.request

    def get_conditions(self):
        return self.conditions

    def get_query(self):
        return self.query

    def expected_error(self):
        """Return the lower-cased expected error text, or None.

        The text is everything after ``query error`` in the header line; None
        is returned when the second header token is not ``error``.
        """
        if self.query[1] != "error":
            return None
        return " ".join(self.query[2:]).lower()

    def get_types(self):
        """Return the second header token (the column types string).

        Raises:
            TestFileFormatException: if the query is expected to fail.
        """
        if self.query[1] == "error":
            raise TestFileFormatException(
                "the query is expected to fail, there are no types"
            )
        return self.query[1]

    def get_sort_mode(self):
        """Return the third header token (the sort mode)."""
        return self.query[2]

    def get_result(self):
        return self.result

    def with_result(self, result):
        """Attach ``result`` (rows or a ``QueryResult``) to the block."""
        self.result = result

    def dump_to(self, output):
        """Write header and request lines, then the attached result.

        The original lines up to ``result_line`` are copied verbatim; when a
        result is attached it is written after a ``----`` line, one
        space-joined row per line.  Does nothing when ``output`` is ``None``.
        """
        if output is None:
            return

        for line in range(self._start, self.result_line):
            output.write(self._parser.get_line(line))

        if self.result is not None:
            logger.debug("dump result %s", self.result)
            output.write("----\n")
            for row in self.result:
                output.write(" ".join(row) + "\n")

        output.write("\n")


class TestFileParser:
    """Loads a test file from a stream and yields its blocks."""

    CONTROL_TOKENS = ["halt", "hash-threshold"]
    CONDITIONS_TOKENS = [CONDITION_SKIP, CONDITION_ONLY]
    STATEMENT_TOKEN = "statement"
    QUERY_TOKEN = "query"
    COMMENT_TOKEN = "#"

    DEFAULT_HASH_THRESHOLD = 8

    def __init__(self, stream, test_name, test_file, dbms_name):
        self._stream = stream
        self._test_name = test_name
        self._test_file = test_file
        self.dbms_name = dbms_name

        self._lines = []
        self._raw_tokens = []
        self._tokens = []
        self._empty_lines = []

    def get_test_name(self):
        return self._test_name

    def get_test_file(self):
        """Return the test file if it is set, the test name otherwise."""
        if self._test_file is not None:
            return self._test_file
        return self._test_name

    def get_line(self, line):
        """Return the raw text of line ``line`` (0-based)."""
        return self._lines[line]

    def get_tokens(self, start, end=None):
        """Return tokens of line ``start``, or of lines ``[start, end)``.

        With ``end`` omitted a single list of tokens is returned, otherwise a
        list of token lists.
        """
        if end is None:
            return self._tokens[start]
        return self._tokens[start:end]

    def get_tokens_from_lines(self, start, end):
        """Return tokens of lines ``[start, end)`` as one flat list."""
        return list(chain.from_iterable(self._tokens[start:end]))

    def __load_file(self):
        """Read the stream and build per-line token lists.

        ``_raw_tokens`` holds whitespace-split lines, ``_tokens`` the same with
        everything from a standalone ``#`` token dropped, ``_empty_lines`` the
        indexes of lines without any raw tokens.
        """
        self._lines = self._stream.readlines()

        self._raw_tokens = [line.split() for line in self._lines]
        assert len(self._lines) == len(self._raw_tokens)

        self._tokens = []
        for line in self._raw_tokens:
            if self.COMMENT_TOKEN in line:
                comment_starts_at = line.index(self.COMMENT_TOKEN)
                self._tokens.append(line[0:comment_starts_at])
            else:
                self._tokens.append(line)

        self._empty_lines = [i for i, x in enumerate(self._raw_tokens) if len(x) == 0]

        logger.debug(
            "Test file %s loaded rows %s, empty rows %s",
            self.get_test_file(),
            len(self._lines),
            len(self._empty_lines),
        )

    def __unload_file(self):
        """Drop the stream, the names and all the loaded data."""
        self._test_file = None
        self._test_name = None
        self._stream = None
        self._lines = []
        self._raw_tokens = []
        self._tokens = []
        self._empty_lines = []

    def _iterate_blocks(self):
        """Yield parsed blocks; blocks are delimited by empty lines."""
        prev = 0
        for i in self._empty_lines:
            if prev != i:
                yield FileBlockBase.parse_block(self, prev, i)
            prev = i + 1

        if prev != len(self._lines):
            yield FileBlockBase.parse_block(self, prev, len(self._lines))

    def test_blocks(self):
        """Load the file and yield its blocks; unload when iteration ends."""
        try:
            self.__load_file()
            yield from self._iterate_blocks()
        finally:
            self.__unload_file()


class QueryResult:
    """Result of a query: rows, a hash of the rows, or an exception message."""

    def __init__(
        self,
        rows=None,
        values_count=None,
        data_hash=None,
        exception=None,
        hash_threshold=0,
    ):
        self.rows = rows
        self.values_count = values_count
        self.data_hash = data_hash
        self.exception = exception
        self.hash_threshold = hash_threshold
        self.hash_it()
        logger.debug("created QueryResult %s", str(self))

    def __str__(self):
        fields = (
            ("rows", self.rows),
            ("values_count", self.values_count),
            ("data_hash", self.data_hash),
            ("exception", self.exception),
            ("hash_threshold", self.hash_threshold),
        )
        # only fields with truthy values are shown
        params = ", ".join(f"{name}: {value}" for name, value in fields if value)
        return "QueryResult({})".format(params)

    def __iter__(self):
        """Iterate over rows in the form they are written to a test file.

        Rows are produced as is when hashing is disabled
        (``hash_threshold == 0``) or the number of values does not exceed the
        threshold.  Otherwise a single ``"<n> values hashing to <hash>"`` row
        is produced, or an ``"exception: ..."`` row for a failed query.

        Raises:
            ProgramError: if there are neither rows, nor hash, nor exception.
        """
        if self.rows is not None:
            if self.hash_threshold == 0:
                return iter(self.rows)
            if self.values_count <= self.hash_threshold:
                return iter(self.rows)
        if self.data_hash is not None:
            return iter(
                [["{} values hashing to {}".format(self.values_count, self.data_hash)]]
            )
        if self.exception is not None:
            return iter([["exception: {}".format(self.exception)]])
        raise ProgramError("Query result is empty", details="{}".format(self.__str__()))

    @staticmethod
    def __value_count(rows):
        """Return the total number of values in all the rows."""
        return reduce(lambda a, b: a + len(b), rows, 0)

    @staticmethod
    def parse_it(rows, hash_threshold):
        """Build a ``QueryResult`` from result rows read from a test file.

        A single row that starts with ``exception:`` becomes an exception
        result, a single row ``<n> values hashing to <hash>`` becomes a hashed
        result, anything else is treated as data rows.
        """
        logger.debug("parse result len: %s rows: %s", len(rows), rows)
        if len(rows) == 1:
            logger.debug("one row is %s", rows)
            row = rows[0]
            if len(row) > 0 and row[0] == "exception:":
                logger.debug("as exception")
                message = " ".join(row[1:])
                return QueryResult(exception=message)
            if len(row) == 5 and " ".join(row[1:4]) == "values hashing to":
                logger.debug("as hashed data")
                values_count = int(row[0])
                data_hash = row[4]
                return QueryResult(data_hash=data_hash, values_count=values_count)
        logger.debug("as data")
        values_count = QueryResult.__value_count(rows)
        return QueryResult(
            rows=rows, values_count=values_count, hash_threshold=hash_threshold
        )

    @staticmethod
    def __result_as_strings(rows, types):
        """Convert raw values to their textual form according to ``types``.

        ``types`` is zipped with every row, one type code per column: ``T`` for
        text (empty string becomes ``(empty)``), ``I`` for integer (values that
        ``int()`` rejects with ``ValueError`` become ``0``), ``R`` for a number
        printed with three digits after the point.  ``None`` is rendered as
        ``NULL``.  Values with any other type code are dropped.

        Raises:
            QueryExecutionError: if a value overflows on conversion to int.
        """
        res = []
        for row in rows:
            res_row = []
            for c, t in zip(row, types):
                logger.debug("Building row. c:%s t:%s", c, t)
                if c is None:
                    res_row.append("NULL")
                    continue

                if t == "T":
                    if c == "":
                        res_row.append("(empty)")
                    else:
                        res_row.append(str(c))
                elif t == "I":
                    try:
                        res_row.append(str(int(c)))
                    except ValueError:
                        # non-integer values are deliberately rendered as 0
                        # instead of failing the query
                        res_row.append(str(int(0)))
                    except OverflowError as ex:
                        raise QueryExecutionError(
                            f"Got overflowed result '{c}' for I type."
                        ) from ex

                elif t == "R":
                    res_row.append(f"{c:.3f}")

            res.append(res_row)
        return res

    @staticmethod
    def __sort_result(rows, sort_mode):
        """Order ``rows`` according to ``sort_mode``.

        ``nosort`` keeps the rows, ``rowsort`` sorts the rows, ``valuesort``
        flattens all the values into one sorted row (no rows at all if there
        are no values).  NOTE(review): any other mode yields ``None``.
        """
        if sort_mode == "nosort":
            return rows
        if sort_mode == "rowsort":
            return sorted(rows)
        if sort_mode == "valuesort":
            values = list(chain.from_iterable(rows))
            values.sort()
            return [values] if values else []
        return None

    @staticmethod
    def __calculate_hash(rows):
        """Return the md5 hex digest of all the values taken row by row."""
        md5_hash = md5()
        for row in rows:
            for value in row:
                md5_hash.update(value.encode("ascii"))
        return str(md5_hash.hexdigest())

    @staticmethod
    def make_it(rows, types, sort_mode, hash_threshold):
        """Build a ``QueryResult`` from raw rows returned by a DBMS."""
        values_count = QueryResult.__value_count(rows)
        as_string = QueryResult.__result_as_strings(rows, types)
        as_sorted = QueryResult.__sort_result(as_string, sort_mode)
        return QueryResult(
            rows=as_sorted, values_count=values_count, hash_threshold=hash_threshold
        )

    def hash_it(self):
        """Compute ``data_hash`` from the rows unless it is already known."""
        if self.rows is not None and self.data_hash is None:
            self.data_hash = QueryResult.__calculate_hash(self.rows)
        return self

    @staticmethod
    def as_exception(e):
        """Build an exception ``QueryResult`` from the exception ``e``."""
        # do not print details to the test file
        # but print original exception
        if isinstance(e, ErrorWithParent):
            message = "{}, original is: {}".format(e, e.get_parent())
        else:
            message = "{}".format(e)

        return QueryResult(exception=message)

    @staticmethod
    def assert_eq(canonic, actual):
        """Check that the ``actual`` result matches the ``canonic`` one.

        Exceptions are compared first; then hashes together with value counts
        when the canonic result has a hash; then the rows themselves.

        Raises:
            DataResultDiffer: if the results do not match.
            ProgramError: if an argument is not a ``QueryResult`` or the
                results cannot be compared.
        """
        if not isinstance(canonic, QueryResult):
            raise ProgramError("NotImplemented")

        if not isinstance(actual, QueryResult):
            raise ProgramError("NotImplemented")

        if canonic.exception is not None or actual.exception is not None:
            if canonic.exception is not None and actual.exception is not None:
                if canonic.exception != actual.exception:
                    raise DataResultDiffer(
                        "canonic and actual results have different exceptions",
                        details=f"canonic: {canonic.exception}, actual: {actual.exception}",
                    )
                # exceptions are the same
                return
            if canonic.exception is not None:
                raise DataResultDiffer(
                    "canonic result has exception and actual result doesn't",
                    details=f"canonic: {canonic.exception}",
                )
            raise DataResultDiffer(
                "actual result has exception and canonic result doesn't",
                details=f"actual: {actual.exception}",
            )

        canonic.hash_it()
        actual.hash_it()

        if canonic.data_hash is not None:
            if actual.data_hash is None:
                raise ProgramError("actual result has to have hash for data")
            if canonic.values_count != actual.values_count:
                raise DataResultDiffer(
                    "canonic and actual results have different value count",
                    details="canonic values count {}, actual {}".format(
                        canonic.values_count, actual.values_count
                    ),
                )
            if canonic.data_hash != actual.data_hash:
                raise DataResultDiffer(
                    "canonic and actual results have different hashes"
                )
            return

        if canonic.rows is not None and actual.rows is not None:
            if canonic.values_count != actual.values_count:
                raise DataResultDiffer(
                    "canonic and actual results have different value count",
                    details="canonic values count {}, actual {}".format(
                        canonic.values_count, actual.values_count
                    ),
                )
            if canonic.rows != actual.rows:
                raise DataResultDiffer(
                    "canonic and actual results have different values"
                )
            return

        raise ProgramError(
            "Unable to compare results",
            details="actual {}, canonic {}".format(actual, canonic),
        )