import os
import json
import subprocess
import time

from itertools import chain
from os import system

wait_for_loading_sleep_time_sec = 3
continue_on_error = False

clickhouse_binary = 'clickhouse-client'
prefix = base_dir = os.path.dirname(os.path.realpath(__file__))
generated_prefix = prefix + '/generated/'

# the script is expected to run from its own directory; make sure the
# output directory exists before any .tsv or .xml file is written into it
if not os.path.exists(generated_prefix):
    os.makedirs(generated_prefix)

# [ name, key_type, has_parent ]
dictionaries = [
    # Simple key dictionaries
    [ 'file_flat', 0, True ],
    [ 'clickhouse_flat', 0, True ],
    [ 'mysql_flat', 0, True ],
    [ 'mongodb_flat', 0, True ],
    [ 'file_hashed', 0, True ],
    [ 'clickhouse_hashed', 0, True ],
    [ 'mysql_hashed', 0, True ],
    [ 'mongodb_hashed', 0, True ],
    [ 'clickhouse_cache', 0, True ],
    [ 'mysql_cache', 0, True ],
    [ 'mongodb_cache', 0, True ],

    # Complex key dictionaries with (UInt8, UInt8) key
    [ 'file_complex_integers_key_hashed', 1, False ],
    [ 'clickhouse_complex_integers_key_hashed', 1, False ],
    [ 'mysql_complex_integers_key_hashed', 1, False ],
    [ 'mongodb_complex_integers_key_hashed', 1, False ],
    [ 'clickhouse_complex_integers_key_cache', 1, False ],
    [ 'mysql_complex_integers_key_cache', 1, False ],
    [ 'mongodb_complex_integers_key_cache', 1, False ],

    # Complex key dictionaries with (String, UInt8) key
    [ 'file_complex_mixed_key_hashed', 2, False ],
    [ 'clickhouse_complex_mixed_key_hashed', 2, False ],
    [ 'mysql_complex_mixed_key_hashed', 2, False ],
    [ 'mongodb_complex_mixed_key_hashed', 2, False ],
    [ 'clickhouse_complex_mixed_key_cache', 2, False ],
    [ 'mysql_complex_mixed_key_cache', 2, False ],
    [ 'mongodb_complex_mixed_key_cache', 2, False ],
]

files = [ 'key_simple.tsv', 'key_complex_integers.tsv', 'key_complex_mixed.tsv' ]

types = [
    'UInt8', 'UInt16', 'UInt32', 'UInt64',
    'Int8', 'Int16', 'Int32', 'Int64',
    'Float32', 'Float64',
    'String',
    'Date', 'DateTime'
]

explicit_defaults = [
    '42', '42', '42', '42',
    '-42', '-42', '-42', '-42',
    '1.5', '1.6',
    "'explicit-default'",
    "'2015-01-01'", "'2015-01-01 00:00:00'"
]

implicit_defaults = [
    '1', '1', '1', '1',
    '-1', '-1', '-1', '-1',
    '2.71828', '2.71828',
    'implicit-default',
    '2015-11-25', '2015-11-25 00:00:00'
]
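
# The three lists above are parallel: types[i] names both the ClickHouse type
# and the attribute column '<type>_' (e.g. 'UInt8_'); explicit_defaults[i] is
# the literal passed to dictGet<type>OrDefault in the test queries, and
# implicit_defaults[i] becomes the <null_value> of the generated attribute.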


def call(args, out_filename):
    with open(out_filename, 'w') as file:
        subprocess.check_call(args, stdout=file)


def generate_data():
    def comma_separated(iterable):
        return ', '.join(iterable)

    def columns():
        return map(lambda t: t + '_', types)

    key_columns = [
        [ 'id' ],
        [ 'key0', 'key1' ],
        [ 'key0_str', 'key1' ]
    ]

    print 'Creating ClickHouse table'
    # create the ClickHouse table and fill it from source.tsv via stdin
    system('cat source.tsv | {ch} --port 9001 -m -n --query "'
              'create database if not exists test;'
              'drop table if exists test.dictionary_source;'
              'create table test.dictionary_source ('
                  'id UInt64, key0 UInt8, key0_str String, key1 UInt8,'
                  'UInt8_ UInt8, UInt16_ UInt16, UInt32_ UInt32, UInt64_ UInt64,'
                  'Int8_ Int8, Int16_ Int16, Int32_ Int32, Int64_ Int64,'
                  'Float32_ Float32, Float64_ Float64,'
                  'String_ String,'
                  'Date_ Date, DateTime_ DateTime, Parent UInt64'
              ') engine=Log; insert into test.dictionary_source format TabSeparated'
              '"'.format(ch=clickhouse_binary))

    # generate 3 files with different key types
    print 'Creating .tsv files'
    file_source_query = 'select %s from test.dictionary_source format TabSeparated;'
    for file, keys in zip(files, key_columns):
        query = file_source_query % comma_separated(
            chain(keys, columns(), [ 'Parent' ] if 1 == len(keys) else []))
        call([ clickhouse_binary, '--port', '9001', '--query', query ], 'generated/' + file)

    # create MySQL table from the same source data
    print 'Creating MySQL table'
    subprocess.check_call('echo "'
        'create database if not exists test;'
        'drop table if exists test.dictionary_source;'
        'create table test.dictionary_source ('
            'id tinyint unsigned, key0 tinyint unsigned, key0_str text, key1 tinyint unsigned, '
            'UInt8_ tinyint unsigned, UInt16_ smallint unsigned, UInt32_ int unsigned, UInt64_ bigint unsigned, '
            'Int8_ tinyint, Int16_ smallint, Int32_ int, Int64_ bigint, '
            'Float32_ float, Float64_ double, '
            'String_ text, Date_ date, DateTime_ datetime, Parent bigint unsigned'
        ');'
        'load data local infile \'{0}/source.tsv\' into table test.dictionary_source;" | mysql --local-infile=1'
        .format(prefix), shell=True)

    # create MongoDB collection from the same source data via a JSON file;
    # Date and DateTime columns are wrapped into strings of the form
    # "new Date('...')" so they can later be turned into mongo Date objects
    print 'Creating MongoDB collection'
    table_rows = json.loads(subprocess.check_output([
        clickhouse_binary,
        '--port', '9001',
        '--query',
        "select * from test.dictionary_source where not ignore("
            "concat('new Date(\\'', toString(Date_), '\\')') as Date_, "
            "concat('new Date(\\'', toString(DateTime_), '\\')') as DateTime_"
        ") format JSON"
    ]))['data']

    # ClickHouse outputs 64-bit wide integers in double-quotes, convert them
    for row in table_rows:
        for column in [u'id', u'UInt64_', u'Int64_', u'Parent']:
            row[column] = int(row[column])

    # strip the surrounding quotes so that mongo evaluates new Date(...)
    # as a Date object instead of inserting a plain string
    source_for_mongo = json.dumps(table_rows).replace(')"', ')').replace('"new', 'new')
    open('generated/full.json', 'w').write(
        'db.dictionary_source.drop(); db.dictionary_source.insert(%s);' % source_for_mongo)

    result = system('cat generated/full.json | mongo --quiet > /dev/null')
    if result != 0:
        print 'Could not create MongoDB collection'
        exit(-1)
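
# Note: generate_data() expects a tab-separated source.tsv next to this script,
# with columns in the same order as the create table statements above:
#   id  key0  key0_str  key1  UInt8_ ... DateTime_  Parent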

def generate_dictionaries():
    dictionary_skeleton = '''
    <dictionaries>
    <dictionary>
        <name>{name}</name>

        <source>
        {source}
        </source>

        <lifetime>
            <min>0</min>
            <max>0</max>
        </lifetime>

        <layout>
        {layout}
        </layout>

        <structure>
        {key}

        %s

        {parent}
        </structure>
    </dictionary>
    </dictionaries>
    '''
    attribute_skeleton = '''
    <attribute>
        <name>%s_</name>
        <type>%s</type>
        <null_value>%s</null_value>
    </attribute>
    '''

    # substitute the attribute list (one attribute per type, with its
    # implicit default as <null_value>) into the skeleton
    dictionary_skeleton =\
        dictionary_skeleton % reduce(
            lambda xml, (type, default): xml + attribute_skeleton % (type, type, default),
            zip(types, implicit_defaults), '')

    source_file = '''
    <file>
        <path>%s</path>
        <format>TabSeparated</format>
    </file>
    '''
    source_clickhouse = '''
    <clickhouse>
        <host>127.0.0.1</host>
        <port>9001</port>
        <user>default</user>
        <password></password>
        <db>test</db>
        <table>dictionary_source</table>
    </clickhouse>
    '''
    source_mysql = '''
    <mysql>
        <host>127.0.0.1</host>
        <port>3306</port>
        <user>root</user>
        <password></password>
        <db>test</db>
        <table>dictionary_source</table>
    </mysql>
    '''
    source_mongodb = '''
    <mongodb>
        <host>localhost</host>
        <port>27017</port>
        <user></user>
        <password></password>
        <db>test</db>
        <collection>dictionary_source</collection>
    </mongodb>
    '''

    layout_flat = '<flat />'
    layout_hashed = '<hashed />'
    layout_cache = '<cache><size_in_cells>128</size_in_cells></cache>'
    layout_complex_key_hashed = '<complex_key_hashed />'
    layout_complex_key_cache = '<complex_key_cache><size_in_cells>128</size_in_cells></complex_key_cache>'

    key_simple = '''
    <id>
        <name>id</name>
    </id>
    '''
    key_complex_integers = '''
    <key>
        <attribute>
            <name>key0</name>
            <type>UInt8</type>
        </attribute>

        <attribute>
            <name>key1</name>
            <type>UInt8</type>
        </attribute>
    </key>
    '''
    key_complex_mixed = '''
    <key>
        <attribute>
            <name>key0_str</name>
            <type>String</type>
        </attribute>

        <attribute>
            <name>key1</name>
            <type>UInt8</type>
        </attribute>
    </key>
    '''

    keys = [ key_simple, key_complex_integers, key_complex_mixed ]

    parent_attribute = '''
    <attribute>
        <name>Parent</name>
        <type>UInt64</type>
        <hierarchical>true</hierarchical>
        <null_value>0</null_value>
    </attribute>
    '''

    sources_and_layouts = [
        # Simple key dictionaries
        [ source_file % (generated_prefix + files[0]), layout_flat ],
        [ source_clickhouse, layout_flat ],
        [ source_mysql, layout_flat ],
        [ source_mongodb, layout_flat ],
        [ source_file % (generated_prefix + files[0]), layout_hashed ],
        [ source_clickhouse, layout_hashed ],
        [ source_mysql, layout_hashed ],
        [ source_mongodb, layout_hashed ],
        [ source_clickhouse, layout_cache ],
        [ source_mysql, layout_cache ],
        [ source_mongodb, layout_cache ],

        # Complex key dictionaries with (UInt8, UInt8) key
        [ source_file % (generated_prefix + files[1]), layout_complex_key_hashed ],
        [ source_clickhouse, layout_complex_key_hashed ],
        [ source_mysql, layout_complex_key_hashed ],
        [ source_mongodb, layout_complex_key_hashed ],
        [ source_clickhouse, layout_complex_key_cache ],
        [ source_mysql, layout_complex_key_cache ],
        [ source_mongodb, layout_complex_key_cache ],

        # Complex key dictionaries with (String, UInt8) key
        [ source_file % (generated_prefix + files[2]), layout_complex_key_hashed ],
        [ source_clickhouse, layout_complex_key_hashed ],
        [ source_mysql, layout_complex_key_hashed ],
        [ source_mongodb, layout_complex_key_hashed ],
        [ source_clickhouse, layout_complex_key_cache ],
        [ source_mysql, layout_complex_key_cache ],
        [ source_mongodb, layout_complex_key_cache ],
    ]

    for (name, key_idx, has_parent), (source, layout) in zip(dictionaries, sources_and_layouts):
        filename = 'generated/dictionary_%s.xml' % name
        with open(filename, 'w') as file:
            dictionary_xml = dictionary_skeleton.format(
                key = keys[key_idx],
                parent = parent_attribute if has_parent else '',
                **locals())
            file.write(dictionary_xml)
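
# For illustration, the generated config for the 'file_flat' dictionary comes
# out roughly like this (attribute list elided):
#
#   <dictionaries>
#   <dictionary>
#       <name>file_flat</name>
#       <source>
#           <file>
#               <path>.../generated/key_simple.tsv</path>
#               <format>TabSeparated</format>
#           </file>
#       </source>
#       <lifetime><min>0</min><max>0</max></lifetime>
#       <layout><flat /></layout>
#       <structure>
#           <id><name>id</name></id>
#           ...
#           <attribute>
#               <name>Parent</name>
#               <type>UInt64</type>
#               <hierarchical>true</hierarchical>
#               <null_value>0</null_value>
#           </attribute>
#       </structure>
#   </dictionary>
#   </dictionaries>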

def run_tests():
    keys = [ 'toUInt64(n)', '(n, n)', '(toString(n), n)' ]
    dict_get_query_skeleton = "select dictGet{type}('{name}', '{type}_', {key}) array join range(8) as n;"
    dict_has_query_skeleton = "select dictHas('{name}', {key}) array join range(8) as n;"
    dict_get_or_default_query_skeleton = "select dictGet{type}OrDefault('{name}', '{type}_', {key}, to{type}({default})) array join range(8) as n;"
    dict_hierarchy_query_skeleton = "select dictGetHierarchy('{name}' as d, key), dictIsIn(d, key, toUInt64(1)), dictIsIn(d, key, key) array join range(toUInt64(8)) as key;"

    # run a query and compare its output against the stored reference file
    def test_query(dict, query, reference, name):
        result = system('{ch} --port 9001 --query "{query}" | diff - reference/{reference}.reference'.format(ch=clickhouse_binary, **locals()))

        if result != 0:
            print 'Dictionary ' + dict + ' has failed test ' + name + '\n'
            if not continue_on_error:
                exit(-1)
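
    # For example, for the simple-key 'file_flat' dictionary and type UInt8,
    # dict_get_query_skeleton expands to:
    #   select dictGetUInt8('file_flat', 'UInt8_', toUInt64(n)) array join range(8) as n;
    # and its output is diffed against reference/UInt8.reference.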

    print 'Waiting for dictionaries to load...'
    time.sleep(wait_for_loading_sleep_time_sec)

    # the actual tests
    for (name, key_idx, has_parent) in dictionaries:
        key = keys[key_idx]
        print 'Testing dictionary', name

        # query dictHas
        test_query(name, dict_has_query_skeleton.format(**locals()), 'has', 'dictHas')

        # query dictGet*
        for type, default in zip(types, explicit_defaults):
            test_query(name,
                dict_get_query_skeleton.format(**locals()),
                type, 'dictGet' + type)
            test_query(name,
                dict_get_or_default_query_skeleton.format(**locals()),
                type + 'OrDefault', 'dictGet' + type + 'OrDefault')

        # query dictGetHierarchy, dictIsIn
        if has_parent:
            test_query(name,
                dict_hierarchy_query_skeleton.format(**locals()),
                'hierarchy', 'dictGetHierarchy, dictIsIn')


generate_data()
generate_dictionaries()
run_tests()

print 'Done'