ClickHouse/tests/integration/test_dictionaries_all_layouts_and_sources/dictionary.py

383 lines
14 KiB
Python
Raw Normal View History

2019-05-28 20:17:30 +00:00
# -*- coding: utf-8 -*-
2019-02-21 16:43:21 +00:00
import copy
2019-02-21 12:04:08 +00:00
class Layout(object):
    """A dictionary layout: its name, its category and its XML snippet.

    The category is derived from the layout name: names starting with
    ``complex`` are complex-key layouts, names starting with ``range`` are
    ranged layouts, and every other name is treated as a simple layout.
    """

    # XML fragment placed inside <layout>...</layout> for every known layout name.
    LAYOUTS_STR_DICT = {
        'flat': '<flat/>',
        'hashed': '<hashed/>',
        'cache': '<cache><size_in_cells>128</size_in_cells></cache>',
        'ssd_cache': '<ssd_cache><path>/etc/clickhouse/dictionaries/all</path><max_stored_keys>128</max_stored_keys></ssd_cache>',
        'complex_key_hashed': '<complex_key_hashed/>',
        'complex_key_hashed_one_key': '<complex_key_hashed/>',
        'complex_key_hashed_two_keys': '<complex_key_hashed/>',
        'complex_key_cache': '<complex_key_cache><size_in_cells>128</size_in_cells></complex_key_cache>',
        'complex_key_ssd_cache': '<complex_key_ssd_cache><path>/etc/clickhouse/dictionaries/all</path><max_stored_keys>128</max_stored_keys></complex_key_ssd_cache>',
        'range_hashed': '<range_hashed/>',
        'direct': '<direct/>',
        'complex_key_direct': '<complex_key_direct/>'
    }

    def __init__(self, name):
        """Remember the layout name and classify it by its prefix."""
        self.name = name

        if name.startswith('complex'):
            self.layout_type = 'complex'
        elif name.startswith('range'):
            self.layout_type = 'ranged'
        else:
            self.layout_type = 'simple'

        # Exactly one of the three flags is True.
        self.is_complex = self.layout_type == 'complex'
        self.is_simple = self.layout_type == 'simple'
        self.is_ranged = self.layout_type == 'ranged'

    def get_str(self):
        """Return the XML fragment for this layout.

        Raises KeyError when the name is not present in LAYOUTS_STR_DICT.
        """
        return self.LAYOUTS_STR_DICT[self.name]

    def get_key_block_name(self):
        """Return the tag that wraps the key definition: 'key' for complex layouts, 'id' otherwise."""
        return 'key' if self.is_complex else 'id'
2019-02-21 12:04:08 +00:00
class Row(object):
    """One data row, stored as a mapping from field name to value."""

    def __init__(self, fields, values):
        """Pair each field with the value at the same position.

        `fields` are objects exposing a `name` attribute. Pairing stops at
        the shorter of the two sequences, and a repeated field name keeps
        the last value seen.
        """
        self.data = {field.name: value for field, value in zip(fields, values)}

    def has_field(self, name):
        """Tell whether the row holds a value for the field called `name`."""
        return name in self.data

    def get_value_by_name(self, name):
        """Return the value stored under `name`; raises KeyError if absent."""
        return self.data[name]

    def set_value(self, name, value):
        """Store `value` under `name`, adding the entry if it did not exist."""
        self.data[name] = value
2019-02-21 12:04:08 +00:00
class Field(object):
    """Description of one dictionary column and its XML renderings.

    Attributes:
        name: column name.
        field_type: type name written into <type> and used by callers to
            build ``to<Type>(...)`` casts.
        is_key: whether the column is part of the dictionary key.
        is_range_key: whether the column is the value looked up against the range.
        default: value written into <null_value>; None means "leave empty".
        hierarchical: whether the attribute is flagged as hierarchical.
        range_hash_type: suffix of the <range_...> tag; when it is not None
            the field is a range boundary (`is_range` is True).
        default_value_for_get: value callers use when building
            ``dictGetOrDefault`` queries.
    """

    def __init__(self, name, field_type, is_key=False, is_range_key=False, default=None, hierarchical=False, range_hash_type=None, default_value_for_get=None):
        self.name = name
        self.field_type = field_type
        self.is_key = is_key
        self.default = default
        self.hierarchical = hierarchical
        self.range_hash_type = range_hash_type
        self.is_range = self.range_hash_type is not None
        self.is_range_key = is_range_key
        self.default_value_for_get = default_value_for_get

    def get_attribute_str(self):
        """Render the field as an <attribute> element."""
        # Only a missing default is rendered as empty; explicit falsy
        # defaults such as 0 must still reach the config.
        null_value = '' if self.default is None else self.default
        return '''
<attribute>
<name>{name}</name>
<type>{field_type}</type>
<null_value>{default}</null_value>
<hierarchical>{hierarchical}</hierarchical>
</attribute>'''.format(
            name=self.name,
            field_type=self.field_type,
            default=null_value,
            hierarchical='true' if self.hierarchical else 'false',
        )

    def get_simple_index_str(self):
        """Render only the <name> element, as used for a non-complex key."""
        return '<name>{name}</name>'.format(name=self.name)

    def get_range_hash_str(self):
        """Render the <range_...> element for a range boundary field.

        Raises Exception when the field has no range_hash_type.
        """
        if not self.range_hash_type:
            raise Exception("Field {} is not range hashed".format(self.name))
        return '''
<range_{type}>
<name>{name}</name>
</range_{type}>
'''.format(type=self.range_hash_type, name=self.name)
class DictionaryStructure(object):
    """Fields of a dictionary grouped by role, plus builders for config XML and test queries.

    Fields are split into `keys`, `range_fields` and `ordinary_fields`;
    `range_key` is the single field flagged `is_range_key`, and
    `has_hierarchy` is True when any field is hierarchical.
    """

    def __init__(self, layout, fields):
        """Classify `fields` and validate them against `layout`.

        Raises Exception when more than one field is flagged as range key,
        when a non-complex layout gets more than one key, or when a ranged
        layout lacks a range key or does not have exactly two range fields.
        """
        self.layout = layout
        self.keys = []
        self.range_key = None
        self.ordinary_fields = []
        self.range_fields = []
        self.has_hierarchy = False

        for field in fields:
            if field.is_key:
                self.keys.append(field)
            elif field.is_range:
                self.range_fields.append(field)
            else:
                self.ordinary_fields.append(field)

            if field.hierarchical:
                self.has_hierarchy = True

            if field.is_range_key:
                if self.range_key is not None:
                    raise Exception("Duplicate range key {}".format(field.name))
                self.range_key = field

        if not self.layout.is_complex and len(self.keys) > 1:
            raise Exception("More than one key {} field in non complex layout {}".format(len(self.keys), self.layout.name))

        if self.layout.is_ranged and (not self.range_key or len(self.range_fields) != 2):
            raise Exception("Inconsistent configuration of ranged dictionary")

    def get_structure_str(self):
        """Return the <layout> and <structure> XML for this dictionary."""
        attribute_strs = [field.get_attribute_str() for field in self.ordinary_fields]

        if self.layout.is_complex:
            key_strs = [key_field.get_attribute_str() for key_field in self.keys]
        else:  # same for simple and ranged
            key_strs = [key_field.get_simple_index_str() for key_field in self.keys]

        ranged_strs = []
        if self.layout.is_ranged:
            ranged_strs = [range_field.get_range_hash_str() for range_field in self.range_fields]

        return '''
<layout>
{layout_str}
</layout>
<structure>
<{key_block_name}>
{key_str}
</{key_block_name}>
{range_strs}
{attributes_str}
</structure>'''.format(
            layout_str=self.layout.get_str(),
            key_block_name=self.layout.get_key_block_name(),
            key_str='\n'.join(key_strs),
            attributes_str='\n'.join(attribute_strs),
            range_strs='\n'.join(ranged_strs),
        )

    def get_ordered_names(self):
        """Return field names ordered as keys, then range fields, then ordinary fields."""
        return [field.name for field in self.get_all_fields()]

    def get_all_fields(self):
        """Return all fields ordered as keys, then range fields, then ordinary fields."""
        return self.keys + self.range_fields + self.ordinary_fields

    @staticmethod
    def _typed_literal(field_type, value):
        """Return ``to<field_type>(<value>)``, single-quoting `value` when it is a str."""
        if isinstance(value, str):
            value = "'" + value + "'"
        return 'to{type}({value})'.format(type=field_type, value=value)

    def _get_key_expression(self, row, or_default):
        """Return the ', <key>' argument of a dict* call.

        Key values come from `row`, or from each key's `default_value_for_get`
        when `or_default` is set.
        """
        if not self.layout.is_complex:
            key = self.keys[0]
            value = key.default_value_for_get if or_default else row.data[key.name]
            return ', toUInt64({})'.format(value)

        parts = []
        for key in self.keys:
            value = key.default_value_for_get if or_default else row.data[key.name]
            parts.append(self._typed_literal(key.field_type, value))
        return ', tuple(' + ','.join(parts) + ')'

    def _get_dict_get_common_expression(self, dict_name, field, row, or_default, with_type, has):
        """Build one dictGet / dictGet<Type> / dictGet...OrDefault / dictHas expression.

        Args:
            dict_name: dictionary name placed in the first argument.
            field: attribute to read; must not be a key field.
            row: object whose `data` maps field names to values.
            or_default: build the ``OrDefault`` variant (not available for
                ranged layouts, requires `field.default_value_for_get`).
            with_type: append the field type to the function name.
            has: build ``dictHas`` instead; only the key argument is kept.

        Raises Exception for a key field, for `or_default` on a ranged
        layout, or for `or_default` without a `default_value_for_get`.
        """
        # Compare by name: Dictionary keeps a deep copy of the structure, so
        # the caller's Field objects are never identical to the ones in self.keys.
        if any(key.name == field.name for key in self.keys):
            raise Exception("Trying to receive key field {} from dictionary".format(field.name))

        key_expr = self._get_key_expression(row, or_default)

        date_expr = ''
        if self.layout.is_ranged:
            range_value = row.data[self.range_key.name]
            date_expr = ', ' + self._typed_literal(self.range_key.field_type, range_value)
            if or_default:
                raise Exception("Cannot create 'dictGetOrDefault' query for ranged dictionary")

        if has:
            # dictHas takes only the dictionary name and the key.
            return "dictHas('{dict_name}'{key_expr})".format(dict_name=dict_name, key_expr=key_expr)

        if or_default:
            or_default_expr = 'OrDefault'
            if field.default_value_for_get is None:
                raise Exception("Cannot create 'dictGetOrDefault' query for field {} without default_value_for_get".format(field.name))
            default_value_for_get = ', ' + self._typed_literal(field.field_type, field.default_value_for_get)
        else:
            or_default_expr = ''
            default_value_for_get = ''

        field_type = field.field_type if with_type else ''
        field_name = ", '" + field.name + "'"

        return "dictGet{field_type}{or_default}('{dict_name}'{field_name}{key_expr}{date_expr}{def_for_get})".format(
            field_type=field_type,
            dict_name=dict_name,
            field_name=field_name,
            key_expr=key_expr,
            date_expr=date_expr,
            or_default=or_default_expr,
            def_for_get=default_value_for_get,
        )

    def get_get_expressions(self, dict_name, field, row):
        """Return the untyped and typed dictGet expressions for `field` in `row`."""
        return [
            self._get_dict_get_common_expression(dict_name, field, row, or_default=False, with_type=False, has=False),
            self._get_dict_get_common_expression(dict_name, field, row, or_default=False, with_type=True, has=False),
        ]

    def get_get_or_default_expressions(self, dict_name, field, row):
        """Return the untyped and typed dictGet...OrDefault expressions; empty for ranged layouts."""
        if not self.layout.is_ranged:
            return [
                self._get_dict_get_common_expression(dict_name, field, row, or_default=True, with_type=False, has=False),
                self._get_dict_get_common_expression(dict_name, field, row, or_default=True, with_type=True, has=False),
            ]
        return []

    def get_has_expressions(self, dict_name, field, row):
        """Return the dictHas expression for the key in `row`; empty for ranged layouts."""
        if not self.layout.is_ranged:
            return [self._get_dict_get_common_expression(dict_name, field, row, or_default=False, with_type=False, has=True)]
        return []

    def get_hierarchical_expressions(self, dict_name, row):
        """Return the dictGetHierarchy expression for `row`; empty unless the layout is simple."""
        if self.layout.is_simple:
            key_expr = 'toUInt64({})'.format(row.data[self.keys[0].name])
            return [
                "dictGetHierarchy('{dict_name}', {key})".format(
                    dict_name=dict_name,
                    key=key_expr,
                ),
            ]
        return []

    def get_is_in_expressions(self, dict_name, row, parent_row):
        """Return the dictIsIn expression for `row` under `parent_row`; empty unless the layout is simple."""
        if self.layout.is_simple:
            child_key_expr = 'toUInt64({})'.format(row.data[self.keys[0].name])
            parent_key_expr = 'toUInt64({})'.format(parent_row.data[self.keys[0].name])
            return [
                "dictIsIn('{dict_name}', {child_key}, {parent_key})".format(
                    dict_name=dict_name,
                    child_key=child_key_expr,
                    parent_key=parent_key_expr,
                )
            ]
        return []
2019-02-21 12:04:08 +00:00
class Dictionary(object):
    """A named dictionary: its structure, its source, and where its config is written.

    The structure and source passed to the constructor are deep-copied, so
    this object works on its own copies; `fields` is kept by reference.
    """

    # Config written for every layout whose XML does not contain 'direct'.
    _CONFIG_WITH_LIFETIME = '''
<yandex>
<dictionary>
<lifetime>
<min>{min_lifetime}</min>
<max>{max_lifetime}</max>
</lifetime>
<name>{name}</name>
{structure}
<source>
{source}
</source>
</dictionary>
</yandex>
'''

    # Config written for layouts whose XML contains 'direct': no <lifetime> block.
    _CONFIG_WITHOUT_LIFETIME = '''
<yandex>
<dictionary>
<name>{name}</name>
{structure}
<source>
{source}
</source>
</dictionary>
</yandex>
'''

    def __init__(self, name, structure, source, config_path,
                 table_name, fields, min_lifetime=3, max_lifetime=5):
        self.name = name
        self.structure = copy.deepcopy(structure)
        self.source = copy.deepcopy(source)
        self.config_path = config_path
        self.table_name = table_name
        self.fields = fields
        self.min_lifetime = min_lifetime
        self.max_lifetime = max_lifetime

    def generate_config(self):
        """Write the dictionary XML config to `config_path`, overwriting the file."""
        with open(self.config_path, 'w') as result:
            if 'direct' in self.structure.layout.get_str():
                template = self._CONFIG_WITHOUT_LIFETIME
            else:
                template = self._CONFIG_WITH_LIFETIME
            # str.format ignores keyword arguments the template does not use,
            # so both templates can share one call.
            result.write(template.format(
                min_lifetime=self.min_lifetime,
                max_lifetime=self.max_lifetime,
                name=self.name,
                structure=self.structure.get_structure_str(),
                source=self.source.get_source_str(self.table_name),
            ))

    def prepare_source(self, cluster):
        """Ask the source to prepare itself for this structure and table on `cluster`."""
        self.source.prepare(self.structure, self.table_name, cluster)

    def load_data(self, data):
        """Load `data` into the source table; raises Exception if the source is not prepared."""
        if not self.source.prepared:
            raise Exception("Cannot load data for dictionary {}, source is not prepared".format(self.name))
        self.source.load_data(data, self.table_name)

    @staticmethod
    def _as_selects(expressions):
        """Prefix every expression with 'select '."""
        return ['select {}'.format(expr) for expr in expressions]

    def get_select_get_queries(self, field, row):
        """Return select queries for the structure's dictGet expressions."""
        return self._as_selects(self.structure.get_get_expressions(self.name, field, row))

    def get_select_get_or_default_queries(self, field, row):
        """Return select queries for the structure's dictGet...OrDefault expressions."""
        return self._as_selects(self.structure.get_get_or_default_expressions(self.name, field, row))

    def get_select_has_queries(self, field, row):
        """Return select queries for the structure's dictHas expressions."""
        return self._as_selects(self.structure.get_has_expressions(self.name, field, row))

    def get_hierarchical_queries(self, row):
        """Return select queries for the structure's dictGetHierarchy expressions."""
        return self._as_selects(self.structure.get_hierarchical_expressions(self.name, row))

    def get_is_in_queries(self, row, parent_row):
        """Return select queries for the structure's dictIsIn expressions."""
        return self._as_selects(self.structure.get_is_in_expressions(self.name, row, parent_row))

    def is_complex(self):
        """Tell whether the structure's layout is a complex-key layout."""
        return self.structure.layout.is_complex

    def get_fields(self):
        """Return the field list given to the constructor."""
        return self.fields