Removed obsolete scripts

Alexey Milovidov 2019-04-22 19:10:49 +03:00
parent e5ca222129
commit 58b26b4279
13 changed files with 0 additions and 1331 deletions


@@ -1,262 +0,0 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys
import argparse
import tempfile
import random
import subprocess
import bisect
from copy import deepcopy
# Pseudo-random generator of unique numbers.
# http://preshing.com/20121224/how-to-generate-a-sequence-of-unique-random-integers/
class UniqueRandomGenerator:
prime = 4294967291
def __init__(self, seed_base, seed_offset):
self.index = self.permutePQR(self.permutePQR(seed_base) + 0x682f0161)
self.intermediate_offset = self.permutePQR(self.permutePQR(seed_offset) + 0x46790905)
def next(self):
val = self.permutePQR((self.permutePQR(self.index) + self.intermediate_offset) ^ 0x5bf03635)
self.index = self.index + 1
return val
def permutePQR(self, x):
if x >= self.prime:
return x
else:
residue = (x * x) % self.prime
if x <= self.prime/2:
return residue
else:
return self.prime - residue
# Create a table containing unique values.
def generate_data_source(host, port, http_port, min_cardinality, max_cardinality, count):
chunk_size = round((max_cardinality - min_cardinality) / float(count))
used_values = 0
cur_count = 0
next_size = 0
sup = 32768
n1 = random.randrange(0, sup)
n2 = random.randrange(0, sup)
urng = UniqueRandomGenerator(n1, n2)
is_first = True
with tempfile.TemporaryDirectory() as tmp_dir:
filename = tmp_dir + '/table.txt'
with open(filename, 'w+b') as file_handle:
while cur_count < count:
if is_first == True:
is_first = False
if min_cardinality != 0:
next_size = min_cardinality + 1
else:
next_size = chunk_size
else:
next_size += chunk_size
while used_values < next_size:
h = urng.next()
used_values = used_values + 1
out = str(h) + "\t" + str(cur_count) + "\n";
file_handle.write(bytes(out, 'UTF-8'));
cur_count = cur_count + 1
query = "DROP TABLE IF EXISTS data_source"
subprocess.check_output(["clickhouse-client", "--host", host, "--port", str(port), "--query", query])
query = "CREATE TABLE data_source(UserID UInt64, KeyID UInt64) ENGINE=TinyLog"
subprocess.check_output(["clickhouse-client", "--host", host, "--port", str(port), "--query", query])
cat = subprocess.Popen(("cat", filename), stdout=subprocess.PIPE)
subprocess.check_output(("POST", "http://{0}:{1}/?query=INSERT INTO data_source FORMAT TabSeparated".format(host, http_port)), stdin=cat.stdout)
cat.wait()
def perform_query(host, port):
query = "SELECT runningAccumulate(uniqExactState(UserID)) AS exact, "
query += "runningAccumulate(uniqCombinedRawState(UserID)) AS approx "
query += "FROM data_source GROUP BY KeyID"
return subprocess.check_output(["clickhouse-client", "--host", host, "--port", port, "--query", query])
def parse_clickhouse_response(response):
parsed = []
lines = response.decode().split("\n")
for cur_line in lines:
rows = cur_line.split("\t")
if len(rows) == 2:
parsed.append([float(rows[0]), float(rows[1])])
return parsed
def accumulate_data(accumulated_data, data):
if not accumulated_data:
accumulated_data = deepcopy(data)
else:
for row1, row2 in zip(accumulated_data, data):
row1[1] += row2[1];
return accumulated_data
def generate_raw_result(accumulated_data, count):
expected_tab = []
bias_tab = []
for row in accumulated_data:
exact = row[0]
expected = row[1] / count
bias = expected - exact
expected_tab.append(expected)
bias_tab.append(bias)
return [ expected_tab, bias_tab ]
def generate_sample(raw_estimates, biases, n_samples):
result = []
min_card = raw_estimates[0]
max_card = raw_estimates[len(raw_estimates) - 1]
step = (max_card - min_card) / (n_samples - 1)
for i in range(0, n_samples + 1):
x = min_card + i * step
j = bisect.bisect_left(raw_estimates, x)
if j == len(raw_estimates):
result.append((raw_estimates[j - 1], biases[j - 1]))
elif raw_estimates[j] == x:
result.append((raw_estimates[j], biases[j]))
else:
# Find the 6 nearest neighbors and take the arithmetic mean.
# 6 points to the left of x: [j-6 j-5 j-4 j-3 j-2 j-1]
begin = max(j - 6, 0) - 1
end = j - 1
T = []
for k in range(end, begin, -1):
T.append(x - raw_estimates[k])
# 6 points to the right of x: [j j+1 j+2 j+3 j+4 j+5]
begin = j
end = min(j + 5, len(raw_estimates) - 1) + 1
U = []
for k in range(begin, end):
U.append(raw_estimates[k] - x)
# Merge the distances.
V = []
lim = min(len(T), len(U))
k1 = 0
k2 = 0
while k1 < lim and k2 < lim:
if T[k1] == U[k2]:
V.append(j - k1 - 1)
V.append(j + k2)
k1 = k1 + 1
k2 = k2 + 1
elif T[k1] < U[k2]:
V.append(j - k1 - 1)
k1 = k1 + 1
else:
V.append(j + k2)
k2 = k2 + 1
if k1 < len(T):
while k1 < len(T):
V.append(j - k1 - 1)
k1 = k1 + 1
elif k2 < len(U):
while k2 < len(U):
V.append(j + k2)
k2 = k2 + 1
# Pick the 6 nearest points.
# Compute the averages.
begin = 0
end = min(len(V), 6)
sum = 0
bias = 0
for k in range(begin, end):
sum += raw_estimates[V[k]]
bias += biases[V[k]]
sum /= float(end)
bias /= float(end)
result.append((sum, bias))
# Skip consecutive results whose estimates are identical.
final_result = []
last = -1
for entry in result:
if entry[0] != last:
final_result.append((entry[0], entry[1]))
last = entry[0]
return final_result
def dump_arrays(data):
print("Size of each array: {0}\n".format(len(data)))
is_first = True
sep = ''
print("raw_estimates = ")
print("{")
for row in data:
print("\t{0}{1}".format(sep, row[0]))
if is_first == True:
is_first = False
sep = ","
print("};")
is_first = True
sep = ""
print("\nbiases = ")
print("{")
for row in data:
print("\t{0}{1}".format(sep, row[1]))
if is_first == True:
is_first = False
sep = ","
print("};")
def start():
parser = argparse.ArgumentParser(description = "Generate bias correction tables for HyperLogLog-based functions.")
parser.add_argument("-x", "--host", default="localhost", help="ClickHouse server host name");
parser.add_argument("-p", "--port", type=int, default=9000, help="ClickHouse server TCP port");
parser.add_argument("-t", "--http_port", type=int, default=8123, help="ClickHouse server HTTP port");
parser.add_argument("-i", "--iterations", type=int, default=5000, help="number of iterations");
parser.add_argument("-m", "--min_cardinality", type=int, default=16384, help="minimal cardinality");
parser.add_argument("-M", "--max_cardinality", type=int, default=655360, help="maximal cardinality");
parser.add_argument("-s", "--samples", type=int, default=200, help="number of sampled values");
args = parser.parse_args()
accumulated_data = []
for i in range(0, args.iterations):
print(i + 1)
sys.stdout.flush()
generate_data_source(args.host, str(args.port), str(args.http_port), args.min_cardinality, args.max_cardinality, 1000)
response = perform_query(args.host, str(args.port))
data = parse_clickhouse_response(response)
accumulated_data = accumulate_data(accumulated_data, data)
result = generate_raw_result(accumulated_data, args.iterations)
sampled_data = generate_sample(result[0], result[1], args.samples)
dump_arrays(sampled_data)
if __name__ == "__main__": start()
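
For orientation, here is a minimal sketch of how the dumped raw_estimates and biases arrays could be applied to correct a raw estimate, assuming they are loaded as plain Python lists and the bias is linearly interpolated between the two nearest table entries (the array values below are placeholders, not real output):

import bisect

raw_estimates = [700.0, 1500.0, 3100.0]   # placeholder values, not real output
biases        = [ 90.0,  150.0,  230.0]   # placeholder values, not real output

def correct(raw):
    # Subtract the interpolated bias from a raw uniqCombinedRaw estimate.
    if raw <= raw_estimates[0]:
        return raw - biases[0]
    if raw >= raw_estimates[-1]:
        return raw - biases[-1]
    j = bisect.bisect_left(raw_estimates, raw)
    x0, x1 = raw_estimates[j - 1], raw_estimates[j]
    b0, b1 = biases[j - 1], biases[j]
    t = (raw - x0) / (x1 - x0)
    return raw - (b0 + t * (b1 - b0))

print(correct(2000.0))   # 2000 - (150 + 0.3125 * 80) = 1825.0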


@@ -1 +0,0 @@
Hits table generator based on an LSTM neural network trained on real hits. To generate data you need either pre-trained model weights or to train the model on real hits yourself.
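
For reference, a minimal sketch of the two-step workflow that the train and generate scripts below wrap, using the Model class from model.py; the file names, iteration count and checkpoint path are illustrative only:

from model import Model

# Step 1 (one process): train on a TSV of real hits and save checkpoints.
Model(learning_rate=0.0001).train('hits.tsv', 'save', num_iters=10000, batch_size=64)

# Step 2 (a separate process): restore a checkpoint and generate synthetic hits.
Model().generate(100000, 'out.tsv', 'save/10000_iters')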


@@ -1,22 +0,0 @@
import argparse
from model import Model
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-n', type=int, default=100000,
help='number of objects to generate')
parser.add_argument('--output_file', type=str, default='out.tsv',
help='output file name')
parser.add_argument('--weights_path', type=str,
help='path to weights')
args = parser.parse_args()
if __name__ == '__main__':
if not args.weights_path:
raise Exception('please specify path to model weights with --weights_path')
gen = Model()
gen.generate(args.n, args.output_file, args.weights_path)


@@ -1,147 +0,0 @@
import numpy as np
import os
import pickle
import tensorflow as tf
from random import sample
from keras.layers import Dense, Embedding
from tqdm import tqdm
RNN_NUM_UNITS = 256
EMB_SIZE = 32
MAX_LENGTH = 1049
with open('tokens', 'rb') as f:
tokens = pickle.load(f)
n_tokens = len(tokens)
token_to_id = {c: i for i, c in enumerate(tokens)}
def to_matrix(objects, max_len=None, pad=0, dtype='int32'):
max_len = max_len or max(map(len, objects))
matrix = np.zeros([len(objects), max_len], dtype) + pad
for i in range(len(objects)):
name_ix = list(map(token_to_id.get, objects[i]))
matrix[i, :len(name_ix)] = name_ix
return matrix.T
class Model:
def __init__(self, learning_rate=0.0001):
# an embedding layer that converts character ids into embeddings
self.embed_x = Embedding(n_tokens, EMB_SIZE)
get_h_next = Dense(1024, activation='relu')
# a dense layer that maps current hidden state
# to probabilities of characters [h_t+1]->P(x_t+1|h_t+1)
self.get_probas = Dense(n_tokens, activation='softmax')
self.input_sequence = tf.placeholder('int32', (MAX_LENGTH, None))
batch_size = tf.shape(self.input_sequence)[1]
self.gru_cell_first = tf.nn.rnn_cell.GRUCell(RNN_NUM_UNITS)
self.lstm_cell_second = tf.nn.rnn_cell.LSTMCell(RNN_NUM_UNITS)
h_prev_first = self.gru_cell_first.zero_state(batch_size, dtype=tf.float32)
h_prev_second = tf.nn.rnn_cell.LSTMStateTuple(
tf.zeros([batch_size, RNN_NUM_UNITS]), # initial cell state,
tf.zeros([batch_size, RNN_NUM_UNITS]) # initial hidden state
)
predicted_probas = []
for t in range(MAX_LENGTH):
x_t = self.input_sequence[t]
# convert character id into embedding
x_t_emb = self.embed_x(tf.reshape(x_t, [-1, 1]))[:, 0]
out_next_first, h_next_first = self.gru_cell_first(x_t_emb, h_prev_first)
h_prev_first = h_next_first
out_next_second, h_next_second = self.lstm_cell_second(out_next_first, h_prev_second)
h_prev_second = h_next_second
probas_next = self.get_probas(out_next_second)
predicted_probas.append(probas_next)
predicted_probas = tf.stack(predicted_probas)
predictions_matrix = tf.reshape(predicted_probas[:-1], [-1, len(tokens)])
answers_matrix = tf.one_hot(tf.reshape(self.input_sequence[1:], [-1]), n_tokens)
self.loss = tf.reduce_mean(tf.reduce_sum(
-answers_matrix * tf.log(tf.clip_by_value(predictions_matrix, 1e-7, 1.0)),
reduction_indices=[1]
))
optimizer = tf.train.AdamOptimizer(learning_rate)
gvs = optimizer.compute_gradients(self.loss)
capped_gvs = [(gr if gr is None else tf.clip_by_value(gr, -1., 1.), var) for gr, var in gvs]
self.optimize = optimizer.apply_gradients(capped_gvs)
self.sess = tf.Session()
self.sess.run(tf.global_variables_initializer())
self.saver = tf.train.Saver()
def train(self, train_data_path, save_dir, num_iters, batch_size=64, restore_from=False):
history = []
if restore_from:
with open(restore_from + '_history', 'rb') as f:
history = pickle.load(f)
self.saver.restore(self.sess, restore_from)
with open(train_data_path, 'r') as f:
train_data = f.readlines()
train_data = list(filter(lambda a: len(a) < MAX_LENGTH, train_data))
for i in tqdm(range(num_iters)):
batch = to_matrix(
list(map(lambda a: '\n' + a.rstrip('\n'), sample(train_data, batch_size))),
max_len=MAX_LENGTH
)
loss_i, _ = self.sess.run([self.loss, self.optimize], {self.input_sequence: batch})
history.append(loss_i)
if len(history) % 2000 == 0:
self.saver.save(self.sess, os.path.join(save_dir, '{}_iters'.format(len(history))))
self.saver.save(self.sess, os.path.join(save_dir, '{}_iters'.format(len(history))))
with open(os.path.join(save_dir, '{}_iters_history'.format(len(history))), 'wb') as f:
pickle.dump(history, f)
def generate(self, num_objects, output_file, weights_path):
self.saver.restore(self.sess, weights_path)
batch_size = num_objects
x_t = tf.placeholder('int32', (None, batch_size))
h_t_first = tf.Variable(tf.zeros([batch_size, RNN_NUM_UNITS]))
h_t_second = tf.nn.rnn_cell.LSTMStateTuple(
tf.Variable(tf.zeros([batch_size, RNN_NUM_UNITS])),
tf.Variable(tf.zeros([batch_size, RNN_NUM_UNITS]))
)
x_t_emb = self.embed_x(tf.reshape(x_t, [-1, 1]))[:, 0]
first_out_next, next_h_first = self.gru_cell_first(x_t_emb, h_t_first)
second_out_next, next_h_second = self.lstm_cell_second(first_out_next, h_t_second)
next_probs = self.get_probas(second_out_next)
x_sequence = np.zeros(shape=(1, batch_size), dtype=int) + token_to_id['\n']
self.sess.run(
[tf.assign(h_t_first, h_t_first.initial_value),
tf.assign(h_t_second[0], h_t_second[0].initial_value),
tf.assign(h_t_second[1], h_t_second[1].initial_value)]
)
for i in tqdm(range(MAX_LENGTH - 1)):
x_probs, _, _, _ = self.sess.run(
[next_probs,
tf.assign(h_t_second[0], next_h_second[0]),
tf.assign(h_t_second[1], next_h_second[1]),
tf.assign(h_t_first, next_h_first)],
{x_t: [x_sequence[-1, :]]}
)
next_char = [np.random.choice(n_tokens, p=x_probs[i]) for i in range(batch_size)]
if sum(next_char) == 0:
break
x_sequence = np.append(x_sequence, [next_char], axis=0)
with open(output_file, 'w') as f:
f.writelines([''.join([tokens[ix] for ix in x_sequence.T[k]]) + '\n' for k in range(batch_size)])
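
One detail worth noting in the code above: to_matrix returns a time-major matrix of shape [max_len, batch] padded with 0, which is why input_sequence is declared as (MAX_LENGTH, None). A self-contained toy illustration (its alphabet is made up and independent of the pickled tokens file):

import numpy as np

toy_tokens = ['\n', 'a', 'b', 'c']
toy_token_to_id = {c: i for i, c in enumerate(toy_tokens)}

def toy_to_matrix(objects, max_len=None, pad=0, dtype='int32'):
    max_len = max_len or max(map(len, objects))
    matrix = np.zeros([len(objects), max_len], dtype) + pad
    for i, obj in enumerate(objects):
        ids = [toy_token_to_id[ch] for ch in obj]
        matrix[i, :len(ids)] = ids
    return matrix.T  # rows are time steps, columns are batch items

print(toy_to_matrix(['ab', 'abc']))
# [[1 1]
#  [2 2]
#  [0 3]]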


@@ -1,3 +0,0 @@
Keras==2.0.6
numpy
tensorflow-gpu==1.4.0


@@ -1,506 +0,0 @@
(tokens: a pickle protocol 0 dump of a Python list containing 252 single-character strings, the byte alphabet that model.py loads to build token_to_id)


@@ -1,26 +0,0 @@
import argparse
from model import Model
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--n_iter', type=int, default=10000,
help='number of iterations')
parser.add_argument('--save_dir', type=str, default='save',
help='dir for saving weights')
parser.add_argument('--data_path', type=str,
help='path to train data')
parser.add_argument('--learning_rate', type=float, default=0.0001,
help='learning rate')
parser.add_argument('--batch_size', type=int, default=64,
help='batch size')
parser.add_argument('--restore_from', type=str,
help='path to train saved weights')
args = parser.parse_args()
if __name__ == '__main__':
if not args.data_path:
raise Exception('please specify path to train data with --data_path')
gen = Model(args.learning_rate)
gen.train(args.data_path, args.save_dir, args.n_iter, args.batch_size, args.restore_from)


@@ -1,150 +0,0 @@
#!/usr/bin/python3.4
# -*- coding: utf-8 -*-
import sys
import argparse
import tempfile
import random
import subprocess
import bisect
from copy import deepcopy
# Pseudo-random generator of unique numbers.
# http://preshing.com/20121224/how-to-generate-a-sequence-of-unique-random-integers/
class UniqueRandomGenerator:
prime = 4294967291
def __init__(self, seed_base, seed_offset):
self.index = self.permutePQR(self.permutePQR(seed_base) + 0x682f0161)
self.intermediate_offset = self.permutePQR(self.permutePQR(seed_offset) + 0x46790905)
def next(self):
val = self.permutePQR((self.permutePQR(self.index) + self.intermediate_offset) ^ 0x5bf03635)
self.index = self.index + 1
return val
def permutePQR(self, x):
if x >= self.prime:
return x
else:
residue = (x * x) % self.prime
if x <= self.prime/2:
return residue
else:
return self.prime - residue
# Create a table containing unique values.
def generate_data_source(host, port, http_port, min_cardinality, max_cardinality, count):
chunk_size = round((max_cardinality - (min_cardinality + 1)) / float(count))
used_values = 0
cur_count = 0
next_size = 0
sup = 32768
n1 = random.randrange(0, sup)
n2 = random.randrange(0, sup)
urng = UniqueRandomGenerator(n1, n2)
is_first = True
with tempfile.TemporaryDirectory() as tmp_dir:
filename = tmp_dir + '/table.txt'
with open(filename, 'w+b') as file_handle:
while cur_count < count:
if is_first == True:
is_first = False
if min_cardinality != 0:
next_size = min_cardinality + 1
else:
next_size = chunk_size
else:
next_size += chunk_size
while used_values < next_size:
h = urng.next()
used_values = used_values + 1
out = str(h) + "\t" + str(cur_count) + "\n";
file_handle.write(bytes(out, 'UTF-8'));
cur_count = cur_count + 1
query = "DROP TABLE IF EXISTS data_source"
subprocess.check_output(["clickhouse-client", "--host", host, "--port", str(port), "--query", query])
query = "CREATE TABLE data_source(UserID UInt64, KeyID UInt64) ENGINE=TinyLog"
subprocess.check_output(["clickhouse-client", "--host", host, "--port", str(port), "--query", query])
cat = subprocess.Popen(("cat", filename), stdout=subprocess.PIPE)
subprocess.check_output(("POST", "http://{0}:{1}/?query=INSERT INTO data_source FORMAT TabSeparated".format(host, http_port)), stdin=cat.stdout)
cat.wait()
def perform_query(host, port):
query = "SELECT runningAccumulate(uniqExactState(UserID)) AS exact, "
query += "runningAccumulate(uniqCombinedRawState(UserID)) AS raw, "
query += "runningAccumulate(uniqCombinedLinearCountingState(UserID)) AS linear_counting, "
query += "runningAccumulate(uniqCombinedBiasCorrectedState(UserID)) AS bias_corrected "
query += "FROM data_source GROUP BY KeyID"
return subprocess.check_output(["clickhouse-client", "--host", host, "--port", port, "--query", query])
def parse_clickhouse_response(response):
parsed = []
lines = response.decode().split("\n")
for cur_line in lines:
rows = cur_line.split("\t")
if len(rows) == 4:
parsed.append([float(rows[0]), float(rows[1]), float(rows[2]), float(rows[3])])
return parsed
def accumulate_data(accumulated_data, data):
if not accumulated_data:
accumulated_data = deepcopy(data)
else:
for row1, row2 in zip(accumulated_data, data):
row1[1] += row2[1];
row1[2] += row2[2];
row1[3] += row2[3];
return accumulated_data
def dump_graphs(data, count):
with open("raw_graph.txt", "w+b") as fh1, open("linear_counting_graph.txt", "w+b") as fh2, open("bias_corrected_graph.txt", "w+b") as fh3:
expected_tab = []
bias_tab = []
for row in data:
exact = row[0]
raw = row[1] / count;
linear_counting = row[2] / count;
bias_corrected = row[3] / count;
outstr = "{0}\t{1}\n".format(exact, abs(raw - exact) / exact)
fh1.write(bytes(outstr, 'UTF-8'))
outstr = "{0}\t{1}\n".format(exact, abs(linear_counting - exact) / exact)
fh2.write(bytes(outstr, 'UTF-8'))
outstr = "{0}\t{1}\n".format(exact, abs(bias_corrected - exact) / exact)
fh3.write(bytes(outstr, 'UTF-8'))
def start():
parser = argparse.ArgumentParser(description = "Generate graphs that help to determine the linear counting threshold.")
parser.add_argument("-x", "--host", default="localhost", help="clickhouse host name");
parser.add_argument("-p", "--port", type=int, default=9000, help="clickhouse client TCP port");
parser.add_argument("-t", "--http_port", type=int, default=8123, help="clickhouse HTTP port");
parser.add_argument("-i", "--iterations", type=int, default=5000, help="number of iterations");
parser.add_argument("-m", "--min_cardinality", type=int, default=16384, help="minimal cardinality");
parser.add_argument("-M", "--max_cardinality", type=int, default=655360, help="maximal cardinality");
args = parser.parse_args()
accumulated_data = []
for i in range(0, args.iterations):
print(i + 1)
sys.stdout.flush()
generate_data_source(args.host, str(args.port), str(args.http_port), args.min_cardinality, args.max_cardinality, 1000)
response = perform_query(args.host, str(args.port))
data = parse_clickhouse_response(response)
accumulated_data = accumulate_data(accumulated_data, data)
dump_graphs(accumulated_data, args.iterations)
if __name__ == "__main__": start()
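
One possible way to read the three dumped graph files, assuming the threshold of interest is the smallest cardinality at which the bias-corrected estimate starts beating linear counting (a sketch, not part of the original tooling):

def load(name):
    with open(name) as f:
        return [tuple(map(float, line.split('\t'))) for line in f if line.strip()]

linear = load('linear_counting_graph.txt')
corrected = load('bias_corrected_graph.txt')

for (cardinality, err_lc), (_, err_bc) in zip(linear, corrected):
    if err_bc < err_lc:
        print('bias correction overtakes linear counting near cardinality', cardinality)
        break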


@@ -1,10 +0,0 @@
#!/usr/bin/env bash
for (( i = 0; i < 1000; i++ )); do
if (( RANDOM % 10 )); then
clickhouse-client --port=9007 --query="INSERT INTO mt (x) SELECT rand64() AS x FROM system.numbers LIMIT 100000"
else
clickhouse-client --port=9007 --query="INSERT INTO mt (x) SELECT rand64() AS x FROM system.numbers LIMIT 300000"
fi
done


@@ -1,76 +0,0 @@
from __future__ import print_function
import argparse
import matplotlib.pyplot as plt
import ast
TMP_FILE='tmp.tsv'
def parse_args():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-f', '--file', default='data.tsv')
cfg = parser.parse_args()
return cfg
def draw():
place = dict()
max_coord = 0
global_top = 0
for line in open(TMP_FILE):
numbers = line.split('\t')
if len(numbers) <= 2:
continue
name = numbers[-2]
if numbers[0] == '1':
dx = int(numbers[3])
max_coord += dx
place[name] = [1, max_coord, 1, dx]
max_coord += dx
plt.plot([max_coord - 2 * dx, max_coord], [1, 1])
for line in open(TMP_FILE):
numbers = line.split('\t')
if len(numbers) <= 2:
continue
name = numbers[-2]
if numbers[0] == '2':
list = ast.literal_eval(numbers[-1])
coord = [0,0,0,0]
for cur_name in list:
coord[0] = max(place[cur_name][0], coord[0])
coord[1] += place[cur_name][1] * place[cur_name][2]
coord[2] += place[cur_name][2]
coord[3] += place[cur_name][3]
coord[1] /= coord[2]
coord[0] += 1
global_top = max(global_top, coord[0])
place[name] = coord
for cur_name in list:
plt.plot([coord[1], place[cur_name][1]],[coord[0], place[cur_name][0]])
plt.plot([coord[1] - coord[3], coord[1] + coord[3]], [coord[0], coord[0]])
plt.plot([0], [global_top + 1])
plt.plot([0], [-1])
plt.show()
def convert(input_file):
print(input_file)
tmp_file = open(TMP_FILE, "w")
for line in open(input_file):
numbers = line.split('\t')
numbers2 = numbers[-2].split('_')
if numbers2[-2] == numbers2[-3]:
numbers2[-2] = str(int(numbers2[-2]) + 1)
numbers2[-3] = str(int(numbers2[-3]) + 1)
numbers[-2] = '_'.join(numbers2[1:])
print('\t'.join(numbers), end='', file=tmp_file)
else:
print(line, end='', file=tmp_file)
def main():
cfg = parse_args()
convert(cfg.file)
draw()
if __name__ == '__main__':
main()


@@ -1,61 +0,0 @@
import time
import ast
from datetime import datetime
FILE='data.tsv'
def get_metrix():
data = []
time_to_merge = 0
count_of_parts = 0
max_count_of_parts = 0
parts_in_time = []
last_date = 0
for line in open(FILE):
fields = line.split('\t')
last_date = datetime.strptime(fields[2], '%Y-%m-%d %H:%M:%S')
break
for line in open(FILE):
fields = line.split('\t')
cur_date = datetime.strptime(fields[2], '%Y-%m-%d %H:%M:%S')
if fields[0] == '2':
time_to_merge += int(fields[4])
list = ast.literal_eval(fields[-1])
count_of_parts -= len(list) - 1
else:
count_of_parts += 1
if max_count_of_parts < count_of_parts:
max_count_of_parts = count_of_parts
parts_in_time.append([(cur_date-last_date).total_seconds(), count_of_parts])
last_date = cur_date
stats_parts_in_time = []
global_time = 0
average_parts = 0
for i in range(max_count_of_parts + 1):
stats_parts_in_time.append(0)
for elem in parts_in_time:
stats_parts_in_time[elem[1]] += elem[0]
global_time += elem[0]
average_parts += elem[0] * elem[1]
for i in range(max_count_of_parts + 1):
stats_parts_in_time[i] /= global_time
average_parts /= global_time
return time_to_merge, max_count_of_parts, average_parts, stats_parts_in_time
def main():
time_to_merge, max_parts, average_parts, stats_parts = get_metrix()
print('time_to_merge=', time_to_merge)
print('max_parts=', max_parts)
print('average_parts=', average_parts)
print('stats_parts=', stats_parts)
if __name__ == '__main__':
main()
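
For clarity: average_parts above is a time-weighted mean, average_parts = sum(dt_i * parts_i) / sum(dt_i), where dt_i is the time spent with parts_i active parts, and stats_parts[i] is likewise the fraction of total time during which exactly i parts existed.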


@@ -1,56 +0,0 @@
#!/usr/bin/python3
import sys
import math
import statistics as stat
start = int(sys.argv[1])
end = int(sys.argv[2])
#Copied from dbms/src/Common/HashTable/Hash.h
def intHash32(key, salt = 0):
key ^= salt;
key = (~key) + (key << 18);
key = key ^ ((key >> 31) | (key << 33));
key = key * 21;
key = key ^ ((key >> 11) | (key << 53));
key = key + (key << 6);
key = key ^ ((key >> 22) | (key << 42));
return key & 0xffffffff
#Number of buckets for precision p = 12, m = 2^p
m = 4096
n = start
c = 0
m1 = {}
m2 = {}
l1 = []
l2 = []
while n <= end:
c += 1
h = intHash32(n)
#Extract the leftmost 12 bits
x1 = (h >> 20) & 0xfff
m1[x1] = 1
z1 = m - len(m1)
#Linear counting formula
u1 = int(m * math.log(float(m) / float(z1)))
e1 = abs(100*float(u1 - c)/float(c))
l1.append(e1)
print("%d %d %d %f" % (n, c, u1, e1))
#Extract the rightmost 12 bits
x2 = h & 0xfff
m2[x2] = 1
z2 = m - len(m2)
u2 = int(m * math.log(float(m) / float(z2)))
e2 = abs(100*float(u2 - c)/float(c))
l2.append(e2)
print("%d %d %d %f" % (n, c, u2, e2))
n += 1
print("Left 12 bits error: min=%f max=%f avg=%f median=%f median_low=%f median_high=%f" % (min(l1), max(l1), stat.mean(l1), stat.median(l1), stat.median_low(l1), stat.median_high(l1)))
print("Right 12 bits error: min=%f max=%f avg=%f median=%f median_low=%f median_high=%f" % (min(l2), max(l2), stat.mean(l2), stat.median(l2), stat.median_low(l2), stat.median_high(l2)))


@@ -1,11 +0,0 @@
#!/usr/bin/env bash
for ((p = 2; p <= 10; p++))
do
for ((i = 1; i <= 9; i++))
do
n=$(( 10**p * i ))
echo -n "$n "
clickhouse-client -q "select uniqHLL12(number), uniq(number), uniqCombined(number) from numbers($n);"
done
done
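
For reference when reading this loop's output: uniqHLL12 uses precision 12, i.e. m = 2^12 = 4096 registers, and the theoretical standard relative error of HyperLogLog is about 1.04 / sqrt(m), roughly 1.6% here, so deviations of that order between uniqHLL12(number) and the exact value n are expected.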