Mirror of https://github.com/ClickHouse/ClickHouse.git, synced 2024-11-10 01:25:21 +00:00
Removed obsolete scripts
This commit is contained in: parent e5ca222129, commit 58b26b4279
@@ -1,262 +0,0 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-

import sys
import argparse
import tempfile
import random
import subprocess
import bisect
from copy import deepcopy

# Pseudo-random generator of unique numbers.
# http://preshing.com/20121224/how-to-generate-a-sequence-of-unique-random-integers/
class UniqueRandomGenerator:
    prime = 4294967291

    def __init__(self, seed_base, seed_offset):
        self.index = self.permutePQR(self.permutePQR(seed_base) + 0x682f0161)
        self.intermediate_offset = self.permutePQR(self.permutePQR(seed_offset) + 0x46790905)

    def next(self):
        val = self.permutePQR((self.permutePQR(self.index) + self.intermediate_offset) ^ 0x5bf03635)
        self.index = self.index + 1
        return val

    def permutePQR(self, x):
        if x >= self.prime:
            return x
        else:
            residue = (x * x) % self.prime
            if x <= self.prime / 2:
                return residue
            else:
                return self.prime - residue

# Create a table containing unique values.
def generate_data_source(host, port, http_port, min_cardinality, max_cardinality, count):
    chunk_size = round((max_cardinality - min_cardinality) / float(count))
    used_values = 0

    cur_count = 0
    next_size = 0

    sup = 32768
    n1 = random.randrange(0, sup)
    n2 = random.randrange(0, sup)
    urng = UniqueRandomGenerator(n1, n2)

    is_first = True

    with tempfile.TemporaryDirectory() as tmp_dir:
        filename = tmp_dir + '/table.txt'
        with open(filename, 'w+b') as file_handle:
            while cur_count < count:
                if is_first:
                    is_first = False
                    if min_cardinality != 0:
                        next_size = min_cardinality + 1
                    else:
                        next_size = chunk_size
                else:
                    next_size += chunk_size

                while used_values < next_size:
                    h = urng.next()
                    used_values = used_values + 1
                    out = str(h) + "\t" + str(cur_count) + "\n"
                    file_handle.write(bytes(out, 'UTF-8'))
                cur_count = cur_count + 1

        query = "DROP TABLE IF EXISTS data_source"
        subprocess.check_output(["clickhouse-client", "--host", host, "--port", str(port), "--query", query])
        query = "CREATE TABLE data_source(UserID UInt64, KeyID UInt64) ENGINE=TinyLog"
        subprocess.check_output(["clickhouse-client", "--host", host, "--port", str(port), "--query", query])

        # Stream the generated file into ClickHouse over HTTP (uses the external POST utility).
        cat = subprocess.Popen(("cat", filename), stdout=subprocess.PIPE)
        subprocess.check_output(("POST", "http://{0}:{1}/?query=INSERT INTO data_source FORMAT TabSeparated".format(host, http_port)), stdin=cat.stdout)
        cat.wait()

def perform_query(host, port):
    query  = "SELECT runningAccumulate(uniqExactState(UserID)) AS exact, "
    query += "runningAccumulate(uniqCombinedRawState(UserID)) AS approx "
    query += "FROM data_source GROUP BY KeyID"
    return subprocess.check_output(["clickhouse-client", "--host", host, "--port", port, "--query", query])

def parse_clickhouse_response(response):
    parsed = []
    lines = response.decode().split("\n")
    for cur_line in lines:
        rows = cur_line.split("\t")
        if len(rows) == 2:
            parsed.append([float(rows[0]), float(rows[1])])
    return parsed

def accumulate_data(accumulated_data, data):
    if not accumulated_data:
        accumulated_data = deepcopy(data)
    else:
        for row1, row2 in zip(accumulated_data, data):
            row1[1] += row2[1]
    return accumulated_data

def generate_raw_result(accumulated_data, count):
    expected_tab = []
    bias_tab = []
    for row in accumulated_data:
        exact = row[0]
        expected = row[1] / count
        bias = expected - exact

        expected_tab.append(expected)
        bias_tab.append(bias)
    return [expected_tab, bias_tab]

def generate_sample(raw_estimates, biases, n_samples):
    result = []

    min_card = raw_estimates[0]
    max_card = raw_estimates[len(raw_estimates) - 1]
    step = (max_card - min_card) / (n_samples - 1)

    for i in range(0, n_samples + 1):
        x = min_card + i * step
        j = bisect.bisect_left(raw_estimates, x)

        if j == len(raw_estimates):
            result.append((raw_estimates[j - 1], biases[j - 1]))
        elif raw_estimates[j] == x:
            result.append((raw_estimates[j], biases[j]))
        else:
            # Find the 6 nearest neighbors and take the arithmetic mean.

            # 6 points to the left of x: [j-6 j-5 j-4 j-3 j-2 j-1]
            begin = max(j - 6, 0) - 1
            end = j - 1

            T = []
            for k in range(end, begin, -1):
                T.append(x - raw_estimates[k])

            # 6 points to the right of x: [j j+1 j+2 j+3 j+4 j+5]
            begin = j
            end = min(j + 5, len(raw_estimates) - 1) + 1

            U = []
            for k in range(begin, end):
                U.append(raw_estimates[k] - x)

            # Merge the distances.
            V = []

            lim = min(len(T), len(U))
            k1 = 0
            k2 = 0

            while k1 < lim and k2 < lim:
                if T[k1] == U[k2]:
                    V.append(j - k1 - 1)
                    V.append(j + k2)
                    k1 = k1 + 1
                    k2 = k2 + 1
                elif T[k1] < U[k2]:
                    V.append(j - k1 - 1)
                    k1 = k1 + 1
                else:
                    V.append(j + k2)
                    k2 = k2 + 1

            if k1 < len(T):
                while k1 < len(T):
                    V.append(j - k1 - 1)
                    k1 = k1 + 1
            elif k2 < len(U):
                while k2 < len(U):
                    V.append(j + k2)
                    k2 = k2 + 1

            # Pick the 6 closest points and compute the averages.
            begin = 0
            end = min(len(V), 6)

            sum = 0
            bias = 0
            for k in range(begin, end):
                sum += raw_estimates[V[k]]
                bias += biases[V[k]]
            sum /= float(end)
            bias /= float(end)

            result.append((sum, bias))

    # Skip consecutive results whose estimates are identical.
    final_result = []
    last = -1
    for entry in result:
        if entry[0] != last:
            final_result.append((entry[0], entry[1]))
            last = entry[0]

    return final_result

def dump_arrays(data):
    print("Size of each array: {0}\n".format(len(data)))

    is_first = True
    sep = ''

    print("raw_estimates = ")
    print("{")
    for row in data:
        print("\t{0}{1}".format(sep, row[0]))
        if is_first:
            is_first = False
            sep = ","
    print("};")

    is_first = True
    sep = ""

    print("\nbiases = ")
    print("{")
    for row in data:
        print("\t{0}{1}".format(sep, row[1]))
        if is_first:
            is_first = False
            sep = ","
    print("};")

def start():
    parser = argparse.ArgumentParser(description="Generate bias correction tables for HyperLogLog-based functions.")
    parser.add_argument("-x", "--host", default="localhost", help="ClickHouse server host name")
    parser.add_argument("-p", "--port", type=int, default=9000, help="ClickHouse server TCP port")
    parser.add_argument("-t", "--http_port", type=int, default=8123, help="ClickHouse server HTTP port")
    parser.add_argument("-i", "--iterations", type=int, default=5000, help="number of iterations")
    parser.add_argument("-m", "--min_cardinality", type=int, default=16384, help="minimal cardinality")
    parser.add_argument("-M", "--max_cardinality", type=int, default=655360, help="maximal cardinality")
    parser.add_argument("-s", "--samples", type=int, default=200, help="number of sampled values")
    args = parser.parse_args()

    accumulated_data = []

    for i in range(0, args.iterations):
        print(i + 1)
        sys.stdout.flush()

        generate_data_source(args.host, str(args.port), str(args.http_port), args.min_cardinality, args.max_cardinality, 1000)
        response = perform_query(args.host, str(args.port))
        data = parse_clickhouse_response(response)
        accumulated_data = accumulate_data(accumulated_data, data)

    result = generate_raw_result(accumulated_data, args.iterations)
    sampled_data = generate_sample(result[0], result[1], args.samples)
    dump_arrays(sampled_data)

if __name__ == "__main__": start()
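The raw_estimates and biases arrays that dump_arrays prints are meant to serve as a lookup table for bias correction. A minimal sketch of how such a table is typically consumed (an illustration with made-up names, not code from the ClickHouse sources): interpolate the bias between the two nearest sampled raw estimates and subtract it from the raw estimate.

import bisect

# Illustration only: apply a (raw_estimates, biases) table produced by the script above.
# raw_estimates is assumed sorted in ascending order with distinct values.
def bias_corrected(raw, raw_estimates, biases):
    if raw <= raw_estimates[0]:
        return raw - biases[0]
    if raw >= raw_estimates[-1]:
        return raw - biases[-1]
    j = bisect.bisect_left(raw_estimates, raw)
    x0, x1 = raw_estimates[j - 1], raw_estimates[j]
    b0, b1 = biases[j - 1], biases[j]
    t = (raw - x0) / (x1 - x0)
    return raw - (b0 + t * (b1 - b0))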
@@ -1 +0,0 @@
Hits table generator based on an LSTM neural network trained on real hits. You need to have weights for the model, or to train the model on real hits, to generate data.
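A minimal end-to-end sketch of how these pieces fit together, relying only on the Model class and method signatures defined in the model module below; the file names, iteration count, and checkpoint name are illustrative assumptions.

# Sketch only: drive the hits generator end to end with the Model class below.
# 'hits.tsv', 'save' and 'save/10000_iters' are assumed paths; the model module
# also expects the pickled 'tokens' vocabulary file in the working directory.
from model import Model

gen = Model(learning_rate=0.0001)

# Train on a TSV of real hits and checkpoint into ./save
# (what the training script below does via --data_path/--save_dir/--n_iter).
gen.train('hits.tsv', 'save', num_iters=10000, batch_size=64)

# Restore the trained weights and emit synthetic rows
# (what the generation script below does via -n/--output_file/--weights_path).
gen.generate(100000, 'out.tsv', 'save/10000_iters')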
@@ -1,22 +0,0 @@
import argparse

from model import Model

parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-n', type=int, default=100000,
                    help='number of objects to generate')
parser.add_argument('--output_file', type=str, default='out.tsv',
                    help='output file name')
parser.add_argument('--weights_path', type=str,
                    help='path to weights')

args = parser.parse_args()

if __name__ == '__main__':
    if not args.weights_path:
        raise Exception('please specify path to model weights with --weights_path')

    gen = Model()
    gen.generate(args.n, args.output_file, args.weights_path)
@@ -1,147 +0,0 @@
import numpy as np
import os
import pickle
import tensorflow as tf

from random import sample
from keras.layers import Dense, Embedding
from tqdm import tqdm

RNN_NUM_UNITS = 256
EMB_SIZE = 32
MAX_LENGTH = 1049

# Character vocabulary pickled in the 'tokens' file.
with open('tokens', 'rb') as f:
    tokens = pickle.load(f)
n_tokens = len(tokens)

token_to_id = {c: i for i, c in enumerate(tokens)}


def to_matrix(objects, max_len=None, pad=0, dtype='int32'):
    max_len = max_len or max(map(len, objects))
    matrix = np.zeros([len(objects), max_len], dtype) + pad

    for i in range(len(objects)):
        name_ix = list(map(token_to_id.get, objects[i]))
        matrix[i, :len(name_ix)] = name_ix
    return matrix.T


class Model:
    def __init__(self, learning_rate=0.0001):
        # an embedding layer that converts character ids into embeddings
        self.embed_x = Embedding(n_tokens, EMB_SIZE)
        get_h_next = Dense(1024, activation='relu')
        # a dense layer that maps the current hidden state
        # to probabilities of characters [h_t+1]->P(x_t+1|h_t+1)
        self.get_probas = Dense(n_tokens, activation='softmax')

        self.input_sequence = tf.placeholder('int32', (MAX_LENGTH, None))
        batch_size = tf.shape(self.input_sequence)[1]

        # two-layer recurrent network: a GRU cell followed by an LSTM cell
        self.gru_cell_first = tf.nn.rnn_cell.GRUCell(RNN_NUM_UNITS)
        self.lstm_cell_second = tf.nn.rnn_cell.LSTMCell(RNN_NUM_UNITS)

        h_prev_first = self.gru_cell_first.zero_state(batch_size, dtype=tf.float32)
        h_prev_second = tf.nn.rnn_cell.LSTMStateTuple(
            tf.zeros([batch_size, RNN_NUM_UNITS]),  # initial cell state
            tf.zeros([batch_size, RNN_NUM_UNITS])   # initial hidden state
        )

        predicted_probas = []
        for t in range(MAX_LENGTH):
            x_t = self.input_sequence[t]
            # convert character id into embedding
            x_t_emb = self.embed_x(tf.reshape(x_t, [-1, 1]))[:, 0]

            out_next_first, h_next_first = self.gru_cell_first(x_t_emb, h_prev_first)
            h_prev_first = h_next_first

            out_next_second, h_next_second = self.lstm_cell_second(out_next_first, h_prev_second)
            h_prev_second = h_next_second

            probas_next = self.get_probas(out_next_second)
            predicted_probas.append(probas_next)

        predicted_probas = tf.stack(predicted_probas)

        predictions_matrix = tf.reshape(predicted_probas[:-1], [-1, len(tokens)])
        answers_matrix = tf.one_hot(tf.reshape(self.input_sequence[1:], [-1]), n_tokens)

        # cross-entropy between predicted and actual next characters
        self.loss = tf.reduce_mean(tf.reduce_sum(
            -answers_matrix * tf.log(tf.clip_by_value(predictions_matrix, 1e-7, 1.0)),
            reduction_indices=[1]
        ))
        optimizer = tf.train.AdamOptimizer(learning_rate)
        gvs = optimizer.compute_gradients(self.loss)
        capped_gvs = [(gr if gr is None else tf.clip_by_value(gr, -1., 1.), var) for gr, var in gvs]
        self.optimize = optimizer.apply_gradients(capped_gvs)

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()

    def train(self, train_data_path, save_dir, num_iters, batch_size=64, restore_from=False):
        history = []
        if restore_from:
            with open(restore_from + '_history', 'rb') as f:
                history = pickle.load(f)
            self.saver.restore(self.sess, restore_from)
        with open(train_data_path, 'r') as f:
            train_data = f.readlines()

        # materialize the filter: random.sample needs a sequence
        train_data = list(filter(lambda a: len(a) < MAX_LENGTH, train_data))

        for i in tqdm(range(num_iters)):
            batch = to_matrix(
                list(map(lambda a: '\n' + a.rstrip('\n'), sample(train_data, batch_size))),
                max_len=MAX_LENGTH
            )
            loss_i, _ = self.sess.run([self.loss, self.optimize], {self.input_sequence: batch})
            history.append(loss_i)
            if len(history) % 2000 == 0:
                self.saver.save(self.sess, os.path.join(save_dir, '{}_iters'.format(len(history))))
        self.saver.save(self.sess, os.path.join(save_dir, '{}_iters'.format(len(history))))
        with open(os.path.join(save_dir, '{}_iters_history'.format(len(history))), 'wb') as f:
            pickle.dump(history, f)

    def generate(self, num_objects, output_file, weights_path):
        self.saver.restore(self.sess, weights_path)
        batch_size = num_objects
        x_t = tf.placeholder('int32', (None, batch_size))
        h_t_first = tf.Variable(tf.zeros([batch_size, RNN_NUM_UNITS]))
        h_t_second = tf.nn.rnn_cell.LSTMStateTuple(
            tf.Variable(tf.zeros([batch_size, RNN_NUM_UNITS])),
            tf.Variable(tf.zeros([batch_size, RNN_NUM_UNITS]))
        )

        x_t_emb = self.embed_x(tf.reshape(x_t, [-1, 1]))[:, 0]
        first_out_next, next_h_first = self.gru_cell_first(x_t_emb, h_t_first)
        second_out_next, next_h_second = self.lstm_cell_second(first_out_next, h_t_second)
        next_probs = self.get_probas(second_out_next)

        # every generated sequence starts with a newline token
        x_sequence = np.zeros(shape=(1, batch_size), dtype=int) + token_to_id['\n']
        self.sess.run(
            [tf.assign(h_t_first, h_t_first.initial_value),
             tf.assign(h_t_second[0], h_t_second[0].initial_value),
             tf.assign(h_t_second[1], h_t_second[1].initial_value)]
        )

        for i in tqdm(range(MAX_LENGTH - 1)):
            x_probs, _, _, _ = self.sess.run(
                [next_probs,
                 tf.assign(h_t_second[0], next_h_second[0]),
                 tf.assign(h_t_second[1], next_h_second[1]),
                 tf.assign(h_t_first, next_h_first)],
                {x_t: [x_sequence[-1, :]]}
            )

            next_char = [np.random.choice(n_tokens, p=x_probs[i]) for i in range(batch_size)]
            if sum(next_char) == 0:
                break
            x_sequence = np.append(x_sequence, [next_char], axis=0)

        with open(output_file, 'w') as f:
            f.writelines([''.join([tokens[ix] for ix in x_sequence.T[k]]) + '\n' for k in range(batch_size)])
@@ -1,3 +0,0 @@
Keras==2.0.6
numpy
tensorflow-gpu==1.4.0
@@ -1,506 +0,0 @@
Removed data file 'tokens': a protocol-0 Python pickle of the character vocabulary (a flat list of single-byte tokens) that the model module loads at import time; the raw pickle text is not reproduced here.
@@ -1,26 +0,0 @@
import argparse

from model import Model

parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--n_iter', type=int, default=10000,
                    help='number of iterations')
parser.add_argument('--save_dir', type=str, default='save',
                    help='dir for saving weights')
parser.add_argument('--data_path', type=str,
                    help='path to train data')
parser.add_argument('--learning_rate', type=float, default=0.0001,
                    help='learning rate')
parser.add_argument('--batch_size', type=int, default=64,
                    help='batch size')
parser.add_argument('--restore_from', type=str,
                    help='path to saved weights to restore training from')

args = parser.parse_args()

if __name__ == '__main__':
    if not args.data_path:
        raise Exception('please specify path to train data with --data_path')

    gen = Model(args.learning_rate)
    gen.train(args.data_path, args.save_dir, args.n_iter, args.batch_size, args.restore_from)
@@ -1,150 +0,0 @@
#!/usr/bin/python3.4
# -*- coding: utf-8 -*-

import sys
import argparse
import tempfile
import random
import subprocess
import bisect
from copy import deepcopy

# Pseudo-random generator of unique numbers.
# http://preshing.com/20121224/how-to-generate-a-sequence-of-unique-random-integers/
class UniqueRandomGenerator:
    prime = 4294967291

    def __init__(self, seed_base, seed_offset):
        self.index = self.permutePQR(self.permutePQR(seed_base) + 0x682f0161)
        self.intermediate_offset = self.permutePQR(self.permutePQR(seed_offset) + 0x46790905)

    def next(self):
        val = self.permutePQR((self.permutePQR(self.index) + self.intermediate_offset) ^ 0x5bf03635)
        self.index = self.index + 1
        return val

    def permutePQR(self, x):
        if x >= self.prime:
            return x
        else:
            residue = (x * x) % self.prime
            if x <= self.prime / 2:
                return residue
            else:
                return self.prime - residue

# Create a table containing unique values.
def generate_data_source(host, port, http_port, min_cardinality, max_cardinality, count):
    chunk_size = round((max_cardinality - (min_cardinality + 1)) / float(count))
    used_values = 0

    cur_count = 0
    next_size = 0

    sup = 32768
    n1 = random.randrange(0, sup)
    n2 = random.randrange(0, sup)
    urng = UniqueRandomGenerator(n1, n2)

    is_first = True

    with tempfile.TemporaryDirectory() as tmp_dir:
        filename = tmp_dir + '/table.txt'
        with open(filename, 'w+b') as file_handle:
            while cur_count < count:
                if is_first:
                    is_first = False
                    if min_cardinality != 0:
                        next_size = min_cardinality + 1
                    else:
                        next_size = chunk_size
                else:
                    next_size += chunk_size

                while used_values < next_size:
                    h = urng.next()
                    used_values = used_values + 1
                    out = str(h) + "\t" + str(cur_count) + "\n"
                    file_handle.write(bytes(out, 'UTF-8'))
                cur_count = cur_count + 1

        query = "DROP TABLE IF EXISTS data_source"
        subprocess.check_output(["clickhouse-client", "--host", host, "--port", str(port), "--query", query])
        query = "CREATE TABLE data_source(UserID UInt64, KeyID UInt64) ENGINE=TinyLog"
        subprocess.check_output(["clickhouse-client", "--host", host, "--port", str(port), "--query", query])

        # Stream the generated file into ClickHouse over HTTP (uses the external POST utility).
        cat = subprocess.Popen(("cat", filename), stdout=subprocess.PIPE)
        subprocess.check_output(("POST", "http://{0}:{1}/?query=INSERT INTO data_source FORMAT TabSeparated".format(host, http_port)), stdin=cat.stdout)
        cat.wait()

def perform_query(host, port):
    query  = "SELECT runningAccumulate(uniqExactState(UserID)) AS exact, "
    query += "runningAccumulate(uniqCombinedRawState(UserID)) AS raw, "
    query += "runningAccumulate(uniqCombinedLinearCountingState(UserID)) AS linear_counting, "
    query += "runningAccumulate(uniqCombinedBiasCorrectedState(UserID)) AS bias_corrected "
    query += "FROM data_source GROUP BY KeyID"
    return subprocess.check_output(["clickhouse-client", "--host", host, "--port", port, "--query", query])

def parse_clickhouse_response(response):
    parsed = []
    lines = response.decode().split("\n")
    for cur_line in lines:
        rows = cur_line.split("\t")
        if len(rows) == 4:
            parsed.append([float(rows[0]), float(rows[1]), float(rows[2]), float(rows[3])])
    return parsed

def accumulate_data(accumulated_data, data):
    if not accumulated_data:
        accumulated_data = deepcopy(data)
    else:
        for row1, row2 in zip(accumulated_data, data):
            row1[1] += row2[1]
            row1[2] += row2[2]
            row1[3] += row2[3]
    return accumulated_data

def dump_graphs(data, count):
    with open("raw_graph.txt", "w+b") as fh1, open("linear_counting_graph.txt", "w+b") as fh2, open("bias_corrected_graph.txt", "w+b") as fh3:
        for row in data:
            exact = row[0]
            raw = row[1] / count
            linear_counting = row[2] / count
            bias_corrected = row[3] / count

            outstr = "{0}\t{1}\n".format(exact, abs(raw - exact) / exact)
            fh1.write(bytes(outstr, 'UTF-8'))

            outstr = "{0}\t{1}\n".format(exact, abs(linear_counting - exact) / exact)
            fh2.write(bytes(outstr, 'UTF-8'))

            outstr = "{0}\t{1}\n".format(exact, abs(bias_corrected - exact) / exact)
            fh3.write(bytes(outstr, 'UTF-8'))

def start():
    parser = argparse.ArgumentParser(description="Generate graphs that help to determine the linear counting threshold.")
    parser.add_argument("-x", "--host", default="localhost", help="clickhouse host name")
    parser.add_argument("-p", "--port", type=int, default=9000, help="clickhouse client TCP port")
    parser.add_argument("-t", "--http_port", type=int, default=8123, help="clickhouse HTTP port")
    parser.add_argument("-i", "--iterations", type=int, default=5000, help="number of iterations")
    parser.add_argument("-m", "--min_cardinality", type=int, default=16384, help="minimal cardinality")
    parser.add_argument("-M", "--max_cardinality", type=int, default=655360, help="maximal cardinality")
    args = parser.parse_args()

    accumulated_data = []

    for i in range(0, args.iterations):
        print(i + 1)
        sys.stdout.flush()

        generate_data_source(args.host, str(args.port), str(args.http_port), args.min_cardinality, args.max_cardinality, 1000)
        response = perform_query(args.host, str(args.port))
        data = parse_clickhouse_response(response)
        accumulated_data = accumulate_data(accumulated_data, data)

    dump_graphs(accumulated_data, args.iterations)

if __name__ == "__main__": start()
@@ -1,10 +0,0 @@
#!/usr/bin/env bash

for (( i = 0; i < 1000; i++ )); do
    if (( RANDOM % 10 )); then
        clickhouse-client --port=9007 --query="INSERT INTO mt (x) SELECT rand64() AS x FROM system.numbers LIMIT 100000"
    else
        clickhouse-client --port=9007 --query="INSERT INTO mt (x) SELECT rand64() AS x FROM system.numbers LIMIT 300000"
    fi
done
@@ -1,76 +0,0 @@
from __future__ import print_function

import argparse
import matplotlib.pyplot as plt
import ast

TMP_FILE = 'tmp.tsv'

def parse_args():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-f', '--file', default='data.tsv')
    cfg = parser.parse_args()
    return cfg

def draw():
    place = dict()
    max_coord = 0
    global_top = 0
    # first pass: lay out the initial parts (event type '1') along the bottom row
    for line in open(TMP_FILE):
        numbers = line.split('\t')
        if len(numbers) <= 2:
            continue
        name = numbers[-2]
        if numbers[0] == '1':
            dx = int(numbers[3])
            max_coord += dx
            place[name] = [1, max_coord, 1, dx]
            max_coord += dx
            plt.plot([max_coord - 2 * dx, max_coord], [1, 1])
    # second pass: draw merged parts (event type '2') one level above their sources
    for line in open(TMP_FILE):
        numbers = line.split('\t')
        if len(numbers) <= 2:
            continue
        name = numbers[-2]
        if numbers[0] == '2':
            parts = ast.literal_eval(numbers[-1])
            coord = [0, 0, 0, 0]
            for cur_name in parts:
                coord[0] = max(place[cur_name][0], coord[0])
                coord[1] += place[cur_name][1] * place[cur_name][2]
                coord[2] += place[cur_name][2]
                coord[3] += place[cur_name][3]
            coord[1] /= coord[2]
            coord[0] += 1
            global_top = max(global_top, coord[0])
            place[name] = coord
            for cur_name in parts:
                plt.plot([coord[1], place[cur_name][1]], [coord[0], place[cur_name][0]])
            plt.plot([coord[1] - coord[3], coord[1] + coord[3]], [coord[0], coord[0]])
    plt.plot([0], [global_top + 1])
    plt.plot([0], [-1])
    plt.show()

def convert(input_file):
    print(input_file)
    with open(TMP_FILE, "w") as tmp_file:
        for line in open(input_file):
            numbers = line.split('\t')
            numbers2 = numbers[-2].split('_')
            if numbers2[-2] == numbers2[-3]:
                numbers2[-2] = str(int(numbers2[-2]) + 1)
                numbers2[-3] = str(int(numbers2[-3]) + 1)
                numbers[-2] = '_'.join(numbers2[1:])
                print('\t'.join(numbers), end='', file=tmp_file)
            else:
                print(line, end='', file=tmp_file)

def main():
    cfg = parse_args()
    convert(cfg.file)
    draw()

if __name__ == '__main__':
    main()
@@ -1,61 +0,0 @@
import ast
from datetime import datetime

FILE = 'data.tsv'

def get_metrics():
    time_to_merge = 0
    count_of_parts = 0
    max_count_of_parts = 0
    parts_in_time = []
    last_date = 0
    # take the timestamp of the first event as the starting point
    for line in open(FILE):
        fields = line.split('\t')
        last_date = datetime.strptime(fields[2], '%Y-%m-%d %H:%M:%S')
        break

    for line in open(FILE):
        fields = line.split('\t')
        cur_date = datetime.strptime(fields[2], '%Y-%m-%d %H:%M:%S')
        if fields[0] == '2':
            # merge event: several source parts are replaced by one
            time_to_merge += int(fields[4])
            parts = ast.literal_eval(fields[-1])
            count_of_parts -= len(parts) - 1
        else:
            count_of_parts += 1

        if max_count_of_parts < count_of_parts:
            max_count_of_parts = count_of_parts

        parts_in_time.append([(cur_date - last_date).total_seconds(), count_of_parts])
        last_date = cur_date

    stats_parts_in_time = []
    global_time = 0
    average_parts = 0
    for i in range(max_count_of_parts + 1):
        stats_parts_in_time.append(0)

    for elem in parts_in_time:
        stats_parts_in_time[elem[1]] += elem[0]
        global_time += elem[0]
        average_parts += elem[0] * elem[1]

    # normalize every bucket (including the last one) to a fraction of total time
    for i in range(len(stats_parts_in_time)):
        stats_parts_in_time[i] /= global_time
    average_parts /= global_time

    return time_to_merge, max_count_of_parts, average_parts, stats_parts_in_time

def main():
    time_to_merge, max_parts, average_parts, stats_parts = get_metrics()
    print('time_to_merge=', time_to_merge)
    print('max_parts=', max_parts)
    print('average_parts=', average_parts)
    print('stats_parts=', stats_parts)

if __name__ == '__main__':
    main()
@@ -1,56 +0,0 @@
#!/usr/bin/python3
import sys
import math
import statistics as stat

start = int(sys.argv[1])
end = int(sys.argv[2])

# Copied from dbms/src/Common/HashTable/Hash.h
def intHash32(key, salt=0):
    key ^= salt

    key = (~key) + (key << 18)
    key = key ^ ((key >> 31) | (key << 33))
    key = key * 21
    key = key ^ ((key >> 11) | (key << 53))
    key = key + (key << 6)
    key = key ^ ((key >> 22) | (key << 42))

    return key & 0xffffffff

# Number of buckets for precision p = 12, m = 2^p
m = 4096
n = start
c = 0
m1 = {}
m2 = {}
l1 = []
l2 = []
while n <= end:
    c += 1

    h = intHash32(n)
    # Extract the leftmost 12 bits
    x1 = (h >> 20) & 0xfff
    m1[x1] = 1
    z1 = m - len(m1)
    # Linear counting formula
    u1 = int(m * math.log(float(m) / float(z1)))
    e1 = abs(100 * float(u1 - c) / float(c))
    l1.append(e1)
    print("%d %d %d %f" % (n, c, u1, e1))

    # Extract the rightmost 12 bits
    x2 = h & 0xfff
    m2[x2] = 1
    z2 = m - len(m2)
    u2 = int(m * math.log(float(m) / float(z2)))
    e2 = abs(100 * float(u2 - c) / float(c))
    l2.append(e2)
    print("%d %d %d %f" % (n, c, u2, e2))

    n += 1

print("Left 12 bits error: min=%f max=%f avg=%f median=%f median_low=%f median_high=%f" % (min(l1), max(l1), stat.mean(l1), stat.median(l1), stat.median_low(l1), stat.median_high(l1)))
print("Right 12 bits error: min=%f max=%f avg=%f median=%f median_low=%f median_high=%f" % (min(l2), max(l2), stat.mean(l2), stat.median(l2), stat.median_low(l2), stat.median_high(l2)))
@@ -1,11 +0,0 @@
#!/usr/bin/env bash

for ((p = 2; p <= 10; p++))
do
    for ((i = 1; i <= 9; i++))
    do
        n=$(( 10**p * i ))
        echo -n "$n "
        clickhouse-client -q "select uniqHLL12(number), uniq(number), uniqCombined(number) from numbers($n);"
    done
done