diff --git a/dbms/scripts/gen_benchmark_data/generate.py b/dbms/scripts/gen_benchmark_data/generate.py
new file mode 100644
index 00000000000..b54651fe1b1
--- /dev/null
+++ b/dbms/scripts/gen_benchmark_data/generate.py
@@ -0,0 +1,22 @@
+import argparse
+
+from model import Model
+parser = argparse.ArgumentParser(
+    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('-n', type=int, default=100000,
+                    help='number of objects to generate')
+parser.add_argument('--output_file', type=str, default='out.tsv',
+                    help='output file name')
+parser.add_argument('--weights_path', type=str,
+                    help='path to weights')
+
+
+args = parser.parse_args()
+
+if __name__ == '__main__':
+    if not args.weights_path:
+        raise Exception('please specify path to model weights with --weights_path')
+
+    gen = Model()
+    gen.generate(args.n, args.output_file, args.weights_path)
+
diff --git a/dbms/scripts/gen_benchmark_data/model.py b/dbms/scripts/gen_benchmark_data/model.py
new file mode 100644
index 00000000000..3e2ec9c4942
--- /dev/null
+++ b/dbms/scripts/gen_benchmark_data/model.py
@@ -0,0 +1,147 @@
+import numpy as np
+import os
+import pickle
+import tensorflow as tf
+
+from random import sample
+from keras.layers import Dense, Embedding
+from tqdm import tqdm
+
+RNN_NUM_UNITS = 256
+EMB_SIZE = 32
+MAX_LENGTH = 1049
+
+
+with open('tokens', 'rb') as f:
+    tokens = pickle.load(f)
+n_tokens = len(tokens)
+
+token_to_id = {c: i for i, c in enumerate(tokens)}
+
+
+def to_matrix(objects, max_len=None, pad=0, dtype='int32'):
+    max_len = max_len or max(map(len, objects))
+    matrix = np.zeros([len(objects), max_len], dtype) + pad
+
+    for i in range(len(objects)):
+        name_ix = list(map(token_to_id.get, objects[i]))
+        matrix[i, :len(name_ix)] = name_ix
+    return matrix.T
+
+
+class Model:
+    def __init__(self, learning_rate=0.0001):
+        # an embedding layer that converts character ids into embeddings
+        self.embed_x = Embedding(n_tokens, EMB_SIZE)
+        get_h_next = Dense(1024, activation='relu')
+        # a dense layer that maps the current hidden state
+        # to probabilities of characters [h_t+1]->P(x_t+1|h_t+1)
+        self.get_probas = Dense(n_tokens, activation='softmax')
+
+        self.input_sequence = tf.placeholder('int32', (MAX_LENGTH, None))
+        batch_size = tf.shape(self.input_sequence)[1]
+
+        self.gru_cell_first = tf.nn.rnn_cell.GRUCell(RNN_NUM_UNITS)
+        self.lstm_cell_second = tf.nn.rnn_cell.LSTMCell(RNN_NUM_UNITS)
+
+        h_prev_first = self.gru_cell_first.zero_state(batch_size, dtype=tf.float32)
+        h_prev_second = tf.nn.rnn_cell.LSTMStateTuple(
+            tf.zeros([batch_size, RNN_NUM_UNITS]),  # initial cell state
+            tf.zeros([batch_size, RNN_NUM_UNITS])   # initial hidden state
+        )
+
+        predicted_probas = []
+        for t in range(MAX_LENGTH):
+            x_t = self.input_sequence[t]
+            # convert character id into embedding
+            x_t_emb = self.embed_x(tf.reshape(x_t, [-1, 1]))[:, 0]
+
+            out_next_first, h_next_first = self.gru_cell_first(x_t_emb, h_prev_first)
+            h_prev_first = h_next_first
+
+            out_next_second, h_next_second = self.lstm_cell_second(out_next_first, h_prev_second)
+            h_prev_second = h_next_second
+
+            probas_next = self.get_probas(out_next_second)
+            predicted_probas.append(probas_next)
+
+        predicted_probas = tf.stack(predicted_probas)
+
+        predictions_matrix = tf.reshape(predicted_probas[:-1], [-1, len(tokens)])
+        answers_matrix = tf.one_hot(tf.reshape(self.input_sequence[1:], [-1]), n_tokens)
+
+        self.loss = tf.reduce_mean(tf.reduce_sum(
+            -answers_matrix * tf.log(tf.clip_by_value(predictions_matrix, 1e-7, 1.0)),
+            reduction_indices=[1]
+        ))
+
+        optimizer = tf.train.AdamOptimizer(learning_rate)
+        gvs = optimizer.compute_gradients(self.loss)
+        capped_gvs = [(gr if gr is None else tf.clip_by_value(gr, -1., 1.), var) for gr, var in gvs]
+        self.optimize = optimizer.apply_gradients(capped_gvs)
+
+        self.sess = tf.Session()
+        self.sess.run(tf.global_variables_initializer())
+        self.saver = tf.train.Saver()
+
+    def train(self, train_data_path, save_dir, num_iters, batch_size=64, restore_from=False):
+        history = []
+        if restore_from:
+            with open(restore_from + '_history', 'rb') as f:
+                history = pickle.load(f)
+            self.saver.restore(self.sess, restore_from)
+        with open(train_data_path, 'r') as f:
+            train_data = f.readlines()
+
+        train_data = list(filter(lambda a: len(a) < MAX_LENGTH, train_data))
+
+        for i in tqdm(range(num_iters)):
+            batch = to_matrix(
+                list(map(lambda a: '\n' + a.rstrip('\n'), sample(train_data, batch_size))),
+                max_len=MAX_LENGTH
+            )
+            loss_i, _ = self.sess.run([self.loss, self.optimize], {self.input_sequence: batch})
+            history.append(loss_i)
+            if len(history) % 2000 == 0:
+                self.saver.save(self.sess, os.path.join(save_dir, '{}_iters'.format(len(history))))
+        self.saver.save(self.sess, os.path.join(save_dir, '{}_iters'.format(len(history))))
+        with open(os.path.join(save_dir, '{}_iters_history'.format(len(history))), 'wb') as f:
+            pickle.dump(history, f)
+
+    def generate(self, num_objects, output_file, weights_path):
+        self.saver.restore(self.sess, weights_path)
+        batch_size = num_objects
+        x_t = tf.placeholder('int32', (None, batch_size))
+        h_t_first = tf.Variable(tf.zeros([batch_size, RNN_NUM_UNITS]))
+        h_t_second = tf.nn.rnn_cell.LSTMStateTuple(
+            tf.Variable(tf.zeros([batch_size, RNN_NUM_UNITS])),
+            tf.Variable(tf.zeros([batch_size, RNN_NUM_UNITS]))
+        )
+
+        x_t_emb = self.embed_x(tf.reshape(x_t, [-1, 1]))[:, 0]
+        first_out_next, next_h_first = self.gru_cell_first(x_t_emb, h_t_first)
+        second_out_next, next_h_second = self.lstm_cell_second(first_out_next, h_t_second)
+        next_probs = self.get_probas(second_out_next)
+
+        x_sequence = np.zeros(shape=(1, batch_size), dtype=int) + token_to_id['\n']
+        self.sess.run(
+            [tf.assign(h_t_first, h_t_first.initial_value),
+             tf.assign(h_t_second[0], h_t_second[0].initial_value),
+             tf.assign(h_t_second[1], h_t_second[1].initial_value)]
+        )
+
+        for i in tqdm(range(MAX_LENGTH - 1)):
+            x_probs, _, _, _ = self.sess.run(
+                [next_probs,
+                 tf.assign(h_t_second[0], next_h_second[0]),
+                 tf.assign(h_t_second[1], next_h_second[1]),
+                 tf.assign(h_t_first, next_h_first)],
+                {x_t: [x_sequence[-1, :]]}
+            )
+
+            next_char = [np.random.choice(n_tokens, p=x_probs[i]) for i in range(batch_size)]
+            if sum(next_char) == 0:
+                break
+            x_sequence = np.append(x_sequence, [next_char], axis=0)
+
+        with open(output_file, 'w') as f:
+            f.writelines([''.join([tokens[ix] for ix in x_sequence.T[k]]) + '\n' for k in range(batch_size)])
diff --git a/dbms/scripts/gen_benchmark_data/requirements.txt b/dbms/scripts/gen_benchmark_data/requirements.txt
new file mode 100644
index 00000000000..b02bc51fee1
--- /dev/null
+++ b/dbms/scripts/gen_benchmark_data/requirements.txt
@@ -0,0 +1,4 @@
+Keras==2.0.6
+numpy
+tensorflow-gpu==1.4.0
+tqdm
\ No newline at end of file
diff --git a/dbms/scripts/gen_benchmark_data/tokens b/dbms/scripts/gen_benchmark_data/tokens
new file mode 100644
index 00000000000..f80b0dd4208
--- /dev/null
+++ b/dbms/scripts/gen_benchmark_data/tokens
@@ -0,0 +1,506 @@
+(lp0
+S'\x83'
+p1
+aS'\x04'
+p2
+aS'\x87'
+p3
+aS'\x8b'
+p4
+aS'\x8f'
+p5
+aS'\x10'
+p6
+aS'\x93'
+p7
+aS'\x14'
+p8
+aS'\x97'
+p9
+aS'\x18'
+p10
+aS'\x9b'
+p11
+aS'\x1c'
+p12
+aS'\x9f' +p13 +aS' ' +p14 +aS'\xa3' +p15 +aS'$' +p16 +aS'\xa7' +p17 +aS'(' +p18 +aS'\xab' +p19 +aS',' +p20 +aS'\xaf' +p21 +aS'0' +p22 +aS'\xb3' +p23 +aS'4' +p24 +aS'\xb7' +p25 +aS'8' +p26 +aS'\xbb' +p27 +aS'<' +p28 +aS'\xbf' +p29 +aS'@' +p30 +aS'\xc3' +p31 +aS'D' +p32 +aS'\xc7' +p33 +aS'H' +p34 +aS'\xcb' +p35 +aS'L' +p36 +aS'\xcf' +p37 +aS'P' +p38 +aS'\xd3' +p39 +aS'T' +p40 +aS'\xd7' +p41 +aS'X' +p42 +aS'\xdb' +p43 +aS'\\' +p44 +aS'\xdf' +p45 +aS'`' +p46 +aS'\xe3' +p47 +aS'd' +p48 +aS'\xe7' +p49 +aS'h' +p50 +aS'\xeb' +p51 +aS'l' +p52 +aS'\xef' +p53 +aS'p' +p54 +aS'\xf3' +p55 +aS't' +p56 +aS'\xf7' +p57 +aS'x' +p58 +aS'\xfb' +p59 +aS'|' +p60 +aS'\xff' +p61 +aS'\x80' +p62 +aS'\x03' +p63 +aS'\x84' +p64 +aS'\x07' +p65 +aS'\x88' +p66 +aS'\x0b' +p67 +aS'\x8c' +p68 +aS'\x0f' +p69 +aS'\x90' +p70 +aS'\x13' +p71 +aS'\x94' +p72 +aS'\x17' +p73 +aS'\x98' +p74 +aS'\x1b' +p75 +aS'\x9c' +p76 +aS'\x1f' +p77 +aS'\xa0' +p78 +aS'#' +p79 +aS'\xa4' +p80 +aS"'" +p81 +aS'\xa8' +p82 +aS'+' +p83 +aS'\xac' +p84 +aS'/' +p85 +aS'\xb0' +p86 +aS'3' +p87 +aS'\xb4' +p88 +aS'7' +p89 +aS'\xb8' +p90 +aS';' +p91 +aS'\xbc' +p92 +aS'?' +p93 +aS'\xc0' +p94 +aS'C' +p95 +aS'\xc4' +p96 +aS'G' +p97 +aS'\xc8' +p98 +aS'K' +p99 +aS'\xcc' +p100 +aS'O' +p101 +aS'\xd0' +p102 +aS'S' +p103 +aS'\xd4' +p104 +aS'W' +p105 +aS'\xd8' +p106 +aS'[' +p107 +aS'\xdc' +p108 +aS'_' +p109 +aS'\xe0' +p110 +aS'c' +p111 +aS'\xe4' +p112 +aS'g' +p113 +aS'\xe8' +p114 +aS'k' +p115 +aS'\xec' +p116 +aS'o' +p117 +aS'\xf0' +p118 +aS's' +p119 +aS'\xf4' +p120 +aS'w' +p121 +aS'\xf8' +p122 +aS'{' +p123 +aS'\xfc' +p124 +aS'\x7f' +p125 +aS'\x81' +p126 +aS'\x02' +p127 +aS'\x85' +p128 +aS'\x06' +p129 +aS'\x89' +p130 +aS'\n' +p131 +aS'\x8d' +p132 +aS'\x0e' +p133 +aS'\x91' +p134 +aS'\x12' +p135 +aS'\x95' +p136 +aS'\x16' +p137 +aS'\x99' +p138 +aS'\x1a' +p139 +aS'\x9d' +p140 +aS'\x1e' +p141 +aS'\xa1' +p142 +aS'"' +p143 +aS'\xa5' +p144 +aS'&' +p145 +aS'\xa9' +p146 +aS'*' +p147 +aS'\xad' +p148 +aS'.' +p149 +aS'\xb1' +p150 +aS'2' +p151 +aS'\xb5' +p152 +aS'6' +p153 +aS'\xb9' +p154 +aS':' +p155 +aS'\xbd' +p156 +aS'>' +p157 +aS'\xc1' +p158 +aS'B' +p159 +aS'\xc5' +p160 +aS'F' +p161 +aS'\xc9' +p162 +aS'J' +p163 +aS'\xcd' +p164 +aS'N' +p165 +aS'\xd1' +p166 +aS'R' +p167 +aS'\xd5' +p168 +aS'V' +p169 +aS'\xd9' +p170 +aS'Z' +p171 +aS'\xdd' +p172 +aS'^' +p173 +aS'\xe1' +p174 +aS'b' +p175 +aS'\xe5' +p176 +aS'f' +p177 +aS'\xe9' +p178 +aS'j' +p179 +aS'\xed' +p180 +aS'n' +p181 +aS'\xf1' +p182 +aS'r' +p183 +aS'\xf5' +p184 +aS'v' +p185 +aS'\xf9' +p186 +aS'z' +p187 +aS'\xfd' +p188 +aS'~' +p189 +aS'\x01' +p190 +aS'\x82' +p191 +aS'\x05' +p192 +aS'\x86' +p193 +aS'\t' +p194 +aS'\x8a' +p195 +aS'\x8e' +p196 +aS'\x11' +p197 +aS'\x92' +p198 +aS'\x15' +p199 +aS'\x96' +p200 +aS'\x19' +p201 +aS'\x9a' +p202 +aS'\x1d' +p203 +aS'\x9e' +p204 +aS'!' +p205 +aS'\xa2' +p206 +aS'%' +p207 +aS'\xa6' +p208 +aS')' +p209 +aS'\xaa' +p210 +aS'-' +p211 +aS'\xae' +p212 +aS'1' +p213 +aS'\xb2' +p214 +aS'5' +p215 +aS'\xb6' +p216 +aS'9' +p217 +aS'\xba' +p218 +aS'=' +p219 +aS'\xbe' +p220 +aS'A' +p221 +aS'\xc2' +p222 +aS'E' +p223 +aS'\xc6' +p224 +aS'I' +p225 +aS'\xca' +p226 +aS'M' +p227 +aS'\xce' +p228 +aS'Q' +p229 +aS'\xd2' +p230 +aS'U' +p231 +aS'\xd6' +p232 +aS'Y' +p233 +aS'\xda' +p234 +aS']' +p235 +aS'\xde' +p236 +aS'a' +p237 +aS'\xe2' +p238 +aS'e' +p239 +aS'\xe6' +p240 +aS'i' +p241 +aS'\xea' +p242 +aS'm' +p243 +aS'\xee' +p244 +aS'q' +p245 +aS'\xf2' +p246 +aS'u' +p247 +aS'\xf6' +p248 +aS'y' +p249 +aS'\xfa' +p250 +aS'}' +p251 +aS'\xfe' +p252 +a. 
\ No newline at end of file
diff --git a/dbms/scripts/gen_benchmark_data/train.py b/dbms/scripts/gen_benchmark_data/train.py
new file mode 100644
index 00000000000..fd93805f50e
--- /dev/null
+++ b/dbms/scripts/gen_benchmark_data/train.py
@@ -0,0 +1,26 @@
+import argparse
+
+from model import Model
+parser = argparse.ArgumentParser(
+    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--n_iter', type=int, default=10000,
+                    help='number of iterations')
+parser.add_argument('--save_dir', type=str, default='save',
+                    help='dir for saving weights')
+parser.add_argument('--data_path', type=str,
+                    help='path to training data')
+parser.add_argument('--learning_rate', type=float, default=0.0001,
+                    help='learning rate')
+parser.add_argument('--batch_size', type=int, default=64,
+                    help='batch size')
+parser.add_argument('--restore_from', type=str,
+                    help='path to saved weights to restore training from')
+
+args = parser.parse_args()
+
+if __name__ == '__main__':
+    if not args.data_path:
+        raise Exception('please specify path to training data with --data_path')
+
+    gen = Model(args.learning_rate)
+    gen.train(args.data_path, args.save_dir, args.n_iter, args.batch_size, args.restore_from)
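
A minimal usage sketch, not taken from the scripts themselves: the flags come from the argparse definitions above, the checkpoint path follows the '{}_iters' naming used in Model.train, and train_data.tsv is a hypothetical input file with one object per line. model.py loads the bundled 'tokens' file from the current working directory, so the commands assume they are run from dbms/scripts/gen_benchmark_data:

    pip install -r requirements.txt
    python train.py --data_path train_data.tsv --save_dir save --n_iter 10000
    python generate.py -n 100000 --weights_path save/10000_iters --output_file out.tsv

Given the pinned tensorflow-gpu==1.4.0 and the protocol-0 pickle of byte strings in 'tokens', these scripts appear to target Python 2.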