# ClickHouse/dbms/scripts/gen_benchmark_data/model.py

import numpy as np
import os
import pickle
import tensorflow as tf

from random import sample
from keras.layers import Dense, Embedding
from tqdm import tqdm

RNN_NUM_UNITS = 256
EMB_SIZE = 32
MAX_LENGTH = 1049

with open('tokens', 'rb') as f:
    tokens = pickle.load(f)

n_tokens = len(tokens)
token_to_id = {c: i for i, c in enumerate(tokens)}


def to_matrix(objects, max_len=None, pad=0, dtype='int32'):
    # Pack a list of token sequences into a padded integer matrix and
    # transpose it to time-major layout: [max_len, len(objects)].
    max_len = max_len or max(map(len, objects))
    matrix = np.zeros([len(objects), max_len], dtype) + pad
    for i in range(len(objects)):
        name_ix = list(map(token_to_id.get, objects[i]))
        matrix[i, :len(name_ix)] = name_ix
    return matrix.T
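
# A hypothetical example (actual ids depend on the pickled 'tokens' file):
# to_matrix(['ab', 'abc']) would return an int32 array of shape (3, 2), with each
# column holding one object's token ids, padded with zeros at the end.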


class Model:
    def __init__(self, learning_rate=0.0001):
        # an embedding layer that converts character ids into embeddings
        self.embed_x = Embedding(n_tokens, EMB_SIZE)
        get_h_next = Dense(1024, activation='relu')
        # a dense layer that maps the current hidden state
        # to probabilities of characters [h_t+1] -> P(x_t+1 | h_t+1)
        self.get_probas = Dense(n_tokens, activation='softmax')

        # time-major batch of token ids: [MAX_LENGTH, batch_size]
        self.input_sequence = tf.placeholder('int32', (MAX_LENGTH, None))
        batch_size = tf.shape(self.input_sequence)[1]

        self.gru_cell_first = tf.nn.rnn_cell.GRUCell(RNN_NUM_UNITS)
        self.lstm_cell_second = tf.nn.rnn_cell.LSTMCell(RNN_NUM_UNITS)

        h_prev_first = self.gru_cell_first.zero_state(batch_size, dtype=tf.float32)
        h_prev_second = tf.nn.rnn_cell.LSTMStateTuple(
            tf.zeros([batch_size, RNN_NUM_UNITS]),  # initial cell state
            tf.zeros([batch_size, RNN_NUM_UNITS])   # initial hidden state
        )

        # RNN unrolled by hand: GRU -> LSTM -> softmax over the next character
        predicted_probas = []
        for t in range(MAX_LENGTH):
            x_t = self.input_sequence[t]
            # convert character id into embedding
            x_t_emb = self.embed_x(tf.reshape(x_t, [-1, 1]))[:, 0]

            out_next_first, h_next_first = self.gru_cell_first(x_t_emb, h_prev_first)
            h_prev_first = h_next_first

            out_next_second, h_next_second = self.lstm_cell_second(out_next_first, h_prev_second)
            h_prev_second = h_next_second

            probas_next = self.get_probas(out_next_second)
            predicted_probas.append(probas_next)

        predicted_probas = tf.stack(predicted_probas)

        # cross-entropy between the predicted distributions and the actual next characters
        predictions_matrix = tf.reshape(predicted_probas[:-1], [-1, len(tokens)])
        answers_matrix = tf.one_hot(tf.reshape(self.input_sequence[1:], [-1]), n_tokens)

        self.loss = tf.reduce_mean(tf.reduce_sum(
            -answers_matrix * tf.log(tf.clip_by_value(predictions_matrix, 1e-7, 1.0)),
            reduction_indices=[1]
        ))

        # Adam with element-wise gradient clipping to [-1, 1]
        optimizer = tf.train.AdamOptimizer(learning_rate)
        gvs = optimizer.compute_gradients(self.loss)
        capped_gvs = [(gr if gr is None else tf.clip_by_value(gr, -1., 1.), var) for gr, var in gvs]
        self.optimize = optimizer.apply_gradients(capped_gvs)

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()

    def train(self, train_data_path, save_dir, num_iters, batch_size=64, restore_from=False):
        history = []
        if restore_from:
            with open(restore_from + '_history', 'rb') as f:
                history = pickle.load(f)
            self.saver.restore(self.sess, restore_from)

        with open(train_data_path, 'r') as f:
            train_data = f.readlines()
        # keep only lines that fit into the unrolled sequence length
        train_data = list(filter(lambda a: len(a) < MAX_LENGTH, train_data))

        for i in tqdm(range(num_iters)):
            batch = to_matrix(
                list(map(lambda a: '\n' + a.rstrip('\n'), sample(train_data, batch_size))),
                max_len=MAX_LENGTH
            )
            loss_i, _ = self.sess.run([self.loss, self.optimize], {self.input_sequence: batch})
            history.append(loss_i)
            if len(history) % 2000 == 0:
                self.saver.save(self.sess, os.path.join(save_dir, '{}_iters'.format(len(history))))

        self.saver.save(self.sess, os.path.join(save_dir, '{}_iters'.format(len(history))))
        with open(os.path.join(save_dir, '{}_iters_history'.format(len(history))), 'wb') as f:
            pickle.dump(history, f)

    def generate(self, num_objects, output_file, weights_path):
        self.saver.restore(self.sess, weights_path)
        batch_size = num_objects

        # build a single-step sampling graph that keeps the recurrent state in variables
        x_t = tf.placeholder('int32', (None, batch_size))
        h_t_first = tf.Variable(tf.zeros([batch_size, RNN_NUM_UNITS]))
        h_t_second = tf.nn.rnn_cell.LSTMStateTuple(
            tf.Variable(tf.zeros([batch_size, RNN_NUM_UNITS])),
            tf.Variable(tf.zeros([batch_size, RNN_NUM_UNITS]))
        )

        x_t_emb = self.embed_x(tf.reshape(x_t, [-1, 1]))[:, 0]
        first_out_next, next_h_first = self.gru_cell_first(x_t_emb, h_t_first)
        second_out_next, next_h_second = self.lstm_cell_second(first_out_next, h_t_second)
        next_probs = self.get_probas(second_out_next)

        # every sequence starts with a newline token; reset the recurrent state
        x_sequence = np.zeros(shape=(1, batch_size), dtype=int) + token_to_id['\n']
        self.sess.run(
            [tf.assign(h_t_first, h_t_first.initial_value),
             tf.assign(h_t_second[0], h_t_second[0].initial_value),
             tf.assign(h_t_second[1], h_t_second[1].initial_value)]
        )

        for i in tqdm(range(MAX_LENGTH - 1)):
            # feed the last sampled characters, advance the state and get the next distribution
            x_probs, _, _, _ = self.sess.run(
                [next_probs,
                 tf.assign(h_t_second[0], next_h_second[0]),
                 tf.assign(h_t_second[1], next_h_second[1]),
                 tf.assign(h_t_first, next_h_first)],
                {x_t: [x_sequence[-1, :]]}
            )

            next_char = [np.random.choice(n_tokens, p=x_probs[i]) for i in range(batch_size)]
            if sum(next_char) == 0:
                break
            x_sequence = np.append(x_sequence, [next_char], axis=0)

        with open(output_file, 'w') as f:
            f.writelines([''.join([tokens[ix] for ix in x_sequence.T[k]]) + '\n' for k in range(batch_size)])
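

# A hypothetical usage sketch (file names and iteration counts are illustrative,
# not part of the original script): train on a newline-separated query log, then
# sample new queries from the latest checkpoint.
#
#     model = Model(learning_rate=0.0001)
#     model.train('train_data.txt', 'checkpoints', num_iters=10000)
#     model.generate(num_objects=100, output_file='generated.txt',
#                    weights_path='checkpoints/10000_iters')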