mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-14 03:25:15 +00:00
97f2a2213e
* Move some code outside dbms/src folder * Fix paths
295 lines
9.5 KiB
Python
295 lines
9.5 KiB
Python
from helpers.server_with_models import ClickHouseServerWithCatboostModels
|
|
from helpers.generate import generate_uniform_string_column, generate_uniform_float_column, generate_uniform_int_column
|
|
from helpers.train import train_catboost_model
|
|
import os
|
|
import numpy as np
|
|
from pandas import DataFrame
|
|
|
|
|
|
# Port passed to ClickHouseServerWithCatboostModels by every test below;
# taken from the CLICKHOUSE_TESTS_PORT environment variable, default 9000.
PORT = int(os.getenv('CLICKHOUSE_TESTS_PORT', '9000'))

# Path of the server binary passed to ClickHouseServerWithCatboostModels;
# taken from CLICKHOUSE_TESTS_SERVER_BIN_PATH, default /usr/bin/clickhouse.
CLICKHOUSE_TESTS_SERVER_BIN_PATH = os.getenv('CLICKHOUSE_TESTS_SERVER_BIN_PATH', '/usr/bin/clickhouse')
|
|
|
|
|
|
def add_noise_to_target(target, seed, threshold=0.05):
    """Replace some entries of ``target`` with ``1 - target``.

    One value per target entry is drawn with
    ``generate_uniform_float_column(len(target), 0., 1., seed + 1)``; the
    entries whose drawn value is below ``threshold`` are flipped, the others
    are returned unchanged.

    :param target: array of labels (presumably 0/1 -- confirm against callers).
    :param seed: base seed; ``seed + 1`` is forwarded to the generator.
    :param threshold: cut-off below which an entry is flipped.
    :return: ``target`` with the selected entries replaced by ``1 - target``.
    """
    noise = generate_uniform_float_column(len(target), 0., 1., seed + 1)
    flip = noise < threshold
    keep = 1 - flip
    return target * keep + (1 - target) * flip
|
|
|
|
|
|
def check_predictions(test_name, target, pred_python, pred_ch, acc_threshold):
    """Verify that ClickHouse and Python predictions agree and are accurate.

    Both prediction arrays are cast to ``int`` and must be equal element by
    element. The accuracy of the (shared) predicted classes against ``target``
    must then reach ``acc_threshold``.

    :param test_name: label printed in front of the accuracy value.
    :param target: expected class labels, one per prediction.
    :param pred_python: numpy array of predictions computed in Python.
    :param pred_ch: numpy array of predictions computed by ClickHouse.
    :param acc_threshold: minimal accepted accuracy (fraction of exact matches).
    :raises Exception: if the two prediction arrays differ.
    :raises AssertionError: if the accuracy is below ``acc_threshold``.
    """
    ch_class = pred_ch.astype(int)
    python_class = pred_python.astype(int)
    if not np.array_equal(ch_class, python_class):
        raise Exception('Got different results:\npython:\n' + str(python_class) + '\nClickHouse:\n' + str(ch_class))

    # Accuracy is the share of exactly matching labels. Summing absolute label
    # differences is only equivalent for 0/1 labels; with more classes a single
    # miss could be counted more than once.
    acc = np.sum(ch_class == np.array(target)) / float(len(target))

    # Written as "not >=" so that a NaN accuracy (empty target) also fails.
    if not acc >= acc_threshold:
        raise AssertionError(
            '{} accuracy {:.10f} is below the threshold {}'.format(test_name, acc, acc_threshold))

    # A single pre-formatted argument prints the same text on Python 2 and 3.
    print('{} accuracy: {:.10f}'.format(test_name, acc))
|
|
|
|
|
|
def test_apply_float_features_only():
    """Train a CatBoost model on three float features and compare predictions.

    The model is trained with ``train_catboost_model`` (no categorical
    features), applied to the test frame both in Python and through
    ``ClickHouseServerWithCatboostModels``, and the two results are compared
    with ``check_predictions`` using an accuracy threshold of 0.9.
    """
    name = 'test_apply_float_features_only'

    train_size = 10000
    test_size = 10000

    def gen_data(size, seed):
        """Build a frame with float columns 'a', 'b' and 'c'."""
        data = {
            'a': generate_uniform_float_column(size, 0., 1., seed + 1),
            'b': generate_uniform_float_column(size, 0., 1., seed + 2),
            'c': generate_uniform_float_column(size, 0., 1., seed + 3)
        }
        return DataFrame.from_dict(data)

    def get_target(df):
        """Compute the 0/1 label of every row of ``df``."""
        def target_filter(row):
            return 1 if (row['a'] > .3 and row['b'] > .3) or (row['c'] < .4 and row['a'] * row['b'] > 0.1) else 0

        # Series.as_matrix() is deprecated and removed in pandas 1.0;
        # .values returns the same numpy array.
        return df.apply(target_filter, axis=1).values

    train_df = gen_data(train_size, 42)
    test_df = gen_data(test_size, 43)

    train_target = get_target(train_df)
    test_target = get_target(test_df)

    # Single-argument print calls emit the same text on Python 2 and 3.
    print('')
    print('train target {}'.format(train_target))
    print('test target {}'.format(test_target))

    params = {
        'iterations': 4,
        'depth': 2,
        'learning_rate': 1,
        'loss_function': 'Logloss'
    }

    model = train_catboost_model(train_df, train_target, [], params)
    pred_python = model.predict(test_df)

    server = ClickHouseServerWithCatboostModels(name, CLICKHOUSE_TESTS_SERVER_BIN_PATH, PORT)
    server.add_model(name, model)
    with server:
        # Positive ClickHouse outputs are mapped to class 1, the rest to 0
        # (presumably raw model scores -- confirm against the helper).
        pred_ch = (np.array(server.apply_model(name, test_df, [])) > 0).astype(int)

    print('python predictions {}'.format(pred_python))
    print('clickhouse predictions {}'.format(pred_ch))

    check_predictions(name, test_target, pred_python, pred_ch, 0.9)
|
|
|
|
|
|
def test_apply_float_features_with_string_cat_features():
    """Train a CatBoost model on float and string features and compare predictions.

    Columns 'c' and 'd' hold strings and are passed to ``train_catboost_model``
    as categorical features. The model is applied to the test frame both in
    Python and through ``ClickHouseServerWithCatboostModels``, and the two
    results are compared with ``check_predictions`` (accuracy threshold 0.9).
    """
    name = 'test_apply_float_features_with_string_cat_features'

    train_size = 10000
    test_size = 10000

    def gen_data(size, seed):
        """Build a frame with float columns 'a', 'b' and string columns 'c', 'd'."""
        data = {
            'a': generate_uniform_float_column(size, 0., 1., seed + 1),
            'b': generate_uniform_float_column(size, 0., 1., seed + 2),
            'c': generate_uniform_string_column(size, ['a', 'b', 'c'], seed + 3),
            'd': generate_uniform_string_column(size, ['e', 'f', 'g'], seed + 4)
        }
        return DataFrame.from_dict(data)

    def get_target(df):
        """Compute the 0/1 label of every row of ``df``."""
        def target_filter(row):
            return 1 if (row['a'] > .3 and row['b'] > .3 and row['c'] != 'a') \
                or (row['a'] * row['b'] > 0.1 and row['c'] != 'b' and row['d'] != 'e') else 0

        # Series.as_matrix() is deprecated and removed in pandas 1.0;
        # .values returns the same numpy array.
        return df.apply(target_filter, axis=1).values

    train_df = gen_data(train_size, 42)
    test_df = gen_data(test_size, 43)

    train_target = get_target(train_df)
    test_target = get_target(test_df)

    # Single-argument print calls emit the same text on Python 2 and 3.
    print('')
    print('train target {}'.format(train_target))
    print('test target {}'.format(test_target))

    params = {
        'iterations': 6,
        'depth': 2,
        'learning_rate': 1,
        'loss_function': 'Logloss'
    }

    model = train_catboost_model(train_df, train_target, ['c', 'd'], params)
    pred_python = model.predict(test_df)

    server = ClickHouseServerWithCatboostModels(name, CLICKHOUSE_TESTS_SERVER_BIN_PATH, PORT)
    server.add_model(name, model)
    with server:
        # Positive ClickHouse outputs are mapped to class 1, the rest to 0
        # (presumably raw model scores -- confirm against the helper).
        pred_ch = (np.array(server.apply_model(name, test_df, [])) > 0).astype(int)

    print('python predictions {}'.format(pred_python))
    print('clickhouse predictions {}'.format(pred_ch))

    check_predictions(name, test_target, pred_python, pred_ch, 0.9)
|
|
|
|
|
|
def test_apply_float_features_with_int_cat_features():
    """Train a CatBoost model on float and integer features and compare predictions.

    Columns 'c' and 'd' hold integers and are passed to
    ``train_catboost_model`` as categorical features. The model is applied to
    the test frame both in Python and through
    ``ClickHouseServerWithCatboostModels``, and the two results are compared
    with ``check_predictions`` (accuracy threshold 0.9).
    """
    name = 'test_apply_float_features_with_int_cat_features'

    train_size = 10000
    test_size = 10000

    def gen_data(size, seed):
        """Build a frame with float columns 'a', 'b' and integer columns 'c', 'd'."""
        data = {
            'a': generate_uniform_float_column(size, 0., 1., seed + 1),
            'b': generate_uniform_float_column(size, 0., 1., seed + 2),
            'c': generate_uniform_int_column(size, 1, 4, seed + 3),
            'd': generate_uniform_int_column(size, 1, 4, seed + 4)
        }
        return DataFrame.from_dict(data)

    def get_target(df):
        """Compute the 0/1 label of every row of ``df``."""
        def target_filter(row):
            return 1 if (row['a'] > .3 and row['b'] > .3 and row['c'] != 1) \
                or (row['a'] * row['b'] > 0.1 and row['c'] != 2 and row['d'] != 3) else 0

        # Series.as_matrix() is deprecated and removed in pandas 1.0;
        # .values returns the same numpy array.
        return df.apply(target_filter, axis=1).values

    train_df = gen_data(train_size, 42)
    test_df = gen_data(test_size, 43)

    train_target = get_target(train_df)
    test_target = get_target(test_df)

    # Single-argument print calls emit the same text on Python 2 and 3.
    print('')
    print('train target {}'.format(train_target))
    print('test target {}'.format(test_target))

    params = {
        'iterations': 6,
        'depth': 4,
        'learning_rate': 1,
        'loss_function': 'Logloss'
    }

    model = train_catboost_model(train_df, train_target, ['c', 'd'], params)
    pred_python = model.predict(test_df)

    server = ClickHouseServerWithCatboostModels(name, CLICKHOUSE_TESTS_SERVER_BIN_PATH, PORT)
    server.add_model(name, model)
    with server:
        # Positive ClickHouse outputs are mapped to class 1, the rest to 0
        # (presumably raw model scores -- confirm against the helper).
        pred_ch = (np.array(server.apply_model(name, test_df, [])) > 0).astype(int)

    print('python predictions {}'.format(pred_python))
    print('clickhouse predictions {}'.format(pred_ch))

    check_predictions(name, test_target, pred_python, pred_ch, 0.9)
|
|
|
|
|
|
def test_apply_float_features_with_mixed_cat_features():
    """Train a CatBoost model on float, string and integer features and compare predictions.

    Column 'c' holds strings and column 'd' integers; both are passed to
    ``train_catboost_model`` as categorical features. The model is applied to
    the test frame both in Python and through
    ``ClickHouseServerWithCatboostModels``, and the two results are compared
    with ``check_predictions`` (accuracy threshold 0.9).
    """
    name = 'test_apply_float_features_with_mixed_cat_features'

    train_size = 10000
    test_size = 10000

    def gen_data(size, seed):
        """Build a frame with float columns 'a', 'b', string column 'c' and integer column 'd'."""
        data = {
            'a': generate_uniform_float_column(size, 0., 1., seed + 1),
            'b': generate_uniform_float_column(size, 0., 1., seed + 2),
            'c': generate_uniform_string_column(size, ['a', 'b', 'c'], seed + 3),
            'd': generate_uniform_int_column(size, 1, 4, seed + 4)
        }
        return DataFrame.from_dict(data)

    def get_target(df):
        """Compute the 0/1 label of every row of ``df``."""
        def target_filter(row):
            return 1 if (row['a'] > .3 and row['b'] > .3 and row['c'] != 'a') \
                or (row['a'] * row['b'] > 0.1 and row['c'] != 'b' and row['d'] != 2) else 0

        # Series.as_matrix() is deprecated and removed in pandas 1.0;
        # .values returns the same numpy array.
        return df.apply(target_filter, axis=1).values

    train_df = gen_data(train_size, 42)
    test_df = gen_data(test_size, 43)

    train_target = get_target(train_df)
    test_target = get_target(test_df)

    # Single-argument print calls emit the same text on Python 2 and 3.
    print('')
    print('train target {}'.format(train_target))
    print('test target {}'.format(test_target))

    params = {
        'iterations': 6,
        'depth': 4,
        'learning_rate': 1,
        'loss_function': 'Logloss'
    }

    model = train_catboost_model(train_df, train_target, ['c', 'd'], params)
    pred_python = model.predict(test_df)

    server = ClickHouseServerWithCatboostModels(name, CLICKHOUSE_TESTS_SERVER_BIN_PATH, PORT)
    server.add_model(name, model)
    with server:
        # Positive ClickHouse outputs are mapped to class 1, the rest to 0
        # (presumably raw model scores -- confirm against the helper).
        pred_ch = (np.array(server.apply_model(name, test_df, [])) > 0).astype(int)

    print('python predictions {}'.format(pred_python))
    print('clickhouse predictions {}'.format(pred_ch))

    check_predictions(name, test_target, pred_python, pred_ch, 0.9)
|
|
|
|
|
|
def test_apply_multiclass():
    """Train a three-class CatBoost model and compare predictions.

    Rows are labelled 0, 1 or 2 and the model is trained with the
    'MultiClass' loss, with columns 'c' and 'd' passed as categorical
    features. The model is applied to the test frame both in Python and
    through ``ClickHouseServerWithCatboostModels``, and the two results are
    compared with ``check_predictions`` (accuracy threshold 0.9).
    """
    # Own name for this test: the previous value was copied from
    # test_apply_float_features_with_mixed_cat_features, so both tests shared
    # one server/model name and the printed accuracy line was mislabelled.
    name = 'test_apply_multiclass'

    train_size = 10000
    test_size = 10000

    def gen_data(size, seed):
        """Build a frame with float columns 'a', 'b', string column 'c' and integer column 'd'."""
        data = {
            'a': generate_uniform_float_column(size, 0., 1., seed + 1),
            'b': generate_uniform_float_column(size, 0., 1., seed + 2),
            'c': generate_uniform_string_column(size, ['a', 'b', 'c'], seed + 3),
            'd': generate_uniform_int_column(size, 1, 4, seed + 4)
        }
        return DataFrame.from_dict(data)

    def get_target(df):
        """Compute the class label (0, 1 or 2) of every row of ``df``."""
        def target_filter(row):
            if row['a'] > .3 and row['b'] > .3 and row['c'] != 'a':
                return 0
            elif row['a'] * row['b'] > 0.1 and row['c'] != 'b' and row['d'] != 2:
                return 1
            else:
                return 2

        # Series.as_matrix() is deprecated and removed in pandas 1.0;
        # .values returns the same numpy array.
        return df.apply(target_filter, axis=1).values

    train_df = gen_data(train_size, 42)
    test_df = gen_data(test_size, 43)

    train_target = get_target(train_df)
    test_target = get_target(test_df)

    # Single-argument print calls emit the same text on Python 2 and 3.
    print('')
    print('train target {}'.format(train_target))
    print('test target {}'.format(test_target))

    params = {
        'iterations': 10,
        'depth': 4,
        'learning_rate': 1,
        'loss_function': 'MultiClass'
    }

    model = train_catboost_model(train_df, train_target, ['c', 'd'], params)
    # Column 0 of the predict() output is taken as the class label
    # (presumably an (n, 1) array of labels -- confirm for the catboost version in use).
    pred_python = model.predict(test_df)[:, 0].astype(int)

    server = ClickHouseServerWithCatboostModels(name, CLICKHOUSE_TESTS_SERVER_BIN_PATH, PORT)
    server.add_model(name, model)
    with server:
        # The predicted class is the index of the largest value in each output row.
        pred_ch = np.argmax(np.array(server.apply_model(name, test_df, [])), axis=1)

    print('python predictions {}'.format(pred_python))
    print('clickhouse predictions {}'.format(pred_ch))

    check_predictions(name, test_target, pred_python, pred_ch, 0.9)
|