ClickHouse/tests/external_models/catboost/test_apply_catboost_model/test.py
2020-10-02 19:54:07 +03:00

295 lines
9.5 KiB
Python

from helpers.server_with_models import ClickHouseServerWithCatboostModels
from helpers.generate import generate_uniform_string_column, generate_uniform_float_column, generate_uniform_int_column
from helpers.train import train_catboost_model
import os
import numpy as np
from pandas import DataFrame
# Port handed to every ClickHouseServerWithCatboostModels instance below;
# overridable through the CLICKHOUSE_TESTS_PORT environment variable.
PORT = int(os.getenv('CLICKHOUSE_TESTS_PORT', '9000'))

# Path of the ClickHouse binary handed to the test server helper; overridable
# through the CLICKHOUSE_TESTS_SERVER_BIN_PATH environment variable.
CLICKHOUSE_TESTS_SERVER_BIN_PATH = os.getenv(
    'CLICKHOUSE_TESTS_SERVER_BIN_PATH',
    '/usr/bin/clickhouse',
)
def add_noise_to_target(target, seed, threshold=0.05):
    """Return a copy of a 0/1 target with a pseudo-random subset of labels flipped.

    One value per target element is drawn with
    ``generate_uniform_float_column(len(target), 0., 1., seed + 1)``; every
    position whose draw is below ``threshold`` has its label replaced by
    ``1 - label``, all other positions are kept unchanged.

    :param target: array-like of labels; the arithmetic assumes 0/1 values
        that support element-wise ``*`` and ``-`` (presumably a numpy array).
    :param seed: base seed; ``seed + 1`` is what is passed to the generator.
    :param threshold: draws strictly below this value trigger a flip.
    :return: the element-wise combination of kept and flipped labels.
    """
    draws = generate_uniform_float_column(len(target), 0., 1., seed + 1)
    flip_mask = draws < threshold
    keep_mask = 1 - flip_mask

    kept_part = target * keep_mask
    flipped_part = (1 - target) * flip_mask
    return kept_part + flipped_part
def check_predictions(test_name, target, pred_python, pred_ch, acc_threshold):
    """Check that ClickHouse and Python predictions agree and are accurate enough.

    Both prediction arrays are cast to ``int`` class labels and must be
    element-wise identical. The accuracy of the (shared) labels against
    ``target`` must then reach ``acc_threshold``.

    :param test_name: label printed in front of the accuracy value.
    :param target: expected class labels (array-like).
    :param pred_python: numpy array of labels predicted by the Python model.
    :param pred_ch: numpy array of labels predicted through ClickHouse.
    :param acc_threshold: minimum accepted fraction of correct labels.
    :raises Exception: if the two prediction arrays differ.
    :raises AssertionError: if the accuracy is below ``acc_threshold``.
    """
    ch_class = pred_ch.astype(int)
    python_class = pred_python.astype(int)
    if not np.array_equal(ch_class, python_class):
        raise Exception('Got different results:\npython:\n' + str(python_class) + '\nClickHouse:\n' + str(ch_class))

    # Accuracy is the fraction of positions whose label matches exactly.
    # Summing |prediction - target| is only equivalent for 0/1 labels; with
    # multiclass labels (0, 1, 2, ...) a single miss could count as several
    # errors and understate the accuracy.
    acc = 1 - np.mean(ch_class != np.asarray(target))

    # Print before asserting so the measured value is visible on failure too.
    print(test_name, 'accuracy: {:.10f}'.format(acc))
    assert acc >= acc_threshold, \
        'accuracy {:.10f} is below threshold {}'.format(acc, acc_threshold)
def test_apply_float_features_only():
    """Binary model on three float features: ClickHouse must match Python.

    Generates train/test frames with float columns ``a``, ``b`` and ``c``,
    derives a 0/1 target from a fixed rule, trains a small CatBoost Logloss
    model, applies it both in Python and through a ClickHouse server and
    compares the resulting classes with ``check_predictions`` (accuracy
    threshold 0.9).
    """
    name = 'test_apply_float_features_only'

    train_size = 10000
    test_size = 10000

    def gen_data(size, seed):
        """Build a frame of ``size`` rows; each column uses its own seed offset."""
        data = {
            'a': generate_uniform_float_column(size, 0., 1., seed + 1),
            'b': generate_uniform_float_column(size, 0., 1., seed + 2),
            'c': generate_uniform_float_column(size, 0., 1., seed + 3)
        }
        return DataFrame.from_dict(data)

    def get_target(df):
        """Return the rule-based 0/1 label of every row as a numpy array."""
        def target_filter(row):
            return 1 if (row['a'] > .3 and row['b'] > .3) or (row['c'] < .4 and row['a'] * row['b'] > 0.1) else 0

        # Series.as_matrix() was removed in pandas 1.0; .values returns the
        # same numpy array and is available in old and new pandas alike.
        return df.apply(target_filter, axis=1).values

    # Different seeds keep the train and test sets distinct.
    train_df = gen_data(train_size, 42)
    test_df = gen_data(test_size, 43)

    train_target = get_target(train_df)
    test_target = get_target(test_df)

    print()
    print('train target', train_target)
    print('test target', test_target)

    params = {
        'iterations': 4,
        'depth': 2,
        'learning_rate': 1,
        'loss_function': 'Logloss'
    }

    # No categorical features in this test, hence the empty list.
    model = train_catboost_model(train_df, train_target, [], params)
    pred_python = model.predict(test_df)

    server = ClickHouseServerWithCatboostModels(name, CLICKHOUSE_TESTS_SERVER_BIN_PATH, PORT)
    server.add_model(name, model)
    with server:
        # Positive model output is mapped to class 1, everything else to 0
        # (presumably the server returns raw scores -- confirm in the helper).
        pred_ch = (np.array(server.apply_model(name, test_df, [])) > 0).astype(int)

    print('python predictions', pred_python)
    print('clickhouse predictions', pred_ch)

    check_predictions(name, test_target, pred_python, pred_ch, 0.9)
def test_apply_float_features_with_string_cat_features():
    """Binary model on float plus string categorical features.

    Columns ``a`` and ``b`` are floats, ``c`` and ``d`` are strings and are
    passed to training as categorical features. The CatBoost Logloss model is
    applied in Python and through a ClickHouse server; the resulting classes
    are compared with ``check_predictions`` (accuracy threshold 0.9).
    """
    name = 'test_apply_float_features_with_string_cat_features'

    train_size = 10000
    test_size = 10000

    def gen_data(size, seed):
        """Build a frame of ``size`` rows; each column uses its own seed offset."""
        data = {
            'a': generate_uniform_float_column(size, 0., 1., seed + 1),
            'b': generate_uniform_float_column(size, 0., 1., seed + 2),
            'c': generate_uniform_string_column(size, ['a', 'b', 'c'], seed + 3),
            'd': generate_uniform_string_column(size, ['e', 'f', 'g'], seed + 4)
        }
        return DataFrame.from_dict(data)

    def get_target(df):
        """Return the rule-based 0/1 label of every row as a numpy array."""
        def target_filter(row):
            return 1 if (row['a'] > .3 and row['b'] > .3 and row['c'] != 'a') \
                or (row['a'] * row['b'] > 0.1 and row['c'] != 'b' and row['d'] != 'e') else 0

        # Series.as_matrix() was removed in pandas 1.0; .values returns the
        # same numpy array and is available in old and new pandas alike.
        return df.apply(target_filter, axis=1).values

    # Different seeds keep the train and test sets distinct.
    train_df = gen_data(train_size, 42)
    test_df = gen_data(test_size, 43)

    train_target = get_target(train_df)
    test_target = get_target(test_df)

    print()
    print('train target', train_target)
    print('test target', test_target)

    params = {
        'iterations': 6,
        'depth': 2,
        'learning_rate': 1,
        'loss_function': 'Logloss'
    }

    model = train_catboost_model(train_df, train_target, ['c', 'd'], params)
    pred_python = model.predict(test_df)

    server = ClickHouseServerWithCatboostModels(name, CLICKHOUSE_TESTS_SERVER_BIN_PATH, PORT)
    server.add_model(name, model)
    with server:
        # NOTE(review): the third argument is an empty list although the model
        # was trained with ['c', 'd'] as categorical features -- confirm
        # against the apply_model helper that this is intended.
        # Positive model output is mapped to class 1, everything else to 0.
        pred_ch = (np.array(server.apply_model(name, test_df, [])) > 0).astype(int)

    print('python predictions', pred_python)
    print('clickhouse predictions', pred_ch)

    check_predictions(name, test_target, pred_python, pred_ch, 0.9)
def test_apply_float_features_with_int_cat_features():
    """Binary model on float plus integer categorical features.

    Columns ``a`` and ``b`` are floats, ``c`` and ``d`` are integers and are
    passed to training as categorical features. The CatBoost Logloss model is
    applied in Python and through a ClickHouse server; the resulting classes
    are compared with ``check_predictions`` (accuracy threshold 0.9).
    """
    name = 'test_apply_float_features_with_int_cat_features'

    train_size = 10000
    test_size = 10000

    def gen_data(size, seed):
        """Build a frame of ``size`` rows; each column uses its own seed offset."""
        data = {
            'a': generate_uniform_float_column(size, 0., 1., seed + 1),
            'b': generate_uniform_float_column(size, 0., 1., seed + 2),
            'c': generate_uniform_int_column(size, 1, 4, seed + 3),
            'd': generate_uniform_int_column(size, 1, 4, seed + 4)
        }
        return DataFrame.from_dict(data)

    def get_target(df):
        """Return the rule-based 0/1 label of every row as a numpy array."""
        def target_filter(row):
            return 1 if (row['a'] > .3 and row['b'] > .3 and row['c'] != 1) \
                or (row['a'] * row['b'] > 0.1 and row['c'] != 2 and row['d'] != 3) else 0

        # Series.as_matrix() was removed in pandas 1.0; .values returns the
        # same numpy array and is available in old and new pandas alike.
        return df.apply(target_filter, axis=1).values

    # Different seeds keep the train and test sets distinct.
    train_df = gen_data(train_size, 42)
    test_df = gen_data(test_size, 43)

    train_target = get_target(train_df)
    test_target = get_target(test_df)

    print()
    print('train target', train_target)
    print('test target', test_target)

    params = {
        'iterations': 6,
        'depth': 4,
        'learning_rate': 1,
        'loss_function': 'Logloss'
    }

    model = train_catboost_model(train_df, train_target, ['c', 'd'], params)
    pred_python = model.predict(test_df)

    server = ClickHouseServerWithCatboostModels(name, CLICKHOUSE_TESTS_SERVER_BIN_PATH, PORT)
    server.add_model(name, model)
    with server:
        # NOTE(review): the third argument is an empty list although the model
        # was trained with ['c', 'd'] as categorical features -- confirm
        # against the apply_model helper that this is intended.
        # Positive model output is mapped to class 1, everything else to 0.
        pred_ch = (np.array(server.apply_model(name, test_df, [])) > 0).astype(int)

    print('python predictions', pred_python)
    print('clickhouse predictions', pred_ch)

    check_predictions(name, test_target, pred_python, pred_ch, 0.9)
def test_apply_float_features_with_mixed_cat_features():
    """Binary model on float plus mixed (string and integer) categorical features.

    Columns ``a`` and ``b`` are floats, ``c`` is a string column and ``d`` an
    integer column; ``c`` and ``d`` are passed to training as categorical
    features. The CatBoost Logloss model is applied in Python and through a
    ClickHouse server; the resulting classes are compared with
    ``check_predictions`` (accuracy threshold 0.9).
    """
    name = 'test_apply_float_features_with_mixed_cat_features'

    train_size = 10000
    test_size = 10000

    def gen_data(size, seed):
        """Build a frame of ``size`` rows; each column uses its own seed offset."""
        data = {
            'a': generate_uniform_float_column(size, 0., 1., seed + 1),
            'b': generate_uniform_float_column(size, 0., 1., seed + 2),
            'c': generate_uniform_string_column(size, ['a', 'b', 'c'], seed + 3),
            'd': generate_uniform_int_column(size, 1, 4, seed + 4)
        }
        return DataFrame.from_dict(data)

    def get_target(df):
        """Return the rule-based 0/1 label of every row as a numpy array."""
        def target_filter(row):
            return 1 if (row['a'] > .3 and row['b'] > .3 and row['c'] != 'a') \
                or (row['a'] * row['b'] > 0.1 and row['c'] != 'b' and row['d'] != 2) else 0

        # Series.as_matrix() was removed in pandas 1.0; .values returns the
        # same numpy array and is available in old and new pandas alike.
        return df.apply(target_filter, axis=1).values

    # Different seeds keep the train and test sets distinct.
    train_df = gen_data(train_size, 42)
    test_df = gen_data(test_size, 43)

    train_target = get_target(train_df)
    test_target = get_target(test_df)

    print()
    print('train target', train_target)
    print('test target', test_target)

    params = {
        'iterations': 6,
        'depth': 4,
        'learning_rate': 1,
        'loss_function': 'Logloss'
    }

    model = train_catboost_model(train_df, train_target, ['c', 'd'], params)
    pred_python = model.predict(test_df)

    server = ClickHouseServerWithCatboostModels(name, CLICKHOUSE_TESTS_SERVER_BIN_PATH, PORT)
    server.add_model(name, model)
    with server:
        # NOTE(review): the third argument is an empty list although the model
        # was trained with ['c', 'd'] as categorical features -- confirm
        # against the apply_model helper that this is intended.
        # Positive model output is mapped to class 1, everything else to 0.
        pred_ch = (np.array(server.apply_model(name, test_df, [])) > 0).astype(int)

    print('python predictions', pred_python)
    print('clickhouse predictions', pred_ch)

    check_predictions(name, test_target, pred_python, pred_ch, 0.9)
def test_apply_multiclass():
    """Three-class model on float plus mixed categorical features.

    Columns ``a`` and ``b`` are floats, ``c`` is a string column and ``d`` an
    integer column; ``c`` and ``d`` are passed to training as categorical
    features. A CatBoost MultiClass model is trained on labels 0/1/2, applied
    in Python and through a ClickHouse server, and the resulting classes are
    compared with ``check_predictions`` (accuracy threshold 0.9).
    """
    # Use this test's own name for the server and the model; the previous
    # value was copy-pasted from the mixed-cat-features test, so both tests
    # shared one server/model name.
    name = 'test_apply_multiclass'

    train_size = 10000
    test_size = 10000

    def gen_data(size, seed):
        """Build a frame of ``size`` rows; each column uses its own seed offset."""
        data = {
            'a': generate_uniform_float_column(size, 0., 1., seed + 1),
            'b': generate_uniform_float_column(size, 0., 1., seed + 2),
            'c': generate_uniform_string_column(size, ['a', 'b', 'c'], seed + 3),
            'd': generate_uniform_int_column(size, 1, 4, seed + 4)
        }
        return DataFrame.from_dict(data)

    def get_target(df):
        """Return the rule-based class (0, 1 or 2) of every row as a numpy array."""
        def target_filter(row):
            if row['a'] > .3 and row['b'] > .3 and row['c'] != 'a':
                return 0
            elif row['a'] * row['b'] > 0.1 and row['c'] != 'b' and row['d'] != 2:
                return 1
            else:
                return 2

        # Series.as_matrix() was removed in pandas 1.0; .values returns the
        # same numpy array and is available in old and new pandas alike.
        return df.apply(target_filter, axis=1).values

    # Different seeds keep the train and test sets distinct.
    train_df = gen_data(train_size, 42)
    test_df = gen_data(test_size, 43)

    train_target = get_target(train_df)
    test_target = get_target(test_df)

    print()
    print('train target', train_target)
    print('test target', test_target)

    params = {
        'iterations': 10,
        'depth': 4,
        'learning_rate': 1,
        'loss_function': 'MultiClass'
    }

    model = train_catboost_model(train_df, train_target, ['c', 'd'], params)
    # The first column of the 2-D predict() output is taken as the class label
    # (assumes predict() returns one label column per row -- TODO confirm for
    # the CatBoost version in use).
    pred_python = model.predict(test_df)[:, 0].astype(int)

    server = ClickHouseServerWithCatboostModels(name, CLICKHOUSE_TESTS_SERVER_BIN_PATH, PORT)
    server.add_model(name, model)
    with server:
        # NOTE(review): the third argument is an empty list although the model
        # was trained with ['c', 'd'] as categorical features -- confirm
        # against the apply_model helper that this is intended.
        # The predicted class is the index of the largest value in each row.
        pred_ch = np.argmax(np.array(server.apply_model(name, test_df, [])), axis=1)

    print('python predictions', pred_python)
    print('clickhouse predictions', pred_ch)

    check_predictions(name, test_target, pred_python, pred_ch, 0.9)