mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-22 23:52:03 +00:00
Merge pull request #29362 from ClickHouse/remove-obsolete-non-automated-tests
Remove obsolete non-automated tests
This commit is contained in:
commit
7eddf2664e
@ -1,17 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Clone CatBoost next to this script, build the model-evaluation shared
# library and the Python package with `ya make`, and symlink both results
# into the script directory.

# Abort on the first failing step: without this a failed clone or build
# would still "succeed" and leave dangling symlinks behind.
set -euo pipefail

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

cd "${DIR}"
# Allow re-running the script: cloning into an existing checkout fails.
if [ ! -d "${DIR}/catboost/.git" ]; then
    git clone https://github.com/catboost/catboost.git
fi

# Shared library used for model evaluation.
cd "${DIR}/catboost/catboost/libs/model_interface"
../../../ya make -r -o "${DIR}/build/lib" -j4
cd "${DIR}"
ln -sfn "${DIR}/build/lib/catboost/libs/model_interface/libcatboostmodel.so" libcatboostmodel.so

# Python package, built against the local Python rather than the bundled one.
cd "${DIR}/catboost/catboost/python-package/catboost"
../../../ya make -r -DUSE_ARCADIA_PYTHON=no -DOS_SDK=local -DPYTHON_CONFIG=python2-config -j4
cd "${DIR}"
# -n: replace an existing directory symlink instead of creating a link inside it.
ln -sfn "${DIR}/catboost/catboost/python-package" python-package
|
|
@ -1,42 +0,0 @@
|
|||||||
import subprocess
|
|
||||||
import threading
|
|
||||||
import os
|
|
||||||
|
|
||||||
|
|
||||||
class ClickHouseClient:
    """Run queries by spawning ``<binary_path> client`` as a subprocess."""

    def __init__(self, binary_path, port):
        """Remember the clickhouse binary and the TCP port to connect to."""
        self.binary_path = binary_path
        self.port = port

    def query(self, query, timeout=10, pipe=None):
        """Execute ``query`` and return the raw stdout of the client process.

        Args:
            query: query text passed to the client with ``-q``.
            timeout: seconds to wait before the client process is killed.
            pipe: optional file object used as the client's stdin
                (e.g. data for an ``insert``); a fresh pipe is used if None.

        Returns:
            Whatever the client wrote to stdout, exactly as returned by
            ``Popen.communicate()``.

        Raises:
            Exception: if the client wrote anything to stderr, if it could
                not be started, or if ``timeout`` was exceeded.
        """
        result = []
        process = []
        errors = []

        def run(path, port, text, result, in_pipe, process):
            # An exception raised in a worker thread never reaches the
            # caller, so it is collected here and re-raised by query().
            try:
                if in_pipe is None:
                    in_pipe = subprocess.PIPE

                proc = subprocess.Popen([path, 'client', '--port', str(port), '-q', text],
                                        stdin=in_pipe, stdout=subprocess.PIPE,
                                        stderr=subprocess.PIPE, close_fds=True)
                process.append(proc)
                stdout_data, stderr_data = proc.communicate()

                if stderr_data:
                    raise Exception('Error while executing query: {}\nstdout:\n{}\nstderr:\n{}'
                                    .format(text, stdout_data, stderr_data))

                result.append(stdout_data)
            except Exception as exc:
                errors.append(exc)

        thread = threading.Thread(target=run,
                                  args=(self.binary_path, self.port, query, result, pipe, process))
        thread.start()
        thread.join(timeout)
        # is_alive() exists on Python 2.6+ and 3.x; isAlive() was removed in 3.9.
        if thread.is_alive():
            if process:
                process[0].kill()
            thread.join()
            raise Exception('timeout exceed for query: ' + query)

        if errors:
            raise errors[0]

        return result[0] if result else None
|
|
@ -1,15 +0,0 @@
|
|||||||
import numpy as np
|
|
||||||
|
|
||||||
|
|
||||||
def generate_uniform_int_column(size, low, high, seed=0):
    """Return ``size`` integers drawn uniformly from ``[low, high)``.

    A private ``RandomState`` seeded with ``seed`` is used, so the result is
    reproducible and identical to seeding the global NumPy generator, but the
    global generator state of the caller is left untouched.
    """
    return np.random.RandomState(seed).randint(low, high, size)
|
|
||||||
|
|
||||||
|
|
||||||
def generate_uniform_float_column(size, low, high, seed=0):
    """Return ``size`` floats drawn uniformly from ``[low, high)``.

    A private ``RandomState`` seeded with ``seed`` is used, so the values
    match what seeding the global NumPy generator would produce while the
    global generator state of the caller is left untouched.
    """
    unit = np.random.RandomState(seed).random_sample(size)
    return unit * (high - low) + low
|
|
||||||
|
|
||||||
|
|
||||||
def generate_uniform_string_column(size, samples, seed=0):
    """Return ``size`` values picked uniformly (with repetition) from ``samples``.

    ``seed`` defaults to 0 like the other column generators in this module.
    Indices are drawn from a private ``RandomState`` so the global NumPy
    generator state of the caller is left untouched.
    """
    indices = np.random.RandomState(seed).randint(0, len(samples), size)
    return np.array(samples)[indices]
|
|
@ -1,67 +0,0 @@
|
|||||||
import subprocess
|
|
||||||
import threading
|
|
||||||
import socket
|
|
||||||
import time
|
|
||||||
|
|
||||||
|
|
||||||
class ClickHouseServer:
    """Start and stop a ``<binary_path> server`` process; usable as a context manager."""

    def __init__(self, binary_path, config_path, stdout_file=None, stderr_file=None, shutdown_timeout=10):
        """Store the launch parameters; nothing is started until ``start()``.

        Args:
            binary_path: path to the clickhouse binary.
            config_path: path passed to the server via ``--config``.
            stdout_file: optional path the server's stdout is written to.
            stderr_file: optional path the server's stderr is written to.
            shutdown_timeout: seconds ``__exit__`` waits before killing the server.
        """
        self.binary_path = binary_path
        self.config_path = config_path
        self.pipe = None
        self.stdout_file = stdout_file
        self.stderr_file = stderr_file
        self.shutdown_timeout = shutdown_timeout
        # Handles of the log files opened by start(); closed by shutdown().
        self._out_file = None
        self._err_file = None

    def _close_log_files(self):
        """Close the stdout/stderr log files opened by ``start()``, if any."""
        for attr in ('_out_file', '_err_file'):
            handle = getattr(self, attr)
            if handle is not None:
                handle.close()
                setattr(self, attr, None)

    def start(self):
        """Launch the server process, redirecting its output to the configured files."""
        cmd = [self.binary_path, 'server', '--config', self.config_path]
        try:
            if self.stdout_file is not None:
                self._out_file = open(self.stdout_file, 'w')
            if self.stderr_file is not None:
                self._err_file = open(self.stderr_file, 'w')
            self.pipe = subprocess.Popen(cmd, stdout=self._out_file, stderr=self._err_file)
        except Exception:
            # Do not leak the log files when the process cannot be started.
            self._close_log_files()
            raise

    def wait_for_request(self, port, timeout=1):
        """Poll ``localhost:port`` until a TCP connection succeeds.

        A connection is attempted every 0.01 seconds for roughly ``timeout``
        seconds. A fresh socket is used for every attempt and always closed,
        because a socket is not reliably reusable after a failed connect.

        Raises:
            socket.error: the error of the last attempt if none succeeded.
        """
        step = 0.01
        # The polling attempts plus one final attempt whose error is reported.
        attempts = int(timeout / step) + 1
        last_error = None
        for attempt in range(attempts):
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                s.connect(('localhost', port))
                return
            except socket.error as socketerror:
                last_error = socketerror
            finally:
                s.close()
            if attempt + 1 < attempts:
                time.sleep(step)

        print("Error: ", last_error)
        raise last_error

    def shutdown(self, timeout=10):
        """Terminate the server, killing it if it is still running after ``timeout`` seconds.

        Does nothing if the server was never started.
        """
        if self.pipe is None:
            return

        try:
            self.pipe.terminate()
            # Wait in a helper thread so the wait can be bounded by `timeout`.
            thread = threading.Thread(target=self.pipe.wait)
            thread.start()
            thread.join(timeout)
            # is_alive() exists on Python 2.6+ and 3.x; isAlive() was removed in 3.9.
            if thread.is_alive():
                self.pipe.kill()
                thread.join()

            if self.pipe.stdout is not None:
                self.pipe.stdout.close()
            if self.pipe.stderr is not None:
                self.pipe.stderr.close()
        finally:
            # Popen.stdout/stderr are None when file objects were passed in,
            # so the log files have to be closed explicitly.
            self._close_log_files()

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, type, value, traceback):
        self.shutdown(self.shutdown_timeout)
|
|
@ -1,168 +0,0 @@
|
|||||||
from .server import ClickHouseServer
|
|
||||||
from .client import ClickHouseClient
|
|
||||||
from .table import ClickHouseTable
|
|
||||||
import os
|
|
||||||
import errno
|
|
||||||
from shutil import rmtree
|
|
||||||
|
|
||||||
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
|
|
||||||
CATBOOST_ROOT = os.path.dirname(SCRIPT_DIR)
|
|
||||||
|
|
||||||
CLICKHOUSE_CONFIG = \
|
|
||||||
'''
|
|
||||||
<yandex>
|
|
||||||
<timezone>Europe/Moscow</timezone>
|
|
||||||
<listen_host>::</listen_host>
|
|
||||||
<path>{path}</path>
|
|
||||||
<tmp_path>{tmp_path}</tmp_path>
|
|
||||||
<models_config>{models_config}</models_config>
|
|
||||||
<mark_cache_size>5368709120</mark_cache_size>
|
|
||||||
<users_config>users.xml</users_config>
|
|
||||||
<tcp_port>{tcp_port}</tcp_port>
|
|
||||||
<catboost_dynamic_library_path>{catboost_dynamic_library_path}</catboost_dynamic_library_path>
|
|
||||||
<logger>
|
|
||||||
<level>trace</level>
|
|
||||||
<log>{path}/clickhouse-server.log</log>
|
|
||||||
<errorlog>{path}/clickhouse-server.err.log</errorlog>
|
|
||||||
<size>never</size>
|
|
||||||
<count>50</count>
|
|
||||||
</logger>
|
|
||||||
</yandex>
|
|
||||||
'''
|
|
||||||
|
|
||||||
CLICKHOUSE_USERS = \
|
|
||||||
'''
|
|
||||||
<yandex>
|
|
||||||
<profiles>
|
|
||||||
<default>
|
|
||||||
</default>
|
|
||||||
<readonly>
|
|
||||||
<readonly>1</readonly>
|
|
||||||
</readonly>
|
|
||||||
</profiles>
|
|
||||||
|
|
||||||
<users>
|
|
||||||
<readonly>
|
|
||||||
<password></password>
|
|
||||||
<profile>readonly</profile>
|
|
||||||
<quota>default</quota>
|
|
||||||
</readonly>
|
|
||||||
|
|
||||||
<default>
|
|
||||||
<password></password>
|
|
||||||
<profile>default</profile>
|
|
||||||
<quota>default</quota>
|
|
||||||
<networks incl="networks" replace="replace">
|
|
||||||
<ip>::1</ip>
|
|
||||||
<ip>127.0.0.1</ip>
|
|
||||||
</networks>
|
|
||||||
|
|
||||||
</default>
|
|
||||||
</users>
|
|
||||||
|
|
||||||
<quotas>
|
|
||||||
<default>
|
|
||||||
</default>
|
|
||||||
</quotas>
|
|
||||||
</yandex>
|
|
||||||
'''
|
|
||||||
|
|
||||||
CATBOOST_MODEL_CONFIG = \
|
|
||||||
'''
|
|
||||||
<models>
|
|
||||||
<model>
|
|
||||||
<type>catboost</type>
|
|
||||||
<name>{name}</name>
|
|
||||||
<path>{path}</path>
|
|
||||||
<lifetime>0</lifetime>
|
|
||||||
</model>
|
|
||||||
</models>
|
|
||||||
'''
|
|
||||||
|
|
||||||
|
|
||||||
class ClickHouseServerWithCatboostModels:
    """Context manager that writes server/model configs and runs a ClickHouse server.

    On ``__enter__`` the working folder, the server and users configs and
    every registered model are written under ``<CATBOOST_ROOT>/data/servers``
    and the server is started; on ``__exit__`` the server is shut down and the
    folder is optionally removed.
    """

    def __init__(self, name, binary_path, port, shutdown_timeout=10, clean_folder=False):
        """Store the settings and derive the config/model paths.

        Args:
            name: name of this server instance (only stored).
            binary_path: path to the clickhouse binary.
            port: TCP port written into the server config.
            shutdown_timeout: seconds to wait for the server to stop.
            clean_folder: remove the whole working folder on exit if True.
        """
        self.models = {}
        self.name = name
        self.binary_path = binary_path
        self.port = port
        self.shutdown_timeout = shutdown_timeout
        self.clean_folder = clean_folder
        self.root = os.path.join(CATBOOST_ROOT, 'data', 'servers')
        self.config_path = os.path.join(self.root, 'config.xml')
        self.users_path = os.path.join(self.root, 'users.xml')
        self.models_dir = os.path.join(self.root, 'models')
        self.server = None

    def _get_server(self):
        """Build a (not yet started) ClickHouseServer logging into the working folder."""
        stdout_file = os.path.join(self.root, 'server_stdout.txt')
        stderr_file = os.path.join(self.root, 'server_stderr.txt')
        return ClickHouseServer(self.binary_path, self.config_path, stdout_file, stderr_file,
                                self.shutdown_timeout)

    def add_model(self, model_name, model):
        """Register ``model`` under ``model_name``; it is saved to disk on ``__enter__``."""
        self.models[model_name] = model

    def apply_model(self, name, df, cat_feature_names):
        """Load ``df`` into a temporary table and evaluate model ``name`` on it.

        Every column of ``df`` that is not listed in ``cat_feature_names`` is
        treated as a float feature. Returns whatever
        ``ClickHouseTable.apply_model`` returns.
        """
        float_feature_names = tuple(column for column in list(df)
                                    if column not in cat_feature_names)
        with ClickHouseTable(self.server, self.port, name, df) as table:
            # ClickHouseTable.apply_model takes (model_name, float_columns,
            # cat_columns); keywords keep the two groups from being swapped.
            return table.apply_model(name,
                                     float_columns=float_feature_names,
                                     cat_columns=cat_feature_names)

    def _create_root(self):
        """Create the working folder; an already existing folder is fine."""
        try:
            os.makedirs(self.root)
        except OSError as exc:  # Python >2.5
            if exc.errno == errno.EEXIST and os.path.isdir(self.root):
                pass
            else:
                raise

    def _clean_root(self):
        """Remove the working folder with everything in it."""
        rmtree(self.root)

    def _save_config(self):
        """Render and write the server config and the users config."""
        params = {
            'tcp_port': self.port,
            'path': os.path.join(self.root, 'clickhouse'),
            'tmp_path': os.path.join(self.root, 'clickhouse', 'tmp'),
            'models_config': os.path.join(self.models_dir, '*_model.xml'),
            'catboost_dynamic_library_path': os.path.join(CATBOOST_ROOT, 'data', 'libcatboostmodel.so')
        }
        config = CLICKHOUSE_CONFIG.format(**params)

        with open(self.config_path, 'w') as f:
            f.write(config)

        with open(self.users_path, 'w') as f:
            f.write(CLICKHOUSE_USERS)

    def _save_models(self):
        """Write every registered model and its ``<name>_model.xml`` config."""
        if not os.path.exists(self.models_dir):
            os.makedirs(self.models_dir)

        for name, model in self.models.items():
            model_path = os.path.join(self.models_dir, name + '.cbm')
            config_path = os.path.join(self.models_dir, name + '_model.xml')
            params = {
                'name': name,
                'path': model_path
            }
            config = CATBOOST_MODEL_CONFIG.format(**params)
            with open(config_path, 'w') as f:
                f.write(config)

            model.save_model(model_path)

    def __enter__(self):
        self._create_root()
        self._save_config()
        self._save_models()
        self.server = self._get_server().__enter__()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        try:
            res = self.server.__exit__(exc_type, exc_val, exc_tb)
        finally:
            # Clean up even when stopping the server failed.
            if self.clean_folder:
                self._clean_root()
        return res
|
|
||||||
|
|
@ -1,74 +0,0 @@
|
|||||||
from .server import ClickHouseServer
|
|
||||||
from .client import ClickHouseClient
|
|
||||||
from pandas import DataFrame
|
|
||||||
import os
|
|
||||||
import threading
|
|
||||||
import tempfile
|
|
||||||
|
|
||||||
|
|
||||||
class ClickHouseTable:
    """Context manager that loads a DataFrame into ``test.<table_name>`` and drops it on exit."""

    def __init__(self, server, port, table_name, df):
        """Validate the arguments, wait for the server port and create a client.

        Raises:
            Exception: if ``server`` is not a ClickHouseServer or ``df`` is
                not a pandas DataFrame.
        """
        self.server = server
        self.port = port
        self.table_name = table_name
        self.df = df

        if not isinstance(self.server, ClickHouseServer):
            raise Exception('Expected ClickHouseServer, got ' + repr(self.server))
        if not isinstance(self.df, DataFrame):
            raise Exception('Expected DataFrame, got ' + repr(self.df))

        self.server.wait_for_request(port)
        self.client = ClickHouseClient(server.binary_path, port)

    def _convert(self, name):
        """Map a pandas dtype name to a ClickHouse column type (``String`` by default)."""
        types_map = {
            'float64': 'Float64',
            'int64': 'Int64',
            'float32': 'Float32',
            'int32': 'Int32'
        }
        return types_map.get(name, 'String')

    def _create_table_from_df(self):
        """(Re)create ``test.<table_name>`` and insert the DataFrame rows as CSV."""
        self.client.query('create database if not exists test')
        self.client.query('drop table if exists test.{}'.format(self.table_name))

        column_types = list(self.df.dtypes)
        column_names = list(self.df)
        schema = ', '.join(name + ' ' + self._convert(str(t))
                           for name, t in zip(column_names, column_types))
        print('schema:', schema)

        create_query = 'create table test.{} (date Date DEFAULT today(), {}) engine = MergeTree(date, (date), 8192)'
        self.client.query(create_query.format(self.table_name, schema))

        insert_query = 'insert into test.{} ({}) format CSV'

        # Text mode: DataFrame.to_csv writes str, which a binary handle
        # rejects on Python 3 with pandas older than 1.2.
        with tempfile.TemporaryFile(mode='w+') as tmp_file:
            self.df.to_csv(tmp_file, header=False, index=False)
            # The client reads through the file descriptor, so buffered
            # data must reach the OS before rewinding.
            tmp_file.flush()
            tmp_file.seek(0)
            self.client.query(insert_query.format(self.table_name, ', '.join(column_names)), pipe=tmp_file)

    @staticmethod
    def _parse_predictions(output):
        """Parse TSV client output into a tuple with one entry per non-empty line.

        A line holding a single number becomes a float; a line holding several
        comma-separated numbers (optionally wrapped in parentheses) becomes a
        tuple of floats.
        """
        # Popen.communicate() returns bytes on Python 3.
        if isinstance(output, bytes):
            output = output.decode('utf-8')

        def parse_row(row):
            cells = (cell.strip() for cell in row.replace('(', '').replace(')', '').split(','))
            values = tuple(float(cell) for cell in cells if cell)
            return values if len(values) != 1 else values[0]

        rows = (line.strip() for line in output.split('\n'))
        return tuple(parse_row(row) for row in rows if row)

    def apply_model(self, model_name, float_columns, cat_columns):
        """Evaluate ``model_name`` over the table and return the parsed predictions.

        The float columns are passed to ``modelEvaluate`` first, followed by
        the categorical columns.

        Raises:
            Exception: if the client returned no output at all.
        """
        columns = ', '.join(list(float_columns) + list(cat_columns))
        query = "select modelEvaluate('{}', {}) from test.{} format TSV"
        result = self.client.query(query.format(model_name, columns, self.table_name))
        if result is None:
            raise Exception('No output received for model ' + repr(model_name))
        return self._parse_predictions(result)

    def _drop_table(self):
        """Drop ``test.<table_name>``."""
        self.client.query('drop table test.{}'.format(self.table_name))

    def __enter__(self):
        self._create_table_from_df()
        return self

    def __exit__(self, type, value, traceback):
        self._drop_table()
|
|
@ -1,28 +0,0 @@
|
|||||||
import os
|
|
||||||
import sys
|
|
||||||
from pandas import DataFrame
|
|
||||||
|
|
||||||
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
|
|
||||||
CATBOOST_ROOT = os.path.dirname(SCRIPT_DIR)
|
|
||||||
CATBOOST_PYTHON_DIR = os.path.join(CATBOOST_ROOT, 'data', 'python-package')
|
|
||||||
|
|
||||||
if CATBOOST_PYTHON_DIR not in sys.path:
|
|
||||||
sys.path.append(CATBOOST_PYTHON_DIR)
|
|
||||||
|
|
||||||
|
|
||||||
import catboost
|
|
||||||
from catboost import CatBoostClassifier
|
|
||||||
|
|
||||||
|
|
||||||
def train_catboost_model(df, target, cat_features, params, verbose=True):
    """Fit a ``CatBoostClassifier`` on ``df`` and return the fitted model.

    Args:
        df: pandas DataFrame holding the features.
        target: labels passed to ``fit`` together with ``df``.
        cat_features: names of the columns of ``df`` to treat as categorical.
        params: keyword arguments for the ``CatBoostClassifier`` constructor.
        verbose: forwarded to ``fit``.

    Raises:
        Exception: if ``df`` is not a pandas DataFrame.
    """
    if not isinstance(df, DataFrame):
        raise Exception('DataFrame object expected, but got ' + repr(df))

    feature_names = df.columns.tolist()
    print('features:', feature_names)

    # fit() is given positional column indices rather than column names.
    cat_features_index = [df.columns.get_loc(column) for column in cat_features]
    print('cat features:', cat_features_index)

    classifier = CatBoostClassifier(**params)
    classifier.fit(df, target, cat_features=cat_features_index, verbose=verbose)
    return classifier
|
|
@ -1,3 +0,0 @@
|
|||||||
[pytest]
|
|
||||||
python_files = test.py
|
|
||||||
norecursedirs=data
|
|
@ -1,294 +0,0 @@
|
|||||||
from helpers.server_with_models import ClickHouseServerWithCatboostModels
|
|
||||||
from helpers.generate import generate_uniform_string_column, generate_uniform_float_column, generate_uniform_int_column
|
|
||||||
from helpers.train import train_catboost_model
|
|
||||||
import os
|
|
||||||
import numpy as np
|
|
||||||
from pandas import DataFrame
|
|
||||||
|
|
||||||
|
|
||||||
PORT = int(os.environ.get('CLICKHOUSE_TESTS_PORT', '9000'))
|
|
||||||
CLICKHOUSE_TESTS_SERVER_BIN_PATH = os.environ.get('CLICKHOUSE_TESTS_SERVER_BIN_PATH', '/usr/bin/clickhouse')
|
|
||||||
|
|
||||||
|
|
||||||
def add_noise_to_target(target, seed, threshold=0.05):
    """Replace ``target`` by ``1 - target`` wherever a uniform draw falls below ``threshold``.

    One draw from ``[0, 1)`` is made per element using ``seed + 1``;
    the remaining elements are returned unchanged.
    """
    flip_mask = generate_uniform_float_column(len(target), 0., 1., seed + 1) < threshold
    kept = target * (1 - flip_mask)
    flipped = (1 - target) * flip_mask
    return kept + flipped
|
|
||||||
|
|
||||||
|
|
||||||
def check_predictions(test_name, target, pred_python, pred_ch, acc_threshold):
    """Check that ClickHouse and Python predictions agree and are accurate enough.

    Args:
        test_name: label printed together with the accuracy.
        target: true class labels.
        pred_python: class predictions of the Python model (NumPy array).
        pred_ch: class predictions obtained through ClickHouse (NumPy array).
        acc_threshold: minimal accepted share of correctly predicted labels.

    Raises:
        Exception: if the two prediction arrays differ.
        AssertionError: if the accuracy is below ``acc_threshold``.
    """
    ch_class = pred_ch.astype(int)
    python_class = pred_python.astype(int)
    if not np.array_equal(ch_class, python_class):
        raise Exception('Got different results:\npython:\n' + str(python_class) + '\nClickHouse:\n' + str(ch_class))

    # Share of exact matches. The absolute label difference only equals the
    # error count for 0/1 labels; with more classes a 0-vs-2 miss would be
    # counted twice.
    acc = float(np.mean(ch_class == np.array(target)))
    assert acc >= acc_threshold, \
        '{}: accuracy {:.10f} is below the threshold {}'.format(test_name, acc, acc_threshold)
    print(test_name, 'accuracy: {:.10f}'.format(acc))
|
|
||||||
|
|
||||||
|
|
||||||
def _unit_floats(size, seed):
    """Column of uniform floats in [0, 1)."""
    return generate_uniform_float_column(size, 0., 1., seed)


def _small_ints(size, seed):
    """Column of uniform integers in [1, 4)."""
    return generate_uniform_int_column(size, 1, 4, seed)


def _strings_from(samples):
    """Return a column generator picking uniformly from ``samples``."""
    def make(size, seed):
        return generate_uniform_string_column(size, samples, seed)
    return make


def _build_frame(size, seed, columns):
    """Build a DataFrame; the i-th column (1-based) is generated with ``seed + i``."""
    data = {}
    for offset, (column, make) in enumerate(columns, start=1):
        data[column] = make(size, seed + offset)
    return DataFrame.from_dict(data)


def _run_apply_case(name, columns, target_filter, cat_features, params, multiclass=False):
    """Train a model, evaluate it in Python and in ClickHouse, and compare.

    Args:
        name: used as model name, table name and label in the output.
        columns: list of ``(column_name, generator)`` pairs for the data.
        target_filter: function mapping a DataFrame row to its class label.
        cat_features: names of the categorical columns used for training.
        params: CatBoost parameters.
        multiclass: if True the ClickHouse output is reduced with ``argmax``
            over the per-class values instead of thresholding at 0.
    """
    train_size = 10000
    test_size = 10000

    train_df = _build_frame(train_size, 42, columns)
    test_df = _build_frame(test_size, 43, columns)

    # Series.as_matrix() was removed in pandas 1.0; .values works everywhere.
    train_target = train_df.apply(target_filter, axis=1).values
    test_target = test_df.apply(target_filter, axis=1).values

    print()
    print('train target', train_target)
    print('test target', test_target)

    model = train_catboost_model(train_df, train_target, cat_features, params)
    pred_python = model.predict(test_df)
    if multiclass:
        pred_python = pred_python[:, 0].astype(int)

    server = ClickHouseServerWithCatboostModels(name, CLICKHOUSE_TESTS_SERVER_BIN_PATH, PORT)
    server.add_model(name, model)
    with server:
        # All columns are passed in DataFrame order, with no categorical list.
        raw = np.array(server.apply_model(name, test_df, []))
        if multiclass:
            pred_ch = np.argmax(raw, axis=1)
        else:
            pred_ch = (raw > 0).astype(int)

    print('python predictions', pred_python)
    print('clickhouse predictions', pred_ch)

    check_predictions(name, test_target, pred_python, pred_ch, 0.9)


def test_apply_float_features_only():
    """Binary model over three float features."""
    def target_filter(row):
        return 1 if (row['a'] > .3 and row['b'] > .3) or (row['c'] < .4 and row['a'] * row['b'] > 0.1) else 0

    _run_apply_case(
        'test_apply_float_features_only',
        [('a', _unit_floats), ('b', _unit_floats), ('c', _unit_floats)],
        target_filter,
        [],
        {'iterations': 4, 'depth': 2, 'learning_rate': 1, 'loss_function': 'Logloss'},
    )


def test_apply_float_features_with_string_cat_features():
    """Binary model over two float and two string categorical features."""
    def target_filter(row):
        return 1 if (row['a'] > .3 and row['b'] > .3 and row['c'] != 'a') \
            or (row['a'] * row['b'] > 0.1 and row['c'] != 'b' and row['d'] != 'e') else 0

    _run_apply_case(
        'test_apply_float_features_with_string_cat_features',
        [('a', _unit_floats), ('b', _unit_floats),
         ('c', _strings_from(['a', 'b', 'c'])), ('d', _strings_from(['e', 'f', 'g']))],
        target_filter,
        ['c', 'd'],
        {'iterations': 6, 'depth': 2, 'learning_rate': 1, 'loss_function': 'Logloss'},
    )


def test_apply_float_features_with_int_cat_features():
    """Binary model over two float and two integer categorical features."""
    def target_filter(row):
        return 1 if (row['a'] > .3 and row['b'] > .3 and row['c'] != 1) \
            or (row['a'] * row['b'] > 0.1 and row['c'] != 2 and row['d'] != 3) else 0

    _run_apply_case(
        'test_apply_float_features_with_int_cat_features',
        [('a', _unit_floats), ('b', _unit_floats), ('c', _small_ints), ('d', _small_ints)],
        target_filter,
        ['c', 'd'],
        {'iterations': 6, 'depth': 4, 'learning_rate': 1, 'loss_function': 'Logloss'},
    )


def test_apply_float_features_with_mixed_cat_features():
    """Binary model over two float, one string and one integer categorical feature."""
    def target_filter(row):
        return 1 if (row['a'] > .3 and row['b'] > .3 and row['c'] != 'a') \
            or (row['a'] * row['b'] > 0.1 and row['c'] != 'b' and row['d'] != 2) else 0

    _run_apply_case(
        'test_apply_float_features_with_mixed_cat_features',
        [('a', _unit_floats), ('b', _unit_floats),
         ('c', _strings_from(['a', 'b', 'c'])), ('d', _small_ints)],
        target_filter,
        ['c', 'd'],
        {'iterations': 6, 'depth': 4, 'learning_rate': 1, 'loss_function': 'Logloss'},
    )


def test_apply_multiclass():
    """Three-class model over two float, one string and one integer categorical feature."""
    def target_filter(row):
        if row['a'] > .3 and row['b'] > .3 and row['c'] != 'a':
            return 0
        elif row['a'] * row['b'] > 0.1 and row['c'] != 'b' and row['d'] != 2:
            return 1
        else:
            return 2

    _run_apply_case(
        # Own name: the original reused the mixed-features test's name.
        'test_apply_multiclass',
        [('a', _unit_floats), ('b', _unit_floats),
         ('c', _strings_from(['a', 'b', 'c'])), ('d', _small_ints)],
        target_filter,
        ['c', 'd'],
        {'iterations': 10, 'depth': 4, 'learning_rate': 1, 'loss_function': 'MultiClass'},
        multiclass=True,
    )
|
|
Loading…
Reference in New Issue
Block a user