Merge pull request #29362 from ClickHouse/remove-obsolete-non-automated-tests

Remove obsolete non-automated tests
This commit is contained in:
alexey-milovidov 2021-09-25 19:40:03 +03:00 committed by GitHub
commit 7eddf2664e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 0 additions and 708 deletions

View File

@ -1,17 +0,0 @@
#!/usr/bin/env bash
# Clone CatBoost next to this script, build its model-evaluation library and
# its Python package, and symlink both build results into the script directory.

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

# Every `cd` is checked: if it fails, the following build/link step would
# otherwise run in whatever directory the script happened to be in.
cd "$DIR" || exit 1
git clone https://github.com/catboost/catboost.git

# Build the model-evaluation shared library and link it into $DIR.
cd "${DIR}/catboost/catboost/libs/model_interface" || exit 1
../../../ya make -r -o "${DIR}/build/lib" -j4
cd "$DIR" || exit 1
ln -sf "${DIR}/build/lib/catboost/libs/model_interface/libcatboostmodel.so" libcatboostmodel.so

# Build the Python package (configured through python2-config) and link it into $DIR.
cd "${DIR}/catboost/catboost/python-package/catboost" || exit 1
../../../ya make -r -DUSE_ARCADIA_PYTHON=no -DOS_SDK=local -DPYTHON_CONFIG=python2-config -j4
cd "$DIR" || exit 1
ln -sf "${DIR}/catboost/catboost/python-package" python-package

View File

@ -1,42 +0,0 @@
import subprocess
import threading
import os
class ClickHouseClient:
    """Runs queries by launching the ClickHouse binary in ``client`` mode."""

    def __init__(self, binary_path, port):
        """Store the executable path and the port passed to the client.

        Args:
            binary_path: Path to the ClickHouse executable.
            port: Value handed to the client through ``--port``.
        """
        self.binary_path = binary_path
        self.port = port

    def query(self, query, timeout=10, pipe=None):
        """Execute ``query`` in a child client process and return its stdout.

        The client runs in a worker thread so that the caller can enforce
        ``timeout`` and kill a client that hangs.

        Args:
            query: Query text passed to the client with ``-q``.
            timeout: Seconds to wait for the client before killing it.
            pipe: Optional file object used as the client's stdin; a new
                pipe is created when it is None.

        Returns:
            The data the client wrote to stdout, as returned by
            ``Popen.communicate``.

        Raises:
            Exception: If the client wrote anything to stderr, could not be
                started, or did not finish within ``timeout`` seconds.
        """
        result = []
        process = []
        errors = []

        def run(path, port, text, result, in_pipe, process, errors):
            # An exception escaping a thread target never reaches the caller,
            # so failures are collected here and re-raised by query().
            try:
                if in_pipe is None:
                    in_pipe = subprocess.PIPE
                child = subprocess.Popen([path, 'client', '--port', str(port), '-q', text],
                                         stdin=in_pipe, stdout=subprocess.PIPE,
                                         stderr=subprocess.PIPE, close_fds=True)
                process.append(child)
                stdout_data, stderr_data = child.communicate()
                if stderr_data:
                    raise Exception('Error while executing query: {}\nstdout:\n{}\nstderr:\n{}'
                                    .format(text, stdout_data, stderr_data))
                result.append(stdout_data)
            except Exception as exc:
                errors.append(exc)

        thread = threading.Thread(
            target=run,
            args=(self.binary_path, self.port, query, result, pipe, process, errors))
        thread.start()
        thread.join(timeout)

        # Thread.isAlive() was removed in Python 3.9; is_alive() works everywhere.
        if thread.is_alive():
            if process:
                process[0].kill()
            thread.join()
            raise Exception('timeout exceed for query: ' + query)

        if errors:
            raise errors[0]
        if result:
            return result[0]
        return None

View File

@ -1,15 +0,0 @@
import numpy as np
def generate_uniform_int_column(size, low, high, seed=0):
np.random.seed(seed)
return np.random.randint(low, high, size)
def generate_uniform_float_column(size, low, high, seed=0):
np.random.seed(seed)
return np.random.random(size) * (high - low) + low
def generate_uniform_string_column(size, samples, seed):
    """Return ``size`` entries picked at random from ``samples``.

    The indices come from ``generate_uniform_int_column`` seeded with
    ``seed``, so the result is reproducible.
    """
    choices = np.array(samples)
    picked = generate_uniform_int_column(size, 0, len(samples), seed)
    return choices[picked]

View File

@ -1,67 +0,0 @@
import subprocess
import threading
import socket
import time
class ClickHouseServer:
    """Context manager that starts and stops a ClickHouse server process."""

    def __init__(self, binary_path, config_path, stdout_file=None, stderr_file=None, shutdown_timeout=10):
        """Remember how to launch the server; nothing is started yet.

        Args:
            binary_path: Path to the ClickHouse executable.
            config_path: Config file passed to the server via ``--config``.
            stdout_file: Optional path that receives the server's stdout.
            stderr_file: Optional path that receives the server's stderr.
            shutdown_timeout: Seconds ``__exit__`` waits before killing the server.
        """
        self.binary_path = binary_path
        self.config_path = config_path
        self.pipe = None
        self.stdout_file = stdout_file
        self.stderr_file = stderr_file
        self.shutdown_timeout = shutdown_timeout

    def start(self):
        """Launch the server process and store its handle in ``self.pipe``."""
        cmd = [self.binary_path, 'server', '--config', self.config_path]
        out_pipe = None
        err_pipe = None
        try:
            if self.stdout_file is not None:
                out_pipe = open(self.stdout_file, 'w')
            if self.stderr_file is not None:
                err_pipe = open(self.stderr_file, 'w')
            self.pipe = subprocess.Popen(cmd, stdout=out_pipe, stderr=err_pipe)
        finally:
            # The child holds its own copies of these descriptors, so the
            # parent's handles can be released right away instead of leaking.
            for handle in (out_pipe, err_pipe):
                if handle is not None:
                    handle.close()

    def wait_for_request(self, port, timeout=1):
        """Block until ``localhost:port`` accepts TCP connections.

        Connection attempts are repeated every 10 ms for about ``timeout``
        seconds; one final attempt is then made whose failure is raised.

        Args:
            port: TCP port to probe.
            timeout: Approximate number of seconds to keep retrying.

        Raises:
            socket.error: If the port is still not reachable after the retries.
        """
        address = ('localhost', port)
        step = 0.01
        try:
            for _ in range(int(timeout / step)):
                # A fresh socket per attempt: reconnecting a socket after a
                # failed connect is not portable, and each one gets closed.
                probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                try:
                    if probe.connect_ex(address) == 0:
                        return
                finally:
                    probe.close()
                time.sleep(step)
            probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                probe.connect(address)
            finally:
                probe.close()
        except socket.error as socketerror:
            print("Error: ", socketerror)
            raise

    def shutdown(self, timeout=10):
        """Terminate the server, killing it if it does not exit in ``timeout`` seconds.

        Does nothing when the server was never started.
        """
        def wait(pipe):
            pipe.wait()

        if self.pipe is None:
            return

        self.pipe.terminate()
        thread = threading.Thread(target=wait, args=(self.pipe,))
        thread.start()
        thread.join(timeout)
        # Thread.isAlive() was removed in Python 3.9; is_alive() works everywhere.
        if thread.is_alive():
            self.pipe.kill()
            thread.join()
        if self.pipe.stdout is not None:
            self.pipe.stdout.close()
        if self.pipe.stderr is not None:
            self.pipe.stderr.close()

    def __enter__(self):
        """Start the server and return this object."""
        self.start()
        return self

    def __exit__(self, type, value, traceback):
        """Stop the server, waiting at most ``self.shutdown_timeout`` seconds."""
        self.shutdown(self.shutdown_timeout)

View File

@ -1,168 +0,0 @@
from .server import ClickHouseServer
from .client import ClickHouseClient
from .table import ClickHouseTable
import os
import errno
from shutil import rmtree
# Directory holding this module, and its parent directory.
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
CATBOOST_ROOT = os.path.dirname(SCRIPT_DIR)

# Server config template. Placeholders: path, tmp_path, models_config,
# tcp_port, catboost_dynamic_library_path.
CLICKHOUSE_CONFIG = '''
<yandex>
<timezone>Europe/Moscow</timezone>
<listen_host>::</listen_host>
<path>{path}</path>
<tmp_path>{tmp_path}</tmp_path>
<models_config>{models_config}</models_config>
<mark_cache_size>5368709120</mark_cache_size>
<users_config>users.xml</users_config>
<tcp_port>{tcp_port}</tcp_port>
<catboost_dynamic_library_path>{catboost_dynamic_library_path}</catboost_dynamic_library_path>
<logger>
<level>trace</level>
<log>{path}/clickhouse-server.log</log>
<errorlog>{path}/clickhouse-server.err.log</errorlog>
<size>never</size>
<count>50</count>
</logger>
</yandex>
'''

# Users config; written out verbatim, it has no placeholders.
CLICKHOUSE_USERS = '''
<yandex>
<profiles>
<default>
</default>
<readonly>
<readonly>1</readonly>
</readonly>
</profiles>
<users>
<readonly>
<password></password>
<profile>readonly</profile>
<quota>default</quota>
</readonly>
<default>
<password></password>
<profile>default</profile>
<quota>default</quota>
<networks incl="networks" replace="replace">
<ip>::1</ip>
<ip>127.0.0.1</ip>
</networks>
</default>
</users>
<quotas>
<default>
</default>
</quotas>
</yandex>
'''

# Per-model config template. Placeholders: name, path.
CATBOOST_MODEL_CONFIG = '''
<models>
<model>
<type>catboost</type>
<name>{name}</name>
<path>{path}</path>
<lifetime>0</lifetime>
</model>
</models>
'''
class ClickHouseServerWithCatboostModels:
    """Context manager running a ClickHouse server configured with CatBoost models.

    On entry it creates ``<CATBOOST_ROOT>/data/servers``, writes the server and
    users configs plus every registered model there, and starts the server.
    On exit it stops the server and, if requested, removes that folder.
    """

    def __init__(self, name, binary_path, port, shutdown_timeout=10, clean_folder=False):
        """Record the settings and derive all file locations; nothing is written yet.

        Args:
            name: Label stored on the instance.
            binary_path: Path to the ClickHouse executable.
            port: TCP port written into the server config.
            shutdown_timeout: Seconds to wait for the server when stopping it.
            clean_folder: Whether to delete the server folder on exit.
        """
        self.models = {}
        self.name = name
        self.binary_path = binary_path
        self.port = port
        self.shutdown_timeout = shutdown_timeout
        self.clean_folder = clean_folder
        self.root = os.path.join(CATBOOST_ROOT, 'data', 'servers')
        self.config_path = os.path.join(self.root, 'config.xml')
        self.users_path = os.path.join(self.root, 'users.xml')
        self.models_dir = os.path.join(self.root, 'models')
        self.server = None

    def _get_server(self):
        """Build the ClickHouseServer whose output goes to files under the root folder."""
        stdout_file = os.path.join(self.root, 'server_stdout.txt')
        stderr_file = os.path.join(self.root, 'server_stderr.txt')
        return ClickHouseServer(self.binary_path, self.config_path, stdout_file, stderr_file, self.shutdown_timeout)

    def add_model(self, model_name, model):
        """Register ``model`` under ``model_name``; it is saved when the context is entered."""
        self.models[model_name] = model

    def apply_model(self, name, df, cat_feature_names):
        """Load ``df`` into a temporary table and evaluate model ``name`` on it.

        Args:
            name: Model name; also used as the temporary table name.
            df: DataFrame with the feature columns.
            cat_feature_names: Names of the columns to pass as categorical
                features; every other column is passed as a float feature.

        Returns:
            Whatever ``ClickHouseTable.apply_model`` returns for the table.
        """
        names = list(df)
        float_feature_names = tuple(column for column in names if column not in cat_feature_names)
        with ClickHouseTable(self.server, self.port, name, df) as table:
            # ClickHouseTable.apply_model takes (model_name, float_columns,
            # cat_columns); the two feature lists used to be passed swapped.
            return table.apply_model(name, float_feature_names, cat_feature_names)

    def _create_root(self):
        """Create the root folder, tolerating one that already exists."""
        try:
            os.makedirs(self.root)
        except OSError as exc:  # Python >2.5
            # Only "already exists as a directory" is acceptable.
            if exc.errno == errno.EEXIST and os.path.isdir(self.root):
                pass
            else:
                raise

    def _clean_root(self):
        """Delete the root folder and everything in it."""
        rmtree(self.root)

    def _save_config(self):
        """Write the server config and the users config into the root folder."""
        params = {
            'tcp_port': self.port,
            'path': os.path.join(self.root, 'clickhouse'),
            'tmp_path': os.path.join(self.root, 'clickhouse', 'tmp'),
            'models_config': os.path.join(self.models_dir, '*_model.xml'),
            'catboost_dynamic_library_path': os.path.join(CATBOOST_ROOT, 'data', 'libcatboostmodel.so')
        }
        config = CLICKHOUSE_CONFIG.format(**params)
        with open(self.config_path, 'w') as f:
            f.write(config)
        with open(self.users_path, 'w') as f:
            f.write(CLICKHOUSE_USERS)

    def _save_models(self):
        """Write each registered model and its XML config into the models folder."""
        if not os.path.exists(self.models_dir):
            os.makedirs(self.models_dir)
        for name, model in self.models.items():
            model_path = os.path.join(self.models_dir, name + '.cbm')
            config_path = os.path.join(self.models_dir, name + '_model.xml')
            params = {
                'name': name,
                'path': model_path
            }
            config = CATBOOST_MODEL_CONFIG.format(**params)
            with open(config_path, 'w') as f:
                f.write(config)
            model.save_model(model_path)

    def __enter__(self):
        """Write configs and models, start the server, and return this object."""
        self._create_root()
        self._save_config()
        self._save_models()
        self.server = self._get_server().__enter__()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop the server and remove the root folder when ``clean_folder`` is set."""
        res = self.server.__exit__(exc_type, exc_val, exc_tb)
        if self.clean_folder:
            self._clean_root()
        return res

View File

@ -1,74 +0,0 @@
from .server import ClickHouseServer
from .client import ClickHouseClient
from pandas import DataFrame
import os
import threading
import tempfile
class ClickHouseTable:
    """Context manager exposing a DataFrame as a temporary table in database ``test``."""

    def __init__(self, server, port, table_name, df):
        """Validate the arguments, wait for the server port, and create a client.

        Args:
            server: ClickHouseServer instance; its binary is reused for the client.
            port: TCP port the server listens on.
            table_name: Name of the table created inside database ``test``.
            df: pandas DataFrame holding the table contents.

        Raises:
            Exception: If ``server`` is not a ClickHouseServer or ``df`` is
                not a DataFrame.
        """
        self.server = server
        self.port = port
        self.table_name = table_name
        self.df = df
        if not isinstance(self.server, ClickHouseServer):
            raise Exception('Expected ClickHouseServer, got ' + repr(self.server))
        if not isinstance(self.df, DataFrame):
            raise Exception('Expected DataFrame, got ' + repr(self.df))
        self.server.wait_for_request(port)
        self.client = ClickHouseClient(server.binary_path, port)

    def _convert(self, name):
        """Map a dtype name to a ClickHouse column type; unknown names become ``String``."""
        types_map = {
            'float64': 'Float64',
            'int64': 'Int64',
            'float32': 'Float32',
            'int32': 'Int32'
        }
        return types_map.get(name, 'String')

    def _create_table_from_df(self):
        """(Re)create ``test.<table_name>`` and insert the DataFrame rows as CSV."""
        self.client.query('create database if not exists test')
        self.client.query('drop table if exists test.{}'.format(self.table_name))
        column_types = list(self.df.dtypes)
        column_names = list(self.df)
        schema = ', '.join(
            name + ' ' + self._convert(str(t)) for name, t in zip(column_names, column_types))
        print('schema:', schema)
        create_query = 'create table test.{} (date Date DEFAULT today(), {}) engine = MergeTree(date, (date), 8192)'
        self.client.query(create_query.format(self.table_name, schema))
        insert_query = 'insert into test.{} ({}) format CSV'
        with tempfile.TemporaryFile() as tmp_file:
            self.df.to_csv(tmp_file, header=False, index=False)
            # Rewind so the client reads the CSV from the start on its stdin.
            tmp_file.seek(0)
            self.client.query(insert_query.format(self.table_name, ', '.join(column_names)), pipe=tmp_file)

    def apply_model(self, model_name, float_columns, cat_columns):
        """Run ``modelEvaluate`` over the table and parse the TSV answer.

        Args:
            model_name: Name of the model to evaluate.
            float_columns: Column names passed first to ``modelEvaluate``.
            cat_columns: Column names passed after the float columns.

        Returns:
            A tuple with one entry per non-empty output row: a float when
            the row holds a single value, otherwise a tuple of floats.

        Raises:
            Exception: If the client returned no output at all.
        """
        columns = ', '.join(list(float_columns) + list(cat_columns))
        query = "select modelEvaluate('{}', {}) from test.{} format TSV"
        result = self.client.query(query.format(model_name, columns, self.table_name))
        if result is None:
            raise Exception('No result returned for model: ' + model_name)
        # The client hands back raw process output, which is bytes on
        # Python 3; splitting bytes with a str separator raises TypeError.
        if isinstance(result, bytes):
            result = result.decode('utf-8')

        def parse_row(row):
            # Rows with several values are printed as "(a,b,c)".
            cells = (cell.strip() for cell in row.replace('(', '').replace(')', '').split(','))
            values = tuple(float(cell) for cell in cells if cell)
            return values if len(values) != 1 else values[0]

        rows = (line.strip() for line in result.split('\n'))
        return tuple(parse_row(row) for row in rows if row)

    def _drop_table(self):
        """Drop ``test.<table_name>``."""
        self.client.query('drop table test.{}'.format(self.table_name))

    def __enter__(self):
        """Create and fill the table, then return this object."""
        self._create_table_from_df()
        return self

    def __exit__(self, type, value, traceback):
        """Drop the table when the context ends."""
        self._drop_table()

View File

@ -1,28 +0,0 @@
import os
import sys
from pandas import DataFrame
# Directory holding this module, and its parent directory.
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
CATBOOST_ROOT = os.path.dirname(SCRIPT_DIR)

# Make <CATBOOST_ROOT>/data/python-package importable, adding it only once.
CATBOOST_PYTHON_DIR = os.path.join(CATBOOST_ROOT, 'data', 'python-package')
if sys.path.count(CATBOOST_PYTHON_DIR) == 0:
    sys.path.append(CATBOOST_PYTHON_DIR)
import catboost
from catboost import CatBoostClassifier
def train_catboost_model(df, target, cat_features, params, verbose=True):
    """Fit a ``CatBoostClassifier`` on ``df`` and return the fitted model.

    Args:
        df: pandas DataFrame with the training features.
        target: Training labels passed to ``fit``.
        cat_features: Names of the columns to treat as categorical; they
            are converted to positional indices before fitting.
        params: Keyword arguments for the ``CatBoostClassifier`` constructor.
        verbose: Forwarded to ``fit``.

    Raises:
        Exception: If ``df`` is not a DataFrame.
    """
    if not isinstance(df, DataFrame):
        raise Exception('DataFrame object expected, but got ' + repr(df))

    print('features:', df.columns.tolist())
    cat_features_index = [df.columns.get_loc(feature) for feature in cat_features]
    print('cat features:', cat_features_index)

    classifier = CatBoostClassifier(**params)
    classifier.fit(df, target, cat_features=cat_features_index, verbose=verbose)
    return classifier

View File

@ -1,3 +0,0 @@
[pytest]
python_files = test.py
norecursedirs=data

View File

@ -1,294 +0,0 @@
from helpers.server_with_models import ClickHouseServerWithCatboostModels
from helpers.generate import generate_uniform_string_column, generate_uniform_float_column, generate_uniform_int_column
from helpers.train import train_catboost_model
import os
import numpy as np
from pandas import DataFrame
# Server port used by the tests; taken from CLICKHOUSE_TESTS_PORT when set.
PORT = int(os.getenv('CLICKHOUSE_TESTS_PORT', '9000'))
# ClickHouse binary used by the tests; taken from CLICKHOUSE_TESTS_SERVER_BIN_PATH when set.
CLICKHOUSE_TESTS_SERVER_BIN_PATH = os.getenv('CLICKHOUSE_TESTS_SERVER_BIN_PATH', '/usr/bin/clickhouse')
def add_noise_to_target(target, seed, threshold=0.05):
    """Return ``target`` with the entries selected by a random mask replaced by ``1 - target``.

    The mask marks positions where a uniform column generated with seed
    ``seed + 1`` falls below ``threshold``.
    """
    flip_mask = generate_uniform_float_column(len(target), 0., 1., seed + 1) < threshold
    kept = target * (1 - flip_mask)
    flipped = (1 - target) * flip_mask
    return kept + flipped
def check_predictions(test_name, target, pred_python, pred_ch, acc_threshold):
    """Check that ClickHouse and Python predictions agree and are accurate enough.

    Args:
        test_name: Label printed next to the accuracy.
        target: Expected class labels.
        pred_python: Predictions from the Python model, as a NumPy array.
        pred_ch: Predictions obtained through ClickHouse, as a NumPy array.
        acc_threshold: Minimal accepted share of correctly predicted labels.

    Raises:
        Exception: If the two prediction arrays differ after casting to int.
        AssertionError: If the accuracy is below ``acc_threshold``.
    """
    ch_class = pred_ch.astype(int)
    python_class = pred_python.astype(int)
    if not np.array_equal(ch_class, python_class):
        raise Exception('Got different results:\npython:\n' + str(python_class) + '\nClickHouse:\n' + str(ch_class))

    # Count mismatching labels rather than summing |pred - target|: with more
    # than two classes a single wrong label can differ by 2 or more and would
    # be counted several times.
    mismatches = np.sum(ch_class != np.array(target))
    acc = 1 - mismatches / (len(target) + .0)
    assert acc >= acc_threshold
    print(test_name, 'accuracy: {:.10f}'.format(acc))
def test_apply_float_features_only():
    """Train on three float features and compare Python and ClickHouse predictions."""
    name = 'test_apply_float_features_only'
    train_size = 10000
    test_size = 10000

    def gen_data(size, seed):
        data = {
            'a': generate_uniform_float_column(size, 0., 1., seed + 1),
            'b': generate_uniform_float_column(size, 0., 1., seed + 2),
            'c': generate_uniform_float_column(size, 0., 1., seed + 3)
        }
        return DataFrame.from_dict(data)

    def get_target(df):
        def target_filter(row):
            return 1 if (row['a'] > .3 and row['b'] > .3) or (row['c'] < .4 and row['a'] * row['b'] > 0.1) else 0

        # Series.as_matrix() was removed in pandas 1.0; .values is equivalent.
        return df.apply(target_filter, axis=1).values

    train_df = gen_data(train_size, 42)
    test_df = gen_data(test_size, 43)
    train_target = get_target(train_df)
    test_target = get_target(test_df)
    print()
    print('train target', train_target)
    print('test target', test_target)

    params = {
        'iterations': 4,
        'depth': 2,
        'learning_rate': 1,
        'loss_function': 'Logloss'
    }
    model = train_catboost_model(train_df, train_target, [], params)
    pred_python = model.predict(test_df)

    server = ClickHouseServerWithCatboostModels(name, CLICKHOUSE_TESTS_SERVER_BIN_PATH, PORT)
    server.add_model(name, model)
    with server:
        # Positive ClickHouse output is mapped to class 1, everything else to 0.
        pred_ch = (np.array(server.apply_model(name, test_df, [])) > 0).astype(int)

    print('python predictions', pred_python)
    print('clickhouse predictions', pred_ch)
    check_predictions(name, test_target, pred_python, pred_ch, 0.9)
def test_apply_float_features_with_string_cat_features():
    """Train on two float and two string categorical features and compare predictions."""
    name = 'test_apply_float_features_with_string_cat_features'
    train_size = 10000
    test_size = 10000

    def gen_data(size, seed):
        data = {
            'a': generate_uniform_float_column(size, 0., 1., seed + 1),
            'b': generate_uniform_float_column(size, 0., 1., seed + 2),
            'c': generate_uniform_string_column(size, ['a', 'b', 'c'], seed + 3),
            'd': generate_uniform_string_column(size, ['e', 'f', 'g'], seed + 4)
        }
        return DataFrame.from_dict(data)

    def get_target(df):
        def target_filter(row):
            return 1 if (row['a'] > .3 and row['b'] > .3 and row['c'] != 'a') \
                or (row['a'] * row['b'] > 0.1 and row['c'] != 'b' and row['d'] != 'e') else 0

        # Series.as_matrix() was removed in pandas 1.0; .values is equivalent.
        return df.apply(target_filter, axis=1).values

    train_df = gen_data(train_size, 42)
    test_df = gen_data(test_size, 43)
    train_target = get_target(train_df)
    test_target = get_target(test_df)
    print()
    print('train target', train_target)
    print('test target', test_target)

    params = {
        'iterations': 6,
        'depth': 2,
        'learning_rate': 1,
        'loss_function': 'Logloss'
    }
    model = train_catboost_model(train_df, train_target, ['c', 'd'], params)
    pred_python = model.predict(test_df)

    server = ClickHouseServerWithCatboostModels(name, CLICKHOUSE_TESTS_SERVER_BIN_PATH, PORT)
    server.add_model(name, model)
    with server:
        # Positive ClickHouse output is mapped to class 1, everything else to 0.
        pred_ch = (np.array(server.apply_model(name, test_df, [])) > 0).astype(int)

    print('python predictions', pred_python)
    print('clickhouse predictions', pred_ch)
    check_predictions(name, test_target, pred_python, pred_ch, 0.9)
def test_apply_float_features_with_int_cat_features():
    """Train on two float and two integer categorical features and compare predictions."""
    name = 'test_apply_float_features_with_int_cat_features'
    train_size = 10000
    test_size = 10000

    def gen_data(size, seed):
        data = {
            'a': generate_uniform_float_column(size, 0., 1., seed + 1),
            'b': generate_uniform_float_column(size, 0., 1., seed + 2),
            'c': generate_uniform_int_column(size, 1, 4, seed + 3),
            'd': generate_uniform_int_column(size, 1, 4, seed + 4)
        }
        return DataFrame.from_dict(data)

    def get_target(df):
        def target_filter(row):
            return 1 if (row['a'] > .3 and row['b'] > .3 and row['c'] != 1) \
                or (row['a'] * row['b'] > 0.1 and row['c'] != 2 and row['d'] != 3) else 0

        # Series.as_matrix() was removed in pandas 1.0; .values is equivalent.
        return df.apply(target_filter, axis=1).values

    train_df = gen_data(train_size, 42)
    test_df = gen_data(test_size, 43)
    train_target = get_target(train_df)
    test_target = get_target(test_df)
    print()
    print('train target', train_target)
    print('test target', test_target)

    params = {
        'iterations': 6,
        'depth': 4,
        'learning_rate': 1,
        'loss_function': 'Logloss'
    }
    model = train_catboost_model(train_df, train_target, ['c', 'd'], params)
    pred_python = model.predict(test_df)

    server = ClickHouseServerWithCatboostModels(name, CLICKHOUSE_TESTS_SERVER_BIN_PATH, PORT)
    server.add_model(name, model)
    with server:
        # Positive ClickHouse output is mapped to class 1, everything else to 0.
        pred_ch = (np.array(server.apply_model(name, test_df, [])) > 0).astype(int)

    print('python predictions', pred_python)
    print('clickhouse predictions', pred_ch)
    check_predictions(name, test_target, pred_python, pred_ch, 0.9)
def test_apply_float_features_with_mixed_cat_features():
    """Train on float features plus one string and one integer categorical feature."""
    name = 'test_apply_float_features_with_mixed_cat_features'
    train_size = 10000
    test_size = 10000

    def gen_data(size, seed):
        data = {
            'a': generate_uniform_float_column(size, 0., 1., seed + 1),
            'b': generate_uniform_float_column(size, 0., 1., seed + 2),
            'c': generate_uniform_string_column(size, ['a', 'b', 'c'], seed + 3),
            'd': generate_uniform_int_column(size, 1, 4, seed + 4)
        }
        return DataFrame.from_dict(data)

    def get_target(df):
        def target_filter(row):
            return 1 if (row['a'] > .3 and row['b'] > .3 and row['c'] != 'a') \
                or (row['a'] * row['b'] > 0.1 and row['c'] != 'b' and row['d'] != 2) else 0

        # Series.as_matrix() was removed in pandas 1.0; .values is equivalent.
        return df.apply(target_filter, axis=1).values

    train_df = gen_data(train_size, 42)
    test_df = gen_data(test_size, 43)
    train_target = get_target(train_df)
    test_target = get_target(test_df)
    print()
    print('train target', train_target)
    print('test target', test_target)

    params = {
        'iterations': 6,
        'depth': 4,
        'learning_rate': 1,
        'loss_function': 'Logloss'
    }
    model = train_catboost_model(train_df, train_target, ['c', 'd'], params)
    pred_python = model.predict(test_df)

    server = ClickHouseServerWithCatboostModels(name, CLICKHOUSE_TESTS_SERVER_BIN_PATH, PORT)
    server.add_model(name, model)
    with server:
        # Positive ClickHouse output is mapped to class 1, everything else to 0.
        pred_ch = (np.array(server.apply_model(name, test_df, [])) > 0).astype(int)

    print('python predictions', pred_python)
    print('clickhouse predictions', pred_ch)
    check_predictions(name, test_target, pred_python, pred_ch, 0.9)
def test_apply_multiclass():
    """Train a three-class model and compare Python and ClickHouse class predictions."""
    # Own name for this test: it used to reuse the mixed-cat-features test's
    # name and therefore the same model file and table name.
    name = 'test_apply_multiclass'
    train_size = 10000
    test_size = 10000

    def gen_data(size, seed):
        data = {
            'a': generate_uniform_float_column(size, 0., 1., seed + 1),
            'b': generate_uniform_float_column(size, 0., 1., seed + 2),
            'c': generate_uniform_string_column(size, ['a', 'b', 'c'], seed + 3),
            'd': generate_uniform_int_column(size, 1, 4, seed + 4)
        }
        return DataFrame.from_dict(data)

    def get_target(df):
        def target_filter(row):
            if row['a'] > .3 and row['b'] > .3 and row['c'] != 'a':
                return 0
            elif row['a'] * row['b'] > 0.1 and row['c'] != 'b' and row['d'] != 2:
                return 1
            else:
                return 2

        # Series.as_matrix() was removed in pandas 1.0; .values is equivalent.
        return df.apply(target_filter, axis=1).values

    train_df = gen_data(train_size, 42)
    test_df = gen_data(test_size, 43)
    train_target = get_target(train_df)
    test_target = get_target(test_df)
    print()
    print('train target', train_target)
    print('test target', test_target)

    params = {
        'iterations': 10,
        'depth': 4,
        'learning_rate': 1,
        'loss_function': 'MultiClass'
    }
    model = train_catboost_model(train_df, train_target, ['c', 'd'], params)
    pred_python = model.predict(test_df)[:, 0].astype(int)

    server = ClickHouseServerWithCatboostModels(name, CLICKHOUSE_TESTS_SERVER_BIN_PATH, PORT)
    server.add_model(name, model)
    with server:
        # ClickHouse returns one value per class; the largest one picks the class.
        pred_ch = np.argmax(np.array(server.apply_model(name, test_df, [])), axis=1)

    print('python predictions', pred_python)
    print('clickhouse predictions', pred_ch)
    check_predictions(name, test_target, pred_python, pred_ch, 0.9)