mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 08:40:50 +00:00
6b2b3c1eb3
This commit moves the catboost model evaluation out of the server process into the library-bridge binary. This serves two goals: On the one hand, crashes / memory corruptions of the catboost library no longer affect the server. On the other hand, we can forbid loading dynamic libraries in the server (catboost was the last consumer of this functionality), thus improving security. SQL syntax: SELECT catboostEvaluate('/path/to/model.bin', FEAT_1, ..., FEAT_N) > 0 AS prediction, ACTION AS target FROM amazon_train LIMIT 10 Required configuration: <catboost_lib_path>/path/to/libcatboostmodel.so</catboost_lib_path> *** Implementation Details *** The internal protocol between the server and the library-bridge is simple: - HTTP GET on path "/extdict_ping": A ping, used during the handshake to check if the library-bridge runs. - HTTP POST on path "extdict_request" (1) Send a "catboost_GetTreeCount" request from the server to the bridge, containing a library path (e.g. /home/user/libcatboost.so) and a model path (e.g. /home/user/model.bin). First, this unloads the catboost library handler associated to the model path (if it was loaded), then loads the catboost library handler associated to the model path, then executes GetTreeCount() on the library handler and finally sends the result back to the server. Step (1) is called once by the server from FunctionCatBoostEvaluate::getReturnTypeImpl(). The library path handler is unloaded in the beginning because it contains state which may no longer be valid if the user runs catboost("/path/to/model.bin", ...) more than once and if "model.bin" was updated in between. (2) Send "catboost_Evaluate" from the server to the bridge, containing the model path and the features to run the inference on. Step (2) is called multiple times (once per chunk) by the server from function FunctionCatBoostEvaluate::executeImpl(). The library handler for the given model path is expected to be already loaded by Step (1). Fixes #27870
203 lines
9.7 KiB
Python
203 lines
9.7 KiB
Python
import os
|
|
import sys
|
|
import time
|
|
|
|
import pytest
|
|
|
|
# Put the parent of this file's directory on the import path; presumably this
# is where the shared `helpers` package lives (verify against the repo layout).
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Directory containing this test file; the model files are looked up under it.
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))

from helpers.cluster import ClickHouseCluster

# One cluster with a single instance, shared by every test in this module.
cluster = ClickHouseCluster(__file__)
instance = cluster.add_instance(
    "instance",
    main_configs=["config/models_config.xml"],
    # NOTE(review): presumably required because the fixture restarts the
    # server inside the running container -- confirm against the helper.
    stay_alive=True,
)
|
|
|
|
|
|
@pytest.fixture(scope="module")
def ch_cluster():
    """Start the cluster, copy the model directory into the container, restart the server.

    Yields the started cluster to the tests of this module and always shuts
    it down afterwards, including when the setup itself fails.

    Raises:
        subprocess.CalledProcessError: if ``docker cp`` exits with a non-zero
            status, so a failed copy is reported at setup time instead of
            surfacing later as unrelated-looking query errors.
    """
    # Local import: this fixture is the only place that spawns a process.
    import subprocess

    try:
        cluster.start()

        # Copy the *contents* of the local "model" directory (note the
        # trailing "/.") into the container. An argument list is used instead
        # of a shell string so paths containing spaces are passed intact.
        source = os.path.join(SCRIPT_DIR, "model/.")
        destination = "{cont_id}:{dist}".format(
            cont_id=instance.docker_id, dist="/etc/clickhouse-server/model"
        )
        subprocess.run(["docker", "cp", source, destination], check=True)

        instance.restart_clickhouse()

        yield cluster

    finally:
        cluster.shutdown()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# simple_model.bin has 2 float features and 9 categorical features
|
|
|
|
def testConstantFeatures(ch_cluster):
    """Evaluate simple_model.bin with every feature given as a literal."""
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    query = "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1.0, 2.0, 3, 4, 5, 6, 7, 8, 9, 10, 11);"
    prediction = instance.query(query)

    assert prediction == "-1.930268705869267\n"
|
|
|
|
|
|
def testNonConstantFeatures(ch_cluster):
    """Evaluate simple_model.bin with the features read from table columns."""
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    # Build a one-row table whose columns hold the feature values.
    setup_statements = (
        "DROP TABLE IF EXISTS T;",
        "CREATE TABLE T(ID UInt32, F1 Float32, F2 Float32, F3 UInt32, F4 UInt32, F5 UInt32, F6 UInt32, F7 UInt32, F8 UInt32, F9 Float32, F10 Float32, F11 Float32) ENGINE MergeTree ORDER BY ID;",
        "INSERT INTO T VALUES(0, 1.0, 2.0, 3, 4, 5, 6, 7, 8, 9, 10, 11);",
    )
    for statement in setup_statements:
        instance.query(statement)

    prediction = instance.query(
        "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', F1, F2, F3, F4, F5, F6, F7, F8, F9, F10, F11) from T;"
    )
    assert prediction == "-1.930268705869267\n"

    instance.query("DROP TABLE IF EXISTS T;")
|
|
|
|
|
|
def testModelPathIsNotAConstString(ch_cluster):
    """The model path argument must be a string, and a constant one."""
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    # Case 1: the first argument is a number instead of a string.
    wrong_type_error = instance.query_and_get_error(
        "select catboostEvaluate(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);"
    )
    assert (
        "Illegal type UInt8 of first argument of function catboostEvaluate, expected a string"
        in wrong_type_error
    )

    # Case 2: the first argument is a string, but read from a table column.
    instance.query("DROP TABLE IF EXISTS T;")
    instance.query("CREATE TABLE T(ID UInt32, A String) ENGINE MergeTree ORDER BY ID")
    instance.query("INSERT INTO T VALUES(0, 'test');")

    non_constant_error = instance.query_and_get_error(
        "select catboostEvaluate(A, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) FROM T;"
    )
    assert (
        "First argument of function catboostEvaluate must be a constant string"
        in non_constant_error
    )

    instance.query("DROP TABLE IF EXISTS T;")
|
|
|
|
|
|
def testWrongNumberOfFeatureArguments(ch_cluster):
    """Too few arguments must be reported with the expected error messages."""
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    # Only the model path, no feature at all.
    no_features_error = instance.query_and_get_error(
        "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin');"
    )
    assert "Function catboostEvaluate expects at least 2 arguments" in no_features_error

    # Two features where the model's error message reports 2 float + 9 categorical.
    too_few_features_error = instance.query_and_get_error(
        "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1, 2);"
    )
    assert (
        "Number of columns is different with number of features: columns size 2 float features size 2 + cat features size 9"
        in too_few_features_error
    )
|
|
|
|
|
|
def testFloatFeatureMustBeNumeric(ch_cluster):
    """A string passed as the second feature must be rejected as a float feature."""
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    query = "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1.0, 'a', 3, 4, 5, 6, 7, 8, 9, 10, 11);"
    error_text = instance.query_and_get_error(query)

    assert "Column 1 should be numeric to make float feature" in error_text
|
|
|
|
|
|
def testCategoricalFeatureMustBeNumericOrString(ch_cluster):
    """A tuple passed as the eighth feature must be rejected."""
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    query = "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1.0, 2.0, 3, 4, 5, 6, 7, tuple(8), 9, 10, 11);"
    error_text = instance.query_and_get_error(query)

    assert "Column 7 should be numeric or string" in error_text
|
|
|
|
|
|
def testOnLowCardinalityFeatures(ch_cluster):
    """Evaluate simple_model.bin with every feature wrapped in toLowCardinality()."""
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    # Same feature values as the constant-feature test, but on domain-compressed data.
    query = "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', toLowCardinality(1.0), toLowCardinality(2.0), toLowCardinality(3), toLowCardinality(4), toLowCardinality(5), toLowCardinality(6), toLowCardinality(7), toLowCardinality(8), toLowCardinality(9), toLowCardinality(10), toLowCardinality(11));"
    prediction = instance.query(query)

    assert prediction == "-1.930268705869267\n"
|
|
|
|
|
|
def testOnNullableFeatures(ch_cluster):
    """Nullable features holding values are accepted; actual NULLs are rejected."""
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    # Nullable wrappers around real values give the usual prediction.
    prediction = instance.query(
        "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', toNullable(1.0), toNullable(2.0), toNullable(3), toNullable(4), toNullable(5), toNullable(6), toNullable(7), toNullable(8), toNullable(9), toNullable(10), toNullable(11));"
    )
    assert prediction == "-1.930268705869267\n"

    # Actual NULLs are disallowed
    null_error = instance.query_and_get_error(
        "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', toNullable(NULL), toNullable(NULL), toNullable(NULL), toNullable(NULL), toNullable(NULL), toNullable(NULL), toNullable(NULL), toNullable(NULL), toNullable(NULL), toNullable(NULL), toNullable(NULL));"
    )
    assert "Column 0 should be numeric to make float feature" in null_error
|
|
|
|
|
|
def testInvalidLibraryPath(ch_cluster):
    """A missing catboost library must produce a "file doesn't exist" error.

    The library is renamed inside the container for the duration of the check
    and renamed back in a ``finally`` clause, so a failing query or assertion
    cannot leave the shared container without its library for later tests.
    """
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    # temporarily move library elsewhere
    instance.exec_in_container(["bash", "-c", "mv /etc/clickhouse-server/model/libcatboostmodel.so /etc/clickhouse-server/model/nonexistant.so"])

    try:
        err = instance.query_and_get_error("select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);")
        assert "Can't load library /etc/clickhouse-server/model/libcatboostmodel.so: file doesn't exist" in err
    finally:
        # restore, even when the check above failed
        instance.exec_in_container(["bash", "-c", "mv /etc/clickhouse-server/model/nonexistant.so /etc/clickhouse-server/model/libcatboostmodel.so"])
|
|
|
|
|
|
def testInvalidModelPath(ch_cluster):
    """An empty or non-existent model path must produce a "file doesn't exist" error."""
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    # Empty model path.
    empty_path_error = instance.query_and_get_error(
        "select catboostEvaluate('', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);"
    )
    assert "Can't load model : file doesn't exist" in empty_path_error

    # Path to a file that does not exist.
    missing_file_error = instance.query_and_get_error(
        "select catboostEvaluate('model_non_existant.bin', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);"
    )
    assert "Can't load model model_non_existant.bin: file doesn't exist" in missing_file_error
|
|
|
|
|
|
def testRecoveryAfterCrash(ch_cluster):
    """The same query must still succeed after the library-bridge process is killed."""
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    query = "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1.0, 2.0, 3, 4, 5, 6, 7, 8, 9, 10, 11);"
    expected = "-1.930268705869267\n"

    # Baseline evaluation before the kill.
    assert instance.query(query) == expected

    # Forcefully terminate the bridge process inside the container.
    kill_bridge = ["bash", "-c", "kill -9 `pidof clickhouse-library-bridge`"]
    instance.exec_in_container(kill_bridge, user="root")

    # The identical query is expected to return the identical result afterwards.
    assert instance.query(query) == expected
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# amazon_model.bin has 0 float features and 9 categorical features
|
|
|
|
def testAmazonModel(ch_cluster):
    """Evaluate amazon_model.bin on nine constant feature values."""
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    query = "select catboostEvaluate('/etc/clickhouse-server/model/amazon_model.bin', 1, 2, 3, 4, 5, 6, 7, 8, 9);"
    prediction = instance.query(query)

    assert prediction == "0.7774665009089274\n"
|
|
|
|
|
|
def testModelUpdate(ch_cluster):
    """A model file replaced on disk must be picked up by the next query.

    The amazon model is temporarily moved over the simple model's path. The
    same query that succeeded before must then fail with a feature-count
    mismatch. Both files are moved back in ``finally`` clauses, so a failing
    step cannot leave the shared container with swapped or missing models.
    """
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    query = "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1.0, 2.0, 3, 4, 5, 6, 7, 8, 9, 10, 11);"

    result = instance.query(query)
    expected = "-1.930268705869267\n"
    assert result == expected

    # simulate an update of the model: temporarily move the amazon model in place of the simple model
    instance.exec_in_container(["bash", "-c", "mv /etc/clickhouse-server/model/simple_model.bin /etc/clickhouse-server/model/simple_model.bin.bak"])
    try:
        instance.exec_in_container(["bash", "-c", "mv /etc/clickhouse-server/model/amazon_model.bin /etc/clickhouse-server/model/simple_model.bin"])
        try:
            # since the amazon model has a different number of features than the simple model, we should get an error
            err = instance.query_and_get_error(query)
            assert "Number of columns is different with number of features: columns size 11 float features size 0 + cat features size 9" in err
        finally:
            # restore the amazon model (only reached if it was actually moved)
            instance.exec_in_container(["bash", "-c", "mv /etc/clickhouse-server/model/simple_model.bin /etc/clickhouse-server/model/amazon_model.bin"])
    finally:
        # restore the simple model from its backup
        instance.exec_in_container(["bash", "-c", "mv /etc/clickhouse-server/model/simple_model.bin.bak /etc/clickhouse-server/model/simple_model.bin"])
|