ClickHouse/tests/integration/test_catboost_evaluate/test.py

import os
import sys
import time

import pytest

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))

from helpers.cluster import ClickHouseCluster

cluster = ClickHouseCluster(__file__)

instance = cluster.add_instance(
    "instance",
    stay_alive=True,
    main_configs=["config/models_config.xml", "config/logger_library_bridge.xml"],
)


@pytest.fixture(scope="module")
def ch_cluster():
    try:
        cluster.start()

        instance.exec_in_container(["mkdir", f"/etc/clickhouse-server/model/"])

        machine = instance.get_machine_name()
        for source_name in os.listdir(os.path.join(SCRIPT_DIR, "model/.")):
            dest_name = source_name
            if machine in source_name:
                machine_suffix = "_" + machine
                dest_name = source_name[: -len(machine_suffix)]

            os.system(
                "docker cp {local} {cont_id}:{dist}".format(
                    local=os.path.join(SCRIPT_DIR, f"model/{source_name}"),
                    cont_id=instance.docker_id,
                    dist=f"/etc/clickhouse-server/model/{dest_name}",
                )
            )

        instance.restart_clickhouse()

        yield cluster

    finally:
        cluster.shutdown()


# ---------------------------------------------------------------------------
# simple_model.bin has 2 float features and 9 categorical features


def testConstantFeatures(ch_cluster):
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    result = instance.query("system reload models")

    result = instance.query(
        "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1.0, 2.0, 3, 4, 5, 6, 7, 8, 9, 10, 11);"
    )
    expected = "-1.930268705869267\n"
    assert result == expected


def testNonConstantFeatures(ch_cluster):
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    result = instance.query("system reload models")

    instance.query("DROP TABLE IF EXISTS T;")
    instance.query(
        "CREATE TABLE T(ID UInt32, F1 Float32, F2 Float32, F3 UInt32, F4 UInt32, F5 UInt32, F6 UInt32, F7 UInt32, F8 UInt32, F9 Float32, F10 Float32, F11 Float32) ENGINE MergeTree ORDER BY ID;"
    )
    instance.query("INSERT INTO T VALUES(0, 1.0, 2.0, 3, 4, 5, 6, 7, 8, 9, 10, 11);")

    result = instance.query(
        "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', F1, F2, F3, F4, F5, F6, F7, F8, F9, F10, F11) from T;"
    )
    expected = "-1.930268705869267\n"
    assert result == expected

    instance.query("DROP TABLE IF EXISTS T;")


def testModelPathIsNotAConstString(ch_cluster):
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    result = instance.query("system reload models")

    err = instance.query_and_get_error(
        "select catboostEvaluate(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);"
    )
    assert (
        "Illegal type UInt8 of first argument of function catboostEvaluate, expected a string"
        in err
    )

    instance.query("DROP TABLE IF EXISTS T;")
    instance.query("CREATE TABLE T(ID UInt32, A String) ENGINE MergeTree ORDER BY ID")
    instance.query("INSERT INTO T VALUES(0, 'test');")
    err = instance.query_and_get_error(
        "select catboostEvaluate(A, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) FROM T;"
    )
    assert (
        "First argument of function catboostEvaluate must be a constant string" in err
    )
    instance.query("DROP TABLE IF EXISTS T;")


def testWrongNumberOfFeatureArguments(ch_cluster):
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    result = instance.query("system reload models")

    err = instance.query_and_get_error(
        "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin');"
    )
    assert "Function catboostEvaluate expects at least 2 arguments" in err

    err = instance.query_and_get_error(
        "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1, 2);"
    )
    assert (
        "Number of columns is different with number of features: columns size 2 float features size 2 + cat features size 9"
        in err
    )


def testFloatFeatureMustBeNumeric(ch_cluster):
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    result = instance.query("system reload models")

    err = instance.query_and_get_error(
        "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1.0, 'a', 3, 4, 5, 6, 7, 8, 9, 10, 11);"
    )
    assert "Column 1 should be numeric to make float feature" in err


def testCategoricalFeatureMustBeNumericOrString(ch_cluster):
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    result = instance.query("system reload models")

    err = instance.query_and_get_error(
        "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1.0, 2.0, 3, 4, 5, 6, 7, tuple(8), 9, 10, 11);"
    )
    assert "Column 7 should be numeric or string" in err


def testOnLowCardinalityFeatures(ch_cluster):
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    result = instance.query("system reload models")

    # same but on domain-compressed data
    result = instance.query(
        "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', toLowCardinality(1.0), toLowCardinality(2.0), toLowCardinality(3), toLowCardinality(4), toLowCardinality(5), toLowCardinality(6), toLowCardinality(7), toLowCardinality(8), toLowCardinality(9), toLowCardinality(10), toLowCardinality(11));"
    )
    expected = "-1.930268705869267\n"
    assert result == expected


def testOnNullableFeatures(ch_cluster):
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    result = instance.query("system reload models")

    result = instance.query(
        "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', toNullable(1.0), toNullable(2.0), toNullable(3), toNullable(4), toNullable(5), toNullable(6), toNullable(7), toNullable(8), toNullable(9), toNullable(10), toNullable(11));"
    )
    expected = "-1.930268705869267\n"
    assert result == expected

    # Actual NULLs are disallowed
    err = instance.query_and_get_error(
        "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', toNullable(NULL), toNullable(NULL), toNullable(NULL), toNullable(NULL), toNullable(NULL), toNullable(NULL), toNullable(NULL), toNullable(NULL), toNullable(NULL), toNullable(NULL), toNullable(NULL));"
    )
    assert "Column 0 should be numeric to make float feature" in err


def testInvalidLibraryPath(ch_cluster):
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    result = instance.query("system reload models")

    # temporarily move library elsewhere
    instance.exec_in_container(
        [
            "bash",
            "-c",
            "mv /etc/clickhouse-server/model/libcatboostmodel.so /etc/clickhouse-server/model/nonexistant.so",
        ]
    )

    err = instance.query_and_get_error(
        "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);"
    )
    assert (
        "Can't load library /etc/clickhouse-server/model/libcatboostmodel.so: file doesn't exist"
        in err
    )

    # restore
    instance.exec_in_container(
        [
            "bash",
            "-c",
            "mv /etc/clickhouse-server/model/nonexistant.so /etc/clickhouse-server/model/libcatboostmodel.so",
        ]
    )


def testInvalidModelPath(ch_cluster):
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    result = instance.query("system reload models")

    err = instance.query_and_get_error(
        "select catboostEvaluate('', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);"
    )
    assert "Can't load model : file doesn't exist" in err

    err = instance.query_and_get_error(
        "select catboostEvaluate('model_non_existant.bin', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);"
    )
    assert "Can't load model model_non_existant.bin: file doesn't exist" in err


def testRecoveryAfterCrash(ch_cluster):
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    result = instance.query("system reload models")

    result = instance.query(
        "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1.0, 2.0, 3, 4, 5, 6, 7, 8, 9, 10, 11);"
    )
    expected = "-1.930268705869267\n"
    assert result == expected

    instance.exec_in_container(
        ["bash", "-c", "kill -9 `pidof clickhouse-library-bridge`"], user="root"
    )

    result = instance.query(
        "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1.0, 2.0, 3, 4, 5, 6, 7, 8, 9, 10, 11);"
    )
    assert result == expected


# ---------------------------------------------------------------------------
# amazon_model.bin has 0 float features and 9 categorical features


def testAmazonModelSingleRow(ch_cluster):
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    result = instance.query("system reload models")

    result = instance.query(
        "select catboostEvaluate('/etc/clickhouse-server/model/amazon_model.bin', 1, 2, 3, 4, 5, 6, 7, 8, 9);"
    )
    expected = "0.7774665009089274\n"
    assert result == expected


def testAmazonModelManyRows(ch_cluster):
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    result = instance.query("system reload models")

    result = instance.query("drop table if exists amazon")

    result = instance.query(
        "create table amazon ( DATE Date materialized today(), ACTION UInt8, RESOURCE UInt32, MGR_ID UInt32, ROLE_ROLLUP_1 UInt32, ROLE_ROLLUP_2 UInt32, ROLE_DEPTNAME UInt32, ROLE_TITLE UInt32, ROLE_FAMILY_DESC UInt32, ROLE_FAMILY UInt32, ROLE_CODE UInt32) engine = MergeTree order by DATE"
    )

    result = instance.query(
        "insert into amazon select number % 256, number, number, number, number, number, number, number, number, number from numbers(750000)"
    )

    # First compute prediction, then as a very crude way to fingerprint and compare the result: sum and floor
    # (the focus is to test that the exchange of large result sets between the server and the bridge works)
    result = instance.query(
        "SELECT floor(sum(catboostEvaluate('/etc/clickhouse-server/model/amazon_model.bin', RESOURCE, MGR_ID, ROLE_ROLLUP_1, ROLE_ROLLUP_2, ROLE_DEPTNAME, ROLE_TITLE, ROLE_FAMILY_DESC, ROLE_FAMILY, ROLE_CODE))) FROM amazon"
    )

    expected = "583092\n"
    assert result == expected

    result = instance.query("drop table if exists amazon")


def testModelUpdate(ch_cluster):
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    result = instance.query("system reload models")

    query = "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1.0, 2.0, 3, 4, 5, 6, 7, 8, 9, 10, 11);"

    result = instance.query(query)
    expected = "-1.930268705869267\n"
    assert result == expected

    # simulate an update of the model: temporarily move the amazon model in place of the simple model
    instance.exec_in_container(
        [
            "bash",
            "-c",
            "mv /etc/clickhouse-server/model/simple_model.bin /etc/clickhouse-server/model/simple_model.bin.bak",
        ]
    )
    instance.exec_in_container(
        [
            "bash",
            "-c",
            "mv /etc/clickhouse-server/model/amazon_model.bin /etc/clickhouse-server/model/simple_model.bin",
        ]
    )

    # unload simple model
    result = instance.query(
        "system reload model '/etc/clickhouse-server/model/simple_model.bin'"
    )

    # load the simple-model-camouflaged amazon model
    result = instance.query(
        "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1, 2, 3, 4, 5, 6, 7, 8, 9);"
    )
    expected = "0.7774665009089274\n"
    assert result == expected

    # restore
    instance.exec_in_container(
        [
            "bash",
            "-c",
            "mv /etc/clickhouse-server/model/simple_model.bin /etc/clickhouse-server/model/amazon_model.bin",
        ]
    )
    instance.exec_in_container(
        [
            "bash",
            "-c",
            "mv /etc/clickhouse-server/model/simple_model.bin.bak /etc/clickhouse-server/model/simple_model.bin",
        ]
    )


def testSystemModelsAndModelRefresh(ch_cluster):
    if instance.is_built_with_memory_sanitizer():
        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")

    result = instance.query("system reload models")

    # check model system view
    result = instance.query("select * from system.models")
    expected = ""
    assert result == expected

    # load simple model
    result = instance.query(
        "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1.0, 2.0, 3, 4, 5, 6, 7, 8, 9, 10, 11);"
    )
    expected = "-1.930268705869267\n"
    assert result == expected

    # check model system view with one model loaded
    result = instance.query("select * from system.models")
    assert result.count("\n") == 1
    expected = "/etc/clickhouse-server/model/simple_model.bin"
    assert expected in result

    # load amazon model
    result = instance.query(
        "select catboostEvaluate('/etc/clickhouse-server/model/amazon_model.bin', 1, 2, 3, 4, 5, 6, 7, 8, 9);"
    )
    expected = "0.7774665009089274\n"
    assert result == expected

    # check model system view with one model loaded
    result = instance.query("select * from system.models")
    assert result.count("\n") == 2
    expected = "/etc/clickhouse-server/model/simple_model.bin"
    assert expected in result
    expected = "/etc/clickhouse-server/model/amazon_model.bin"
    assert expected in result

    # unload simple model
    result = instance.query(
        "system reload model '/etc/clickhouse-server/model/simple_model.bin'"
    )

    # check model system view, it should not display the removed model
    result = instance.query("select * from system.models")
    assert result.count("\n") == 1
    expected = "/etc/clickhouse-server/model/amazon_model.bin"
    assert expected in result