From 2ae3f0b3b4da5314c01d05a7fde578bb2ab94fe4 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 31 Oct 2017 15:22:42 +0300 Subject: [PATCH] fixed tests [#CLICKHOUSE-3305] added aliases for catboost pool [#CLICKHOUSE-3305] --- dbms/src/Storages/StorageCatBoostPool.cpp | 29 +++++++++++++++++-- dbms/src/Storages/StorageCatBoostPool.h | 5 ++-- .../catboost/helpers/server_with_models.py | 13 +++------ .../test_apply_catboost_model/test.py | 8 ++--- 4 files changed, 37 insertions(+), 18 deletions(-) diff --git a/dbms/src/Storages/StorageCatBoostPool.cpp b/dbms/src/Storages/StorageCatBoostPool.cpp index 1aff70606a9..2aec0277af6 100644 --- a/dbms/src/Storages/StorageCatBoostPool.cpp +++ b/dbms/src/Storages/StorageCatBoostPool.cpp @@ -8,6 +8,7 @@ #include #include #include +#include namespace DB { @@ -167,6 +168,10 @@ void StorageCatBoostPool::parseColumnDescription() auto column_types_map = getColumnTypesMap(); auto column_types_string = getColumnTypesString(column_types_map); + /// Enumerate default names for columns as Auxiliary, Auxiliary1, Auxiliary2, ... + std::map columns_per_type_count; + size_t features_column_count = 0; + while (std::getline(in, line)) { ++line_num; @@ -188,7 +193,7 @@ void StorageCatBoostPool::parseColumnDescription() std::string str_id = tokens[0]; std::string col_type = tokens[1]; - std::string col_name = "feature" + (tokens.size() > 2 ? tokens[2] : str_id); + std::string col_alias = tokens.size() > 2 ? tokens[2] : ""; size_t num_id; try @@ -211,9 +216,20 @@ void StorageCatBoostPool::parseColumnDescription() ErrorCodes::CANNOT_PARSE_TEXT); auto type = column_types_map[col_type]; + + std::string col_name; if (type != DatasetColumnType::Num && type != DatasetColumnType::Categ) - col_name = col_type; - columns_description[num_id] = ColumnDescription(col_name, type); + { + auto & col_number = columns_per_type_count[type]; + col_name = col_type + (col_number ? std::to_string(col_number) : ""); + ++col_number; + } + else + { + col_name = "feature" + std::to_string(features_column_count); + ++features_column_count; + } + columns_description[num_id] = ColumnDescription(col_name, col_alias, type); } } @@ -240,6 +256,13 @@ void StorageCatBoostPool::createSampleBlockAndColumns() else materialized_columns.emplace_back(desc.column_name, type); + if (!desc.alias.empty()) + { + auto alias = std::make_shared(); + alias->name = desc.alias; + column_defaults[desc.alias] = {ColumnDefaultType::Alias, alias}; + } + sample_block.insert(ColumnWithTypeAndName(type->createColumn(), type, desc.column_name)); } columns.insert(columns.end(), num_columns.begin(), num_columns.end()); diff --git a/dbms/src/Storages/StorageCatBoostPool.h b/dbms/src/Storages/StorageCatBoostPool.h index 0f5d2803976..e20f860db54 100644 --- a/dbms/src/Storages/StorageCatBoostPool.h +++ b/dbms/src/Storages/StorageCatBoostPool.h @@ -67,11 +67,12 @@ private: struct ColumnDescription { std::string column_name; + std::string alias; DatasetColumnType column_type; ColumnDescription() : column_type(DatasetColumnType::Num) {} - ColumnDescription(std::string column_name, DatasetColumnType column_type) - : column_name(std::move(column_name)), column_type(column_type) {} + ColumnDescription(std::string column_name, std::string alias, DatasetColumnType column_type) + : column_name(std::move(column_name)), alias(std::move(alias)), column_type(column_type) {} }; std::vector columns_description; diff --git a/dbms/tests/external_models/catboost/helpers/server_with_models.py b/dbms/tests/external_models/catboost/helpers/server_with_models.py index 7f64fdb4cdb..e0ed81980e1 100644 --- a/dbms/tests/external_models/catboost/helpers/server_with_models.py +++ b/dbms/tests/external_models/catboost/helpers/server_with_models.py @@ -67,8 +67,6 @@ CATBOOST_MODEL_CONFIG = \ catboost {name} {path} - {float_features_count} - {cat_features_count} 0 @@ -94,8 +92,8 @@ class ClickHouseServerWithCatboostModels: stderr_file = os.path.join(self.root, 'server_stderr.txt') return ClickHouseServer(self.binary_path, self.config_path, stdout_file, stderr_file, self.shutdown_timeout) - def add_model(self, model_name, model, float_features_count, cat_features_count): - self.models[model_name] = (float_features_count, cat_features_count, model) + def add_model(self, model_name, model): + self.models[model_name] = model def apply_model(self, name, df, cat_feature_names): names = list(df) @@ -135,15 +133,12 @@ class ClickHouseServerWithCatboostModels: if not os.path.exists(self.models_dir): os.makedirs(self.models_dir) - for name, params in self.models.items(): - float_features_count, cat_features_count, model = params + for name, model in self.models.items(): model_path = os.path.join(self.models_dir, name + '.cbm') config_path = os.path.join(self.models_dir, name + '_model.xml') params = { 'name': name, - 'path': model_path, - 'float_features_count': float_features_count, - 'cat_features_count': cat_features_count + 'path': model_path } config = CATBOOST_MODEL_CONFIG.format(**params) with open(config_path, 'w') as f: diff --git a/dbms/tests/external_models/catboost/test_apply_catboost_model/test.py b/dbms/tests/external_models/catboost/test_apply_catboost_model/test.py index f2fa040a77f..792ba9a13c8 100644 --- a/dbms/tests/external_models/catboost/test_apply_catboost_model/test.py +++ b/dbms/tests/external_models/catboost/test_apply_catboost_model/test.py @@ -67,7 +67,7 @@ def test_apply_float_features_only(): pred_python = model.predict(test_df) server = ClickHouseServerWithCatboostModels(name, CLICKHOUSE_TESTS_SERVER_BIN_PATH, PORT) - server.add_model(name, model, 3, 0) + server.add_model(name, model) with server: pred_ch = (np.array(server.apply_model(name, test_df, [])) > 0).astype(int) @@ -120,7 +120,7 @@ def test_apply_float_features_with_string_cat_features(): pred_python = model.predict(test_df) server = ClickHouseServerWithCatboostModels(name, CLICKHOUSE_TESTS_SERVER_BIN_PATH, PORT) - server.add_model(name, model, 2, 2) + server.add_model(name, model) with server: pred_ch = (np.array(server.apply_model(name, test_df, [])) > 0).astype(int) @@ -173,7 +173,7 @@ def test_apply_float_features_with_int_cat_features(): pred_python = model.predict(test_df) server = ClickHouseServerWithCatboostModels(name, CLICKHOUSE_TESTS_SERVER_BIN_PATH, PORT) - server.add_model(name, model, 2, 2) + server.add_model(name, model) with server: pred_ch = (np.array(server.apply_model(name, test_df, [])) > 0).astype(int) @@ -226,7 +226,7 @@ def test_apply_float_features_with_mixed_cat_features(): pred_python = model.predict(test_df) server = ClickHouseServerWithCatboostModels(name, CLICKHOUSE_TESTS_SERVER_BIN_PATH, PORT) - server.add_model(name, model, 2, 2) + server.add_model(name, model) with server: pred_ch = (np.array(server.apply_model(name, test_df, [])) > 0).astype(int)