Merge pull request #47970 from ClickHouse/rs/fix-catboost

Fix "Field value too long" in catboostEvaluate()
This commit is contained in:
robot-ch-test-poll3 2023-03-26 04:16:42 +02:00 committed by GitHub
commit b18c051943
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 22 additions and 10 deletions

View File

@ -4,21 +4,22 @@
#include <Poco/Net/NetException.h>
#include <Poco/Util/HelpFormatter.h>
#include <base/range.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/SensitiveDataMasker.h>
#include "config.h"
#include <Common/StringUtils/StringUtils.h>
#include <Common/logger_useful.h>
#include <base/errnoToString.h>
#include <IO/ReadHelpers.h>
#include <Formats/registerFormats.h>
#include <Server/HTTP/HTTPServer.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteBufferFromFile.h>
#include <IO/WriteHelpers.h>
#include <Server/HTTP/HTTPServer.h>
#include <base/errnoToString.h>
#include <base/range.h>
#include <sys/time.h>
#include <sys/resource.h>
#include "config.h"
#if USE_ODBC
# include <Poco/Data/ODBC/Connector.h>
#endif
@ -89,7 +90,7 @@ void IBridge::defineOptions(Poco::Util::OptionSet & options)
Poco::Util::Option("listen-host", "", "hostname or address to listen, default 127.0.0.1").argument("listen-host").binding("listen-host"));
options.addOption(
Poco::Util::Option("http-timeout", "", "http timeout for socket, default 1800").argument("http-timeout").binding("http-timeout"));
Poco::Util::Option("http-timeout", "", "http timeout for socket, default 180").argument("http-timeout").binding("http-timeout"));
options.addOption(
Poco::Util::Option("max-server-connections", "", "max connections to server, default 1024").argument("max-server-connections").binding("max-server-connections"));
@ -97,6 +98,9 @@ void IBridge::defineOptions(Poco::Util::OptionSet & options)
options.addOption(
Poco::Util::Option("keep-alive-timeout", "", "keepalive timeout, default 10").argument("keep-alive-timeout").binding("keep-alive-timeout"));
options.addOption(
Poco::Util::Option("http-max-field-value-size", "", "max http field value size, default 1048576").argument("http-max-field-value-size").binding("http-max-field-value-size"));
options.addOption(
Poco::Util::Option("log-level", "", "sets log level, default info") .argument("log-level").binding("logger.level"));
@ -165,6 +169,7 @@ void IBridge::initialize(Application & self)
http_timeout = config().getUInt64("http-timeout", DEFAULT_HTTP_READ_BUFFER_TIMEOUT);
max_server_connections = config().getUInt("max-server-connections", 1024);
keep_alive_timeout = config().getUInt64("keep-alive-timeout", 10);
http_max_field_value_size = config().getUInt64("http-max-field-value-size", 1048576);
struct rlimit limit;
const UInt64 gb = 1024 * 1024 * 1024;
@ -226,6 +231,10 @@ int IBridge::main(const std::vector<std::string> & /*args*/)
auto context = Context::createGlobal(shared_context.get());
context->makeGlobalContext();
auto settings = context->getSettings();
settings.set("http_max_field_value_size", http_max_field_value_size);
context->setSettings(settings);
if (config().has("query_masking_rules"))
SensitiveDataMasker::setInstance(std::make_unique<SensitiveDataMasker>(config(), "query_masking_rules"));

View File

@ -45,6 +45,7 @@ private:
std::string log_level;
unsigned max_server_connections;
size_t http_timeout;
size_t http_max_field_value_size;
Poco::Logger * log;
};

View File

@ -67,6 +67,8 @@ std::unique_ptr<ShellCommand> IBridgeHelper::startBridgeCommand()
cmd_args.push_back(config.getString(configPrefix() + ".listen_host", DEFAULT_HOST));
cmd_args.push_back("--http-timeout");
cmd_args.push_back(std::to_string(getHTTPTimeout().totalMicroseconds()));
cmd_args.push_back("--http-max-field-value-size");
cmd_args.push_back("99999999999999999"); // something "big" to accept large datasets (issue 47616)
if (config.has("logger." + configPrefix() + "_log"))
{
cmd_args.push_back("--log-path");

View File

@ -279,7 +279,7 @@ def testAmazonModelManyRows(ch_cluster):
)
result = instance.query(
"insert into amazon select number % 256, number, number, number, number, number, number, number, number, number from numbers(7500)"
"insert into amazon select number % 256, number, number, number, number, number, number, number, number, number from numbers(750000)"
)
# First compute prediction, then as a very crude way to fingerprint and compare the result: sum and floor
@ -288,7 +288,7 @@ def testAmazonModelManyRows(ch_cluster):
"SELECT floor(sum(catboostEvaluate('/etc/clickhouse-server/model/amazon_model.bin', RESOURCE, MGR_ID, ROLE_ROLLUP_1, ROLE_ROLLUP_2, ROLE_DEPTNAME, ROLE_TITLE, ROLE_FAMILY_DESC, ROLE_FAMILY, ROLE_CODE))) FROM amazon"
)
expected = "5834\n"
expected = "583092\n"
assert result == expected
result = instance.query("drop table if exists amazon")