#include <fstream>
#include <sstream>

#include <boost/filesystem.hpp>

#include <Storages/StorageCatBoostPool.h>
#include <DataStreams/IBlockInputStream.h>
#include <Formats/FormatFactory.h>
#include <IO/ReadBufferFromFile.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataStreams/FilterColumnsBlockInputStream.h>
#include <Interpreters/Context.h>
#include <Parsers/ASTIdentifier.h>

namespace DB
{

namespace ErrorCodes
{
    extern const int CANNOT_OPEN_FILE;
    extern const int CANNOT_PARSE_TEXT;
    extern const int DATABASE_ACCESS_DENIED;
}

namespace
{
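
/// Reads the dataset (pool) file through the given input format (e.g. TSV) and yields
/// blocks of rows; all the work is delegated to the format's block input stream.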
class CatBoostDatasetBlockInputStream : public IBlockInputStream
{
public:
    CatBoostDatasetBlockInputStream(const std::string & file_name, const std::string & format_name,
                                    const Block & sample_block, const Context & context, UInt64 max_block_size)
        : file_name(file_name), format_name(format_name)
    {
        read_buf = std::make_unique<ReadBufferFromFile>(file_name);
        reader = FormatFactory::instance().getInput(format_name, *read_buf, sample_block, context, max_block_size);
    }

    String getName() const override
    {
        return "CatBoostDataset";
    }

    Block readImpl() override
    {
        return reader->read();
    }

    void readPrefixImpl() override
    {
        reader->readPrefix();
    }

    void readSuffixImpl() override
    {
        reader->readSuffix();
    }

    Block getHeader() const override { return reader->getHeader(); }

private:
    std::unique_ptr<ReadBufferFromFileDescriptor> read_buf;
    BlockInputStreamPtr reader;
    std::string file_name;
    std::string format_name;
};

}

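/// Path helpers: user-supplied paths are canonicalized and resolved relative to the
/// server data directory, and server daemons are not allowed to read outside of it.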
static boost::filesystem::path canonicalPath(std::string && path)
{
    return boost::filesystem::canonical(boost::filesystem::path(path));
}

static std::string resolvePath(const boost::filesystem::path & base_path, std::string && path)
{
    boost::filesystem::path resolved_path(path);
    if (!resolved_path.is_absolute())
        return boost::filesystem::canonical(resolved_path, base_path).string();
    return boost::filesystem::canonical(resolved_path).string();
}

static void checkCreationIsAllowed(const String & base_path, const String & path)
{
    if (base_path != path.substr(0, base_path.size()))
        throw Exception(
            "Using a file descriptor or a user-specified path as the source of storage isn't allowed for server daemons",
            ErrorCodes::DATABASE_ACCESS_DENIED);
}
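
/// The constructor resolves both description-file paths against the server data path,
/// checks (when running inside a server) that they stay within it, and then parses the
/// column description to build the sample block and column list.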
StorageCatBoostPool::StorageCatBoostPool(const Context & context,
                                         String column_description_file_name_,
                                         String data_description_file_name_)
    : column_description_file_name(std::move(column_description_file_name_)),
      data_description_file_name(std::move(data_description_file_name_))
{
    auto base_path = canonicalPath(context.getPath());
    column_description_file_name = resolvePath(base_path, std::move(column_description_file_name));
    data_description_file_name = resolvePath(base_path, std::move(data_description_file_name));
    if (context.getApplicationType() == Context::ApplicationType::SERVER)
    {
        const auto & base_path_str = base_path.string();
        checkCreationIsAllowed(base_path_str, column_description_file_name);
        checkCreationIsAllowed(base_path_str, data_description_file_name);
    }

    parseColumnDescription();
    createSampleBlockAndColumns();
}
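
/// Joins the names of all known column types into a comma-separated string for error messages.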
std::string StorageCatBoostPool::getColumnTypesString(const ColumnTypesMap & columnTypesMap)
{
    std::string types_string;
    bool first = true;
    for (const auto & value : columnTypesMap)
    {
        if (!first)
            types_string.append(", ");

        first = false;
        types_string += value.first;
    }

    return types_string;
}
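
/// Reads the first line of the dataset (pool) file and infers the number of columns
/// from the number of tab-separated fields.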
void StorageCatBoostPool::checkDatasetDescription()
{
    std::ifstream in(data_description_file_name);
    if (!in.good())
        throw Exception("Cannot open file: " + data_description_file_name, ErrorCodes::CANNOT_OPEN_FILE);

    std::string line;
    if (!std::getline(in, line))
        throw Exception("File is empty: " + data_description_file_name, ErrorCodes::CANNOT_PARSE_TEXT);

    size_t columns_count = 1;
    for (char sym : line)
        if (sym == '\t')
            ++columns_count;

    columns_description.resize(columns_count);
}
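
/// Parses the column description (CD) file. Each non-empty line has the form
/// "<column index>\t<column type>[\t<alias>]", for example "0\tTarget".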
void StorageCatBoostPool::parseColumnDescription()
{
    /// NOTE: simple parsing
    /// TODO: use ReadBufferFromFile

    checkDatasetDescription();

    std::ifstream in(column_description_file_name);
    if (!in.good())
        throw Exception("Cannot open file: " + column_description_file_name, ErrorCodes::CANNOT_OPEN_FILE);

    std::string line;
    size_t line_num = 0;
    auto column_types_map = getColumnTypesMap();
    auto column_types_string = getColumnTypesString(column_types_map);

    /// Enumerate default names for columns as Auxiliary, Auxiliary1, Auxiliary2, ...
    std::map<DatasetColumnType, size_t> columns_per_type_count;

    while (std::getline(in, line))
    {
        ++line_num;
        std::string str_line_num = std::to_string(line_num);

        if (line.empty())
            continue;

        std::istringstream iss(line);
        std::vector<std::string> tokens;
        std::string token;
        while (std::getline(iss, token, '\t'))
            tokens.push_back(token);

        if (tokens.size() != 2 && tokens.size() != 3)
            throw Exception("Cannot parse column description at line " + str_line_num + " '" + line + "' "
                            + ": expected 2 or 3 columns, got " + std::to_string(tokens.size()),
                            ErrorCodes::CANNOT_PARSE_TEXT);

        std::string str_id = tokens[0];
        std::string col_type = tokens[1];
        std::string col_alias = tokens.size() > 2 ? tokens[2] : "";

        size_t num_id;
        try
        {
            num_id = std::stoull(str_id);
        }
        catch (std::exception & e)
        {
            throw Exception("Cannot parse column index at row " + str_line_num + ": " + e.what(),
                            ErrorCodes::CANNOT_PARSE_TEXT);
        }

        if (num_id >= columns_description.size())
            throw Exception("Invalid index at row " + str_line_num + ": " + str_id
                            + ", expected in range [0, " + std::to_string(columns_description.size()) + ")",
                            ErrorCodes::CANNOT_PARSE_TEXT);

        if (column_types_map.count(col_type) == 0)
            throw Exception("Invalid column type: " + col_type + ", expected: " + column_types_string,
                            ErrorCodes::CANNOT_PARSE_TEXT);

        auto type = column_types_map[col_type];

        std::string col_name;

        bool is_feature_column = type == DatasetColumnType::Num || type == DatasetColumnType::Categ;
        auto & col_number = columns_per_type_count[type];
        /// If the column is not a feature, skip the '0' suffix (use 'Target' instead of 'Target0').
        col_name = col_type + (is_feature_column || col_number ? std::to_string(col_number) : "");
        ++col_number;

        columns_description[num_id] = ColumnDescription(col_name, col_alias, type);
    }
}
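
/// Builds the sample block and the table's column list. Categ, Auxiliary and DocId columns
/// are read as String, everything else as Float64. Columns are registered in the order:
/// numeric features, categorical features, then all other columns.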
void StorageCatBoostPool::createSampleBlockAndColumns()
{
    ColumnsDescription columns;
    NamesAndTypesList cat_columns;
    NamesAndTypesList num_columns;
    NamesAndTypesList other_columns;
    sample_block.clear();

    auto get_type = [](DatasetColumnType column_type) -> DataTypePtr
    {
        if (column_type == DatasetColumnType::Categ
            || column_type == DatasetColumnType::Auxiliary
            || column_type == DatasetColumnType::DocId)
            return std::make_shared<DataTypeString>();
        else
            return std::make_shared<DataTypeFloat64>();
    };

    for (auto & desc : columns_description)
    {
        DataTypePtr type = get_type(desc.column_type);

        if (desc.column_type == DatasetColumnType::Categ)
            cat_columns.emplace_back(desc.column_name, type);
        else if (desc.column_type == DatasetColumnType::Num)
            num_columns.emplace_back(desc.column_name, type);
        else
            other_columns.emplace_back(desc.column_name, type);

        sample_block.insert(ColumnWithTypeAndName(type, desc.column_name));
    }

    /// Order is important: first numeric columns, then categorical, then all others.
    for (const auto & column : num_columns)
        columns.add(DB::ColumnDescription(column.name, column.type));
    for (const auto & column : cat_columns)
        columns.add(DB::ColumnDescription(column.name, column.type));
    for (const auto & column : other_columns)
    {
        DB::ColumnDescription column_desc(column.name, column.type);
        /// We assign the Materialized kind to the column so that it doesn't show up in SELECT *.
        /// Because the table is read-only, we do not need a default expression.
        column_desc.default_desc.kind = ColumnDefaultKind::Materialized;
        columns.add(std::move(column_desc));
    }

    for (auto & desc : columns_description)
    {
        if (!desc.alias.empty())
        {
            DB::ColumnDescription column(desc.alias, get_type(desc.column_type));
            column.default_desc.kind = ColumnDefaultKind::Alias;
            column.default_desc.expression = std::make_shared<ASTIdentifier>(desc.column_name);
            columns.add(std::move(column));
        }
    }

    setColumns(columns);
}
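
/// Reads the dataset file as TSV and keeps only the columns requested by the query.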
BlockInputStreams StorageCatBoostPool::read(
    const Names & column_names,
    const SelectQueryInfo & /*query_info*/,
    const Context & context,
    QueryProcessingStage::Enum /*processed_stage*/,
    size_t max_block_size,
    unsigned /*threads*/)
{
    auto stream = std::make_shared<CatBoostDatasetBlockInputStream>(
        data_description_file_name, "TSV", sample_block, context, max_block_size);

    auto filter_stream = std::make_shared<FilterColumnsBlockInputStream>(stream, column_names, false);
    return { filter_stream };
}

}