Added ExecutablePool documentation

This commit is contained in:
Maksim Kita 2021-06-11 11:28:49 +03:00
parent e549b29bff
commit 12982da130
5 changed files with 40 additions and 8 deletions

View File

@ -121,6 +121,36 @@ Setting fields:
- `command` — The absolute path to the executable file, or the file name (if the program directory is written to `PATH`).
- `format` — The file format. All the formats described in “[Formats](../../../interfaces/formats.md#formats)” are supported.
- `implicit_key` - The executable source file can return only values, and the correspondence to the requested keys is determined implicitly - by the order of rows in the result. Default value is false.
This dictionary source can be configured only via XML configuration. Creating dictionaries with an executable source via DDL is disabled; otherwise, the DB user would be able to execute arbitrary binaries on the ClickHouse node.
## Executable Pool {#dicts-external_dicts_dict_sources-executable_pool}
Executable pool allows loading data from a pool of processes. This source does not work with dictionary layouts that need to load all data from the source. Executable pool works if the dictionary is stored using the `cache`, `ssd_cache`, or `direct` layouts. Executable pool will spawn a pool of processes with the specified command and keep them running until they exit. The program should read data from STDIN while it is available and output the result to STDOUT, and it can wait for the next block of data on STDIN. ClickHouse will not close STDIN after processing a block of data but will pipe another chunk of data when needed. The executable script should be ready for this way of data processing — it should poll STDIN and flush data to STDOUT early.
Example of settings:
``` xml
<source>
<executable_pool>
<command>cat /opt/dictionaries/os.tsv</command>
<format>TabSeparated</format>
<pool_size>10</pool_size>
<max_command_execution_time>10</max_command_execution_time>
<implicit_key>false</implicit_key>
</executable_pool>
</source>
```
Setting fields:
- `command` — The absolute path to the executable file, or the file name (if the program directory is written to `PATH`).
- `format` — The file format. All the formats described in “[Formats](../../../interfaces/formats.md#formats)” are supported.
- `pool_size` - Size of the pool. If 0 is specified as `pool_size`, then there are no pool size restrictions.
- `command_termination_timeout` - The executable pool script should contain a main read-write loop. After the dictionary is destroyed, the pipe is closed, and the executable file will have `command_termination_timeout` seconds to shut down before ClickHouse sends a SIGTERM signal to the child process. Specified in seconds. Default value is 10. Optional parameter.
- `max_command_execution_time` - Maximum time the executable script command may take to process a block of data. Specified in seconds. Default value is 10. Optional parameter.
- `implicit_key` - The executable source file can return only values, and the correspondence to the requested keys is determined implicitly - by the order of rows in the result. Default value is false. Optional parameter.
This dictionary source can be configured only via XML configuration. Creating dictionaries with an executable source via DDL is disabled; otherwise, the DB user would be able to execute arbitrary binaries on the ClickHouse node.

View File

@ -71,8 +71,6 @@ ExecutableDictionarySource::ExecutableDictionarySource(
{
/// Remove keys from sample_block for implicit_key dictionary because
/// these columns will not be returned from source
/// Implicit key means that the source script will return only values,
/// and the correspondence to the requested keys is determined implicitly - by the order of rows in the result.
if (configuration.implicit_key)
{
auto keys_names = dict_struct.getKeysNames();
@ -277,11 +275,11 @@ void registerDictionarySourceExecutable(DictionarySourceFactory & factory)
ExecutableDictionarySource::Configuration configuration
{
.implicit_key = config.getBool(settings_config_prefix + ".implicit_key", false),
.command = config.getString(settings_config_prefix + ".command"),
.format = config.getString(settings_config_prefix + ".format"),
.update_field = config.getString(settings_config_prefix + ".update_field", ""),
.update_lag = config.getUInt64(settings_config_prefix + ".update_lag", 1),
.implicit_key = config.getBool(settings_config_prefix + ".implicit_key", false)
};
return std::make_unique<ExecutableDictionarySource>(dict_struct, configuration, sample_block, context_local_copy);

View File

@ -18,11 +18,13 @@ public:
struct Configuration
{
bool implicit_key;
const std::string command;
const std::string format;
const std::string update_field;
const UInt64 update_lag;
/// Implicit key means that the source script will return only values,
/// and the correspondence to the requested keys is determined implicitly - by the order of rows in the result.
const bool implicit_key;
};
ExecutableDictionarySource(

View File

@ -308,9 +308,9 @@ void registerDictionarySourceExecutablePool(DictionarySourceFactory & factory)
.command = config.getString(settings_config_prefix + ".command"),
.format = config.getString(settings_config_prefix + ".format"),
.pool_size = config.getUInt64(settings_config_prefix + ".size"),
.implicit_key = config.getBool(settings_config_prefix + ".implicit_key", false),
.command_termination_timeout = config.getUInt64(settings_config_prefix + ".command_termination_timeout", 10),
.max_command_execution_time = max_command_execution_time
.max_command_execution_time = max_command_execution_time,
.implicit_key = config.getBool(settings_config_prefix + ".implicit_key", false),
};
return std::make_unique<ExecutablePoolDictionarySource>(dict_struct, configuration, sample_block, context_local_copy);

View File

@ -22,7 +22,7 @@ using ProcessPool = BorrowedObjectPool<std::unique_ptr<ShellCommand>>;
* It is important that stream format will expect only rows that were requested.
* When stream is finished process is returned back to the ProcessPool.
* If there are no processes in pool during request client will be blocked
* until some process will be retunred to pool.
* until some process will be returned to pool.
*/
class ExecutablePoolDictionarySource final : public IDictionarySource
{
@ -32,9 +32,11 @@ public:
const String command;
const String format;
const size_t pool_size;
const bool implicit_key;
const size_t command_termination_timeout;
const size_t max_command_execution_time;
/// Implicit key means that the source script will return only values,
/// and the correspondence to the requested keys is determined implicitly - by the order of rows in the result.
const bool implicit_key;
};
ExecutablePoolDictionarySource(