commit 980528ed75 (parent 60ee52cb8d)
Author: Nikita Mikhaylov
Date: 2019-10-25 17:32:57 +03:00
4 changed files with 28 additions and 7 deletions


@@ -108,9 +108,9 @@ struct Settings : public SettingsCollection<Settings>
     M(SettingBool, distributed_group_by_no_merge, false, "Do not merge aggregation states from different servers for distributed query processing - in case it is for certain that there are different keys on different shards.") \
     M(SettingBool, optimize_skip_unused_shards, false, "Assumes that data is distributed by sharding_key. Optimization to skip unused shards if SELECT query filters by sharding_key.") \
     \
-    M(SettingBool, input_format_parallel_parsing, true, "Enable parallel parsing for several data formats (JSON, TSV, TKSV, Values, CSV).") \
-    M(SettingUInt64, max_threads_for_parallel_reading, 10, "The maximum number of threads to parallel reading. By default, it is set to max_threads.") \
-    M(SettingUInt64, min_chunk_size_for_parallel_reading, (1024 * 1024), "The minimum chunk size in bytes, which each thread tries to parse under mutex in parallel reading.") \
+    M(SettingBool, input_format_parallel_parsing, true, "Enable parallel parsing for several data formats (JSONEachRow, TSV, TSKV, CSV).") \
+    M(SettingUInt64, max_threads_for_parallel_parsing, 10, "The maximum number of threads for parallel parsing.") \
+    M(SettingUInt64, min_chunk_size_for_parallel_parsing, (1024 * 1024), "The minimum chunk size in bytes that each thread will parse in parallel.") \
     \
     M(SettingUInt64, merge_tree_min_rows_for_concurrent_read, (20 * 8192), "If at least as many lines are read from one file, the reading can be parallelized.") \
     M(SettingUInt64, merge_tree_min_bytes_for_concurrent_read, (24 * 10 * 1024 * 1024), "If at least as many bytes are read from one file, the reading can be parallelized.") \


@@ -16,8 +16,8 @@ namespace DB
 /**
  * ORDER-PRESERVING parallel parsing of data formats.
  * It splits original data into chunks. Then each chunk is parsed by different thread.
- * The number of chunks equals to max_threads_for_parallel_reading setting.
- * The size of chunk is equal to min_chunk_size_for_parallel_reading setting.
+ * The number of chunks equals the max_threads_for_parallel_parsing setting.
+ * The size of each chunk is equal to the min_chunk_size_for_parallel_parsing setting.
  */
 class ParallelParsingBlockInputStream : public IBlockInputStream
 {
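The header comment above describes the approach only at a high level. As a rough illustration of how order can be preserved while chunks are parsed on separate threads, here is a minimal standalone sketch with a placeholder parse step and byte-offset chunking; it is not the real ParallelParsingBlockInputStream, which cuts chunks on row boundaries via the file_segmentation_engine seen in FormatFactory below and produces blocks through the IBlockInputStream interface.

```cpp
// Minimal sketch only: names and structure are illustrative and do not reflect the
// actual ParallelParsingBlockInputStream internals. A real implementation must cut
// chunks on row boundaries (the file_segmentation_engine does that) rather than at
// arbitrary byte offsets, and it returns parsed blocks instead of strings.
#include <algorithm>
#include <future>
#include <iostream>
#include <string>
#include <vector>

std::vector<std::string> parseInParallel(const std::string & data,
                                         size_t min_chunk_size,
                                         size_t max_threads)
{
    if (max_threads == 0)
        max_threads = 1;

    std::vector<std::future<std::string>> futures;
    for (size_t offset = 0; offset < data.size();)
    {
        size_t length = std::min(min_chunk_size, data.size() - offset);
        if (futures.size() + 1 == max_threads)
            length = data.size() - offset;  // the last permitted chunk takes the remainder

        std::string chunk = data.substr(offset, length);
        offset += length;

        // Each chunk is handed to its own thread; the "parsing" here is a placeholder.
        futures.push_back(std::async(std::launch::async,
            [chunk = std::move(chunk)] { return "parsed:" + chunk; }));
    }

    // Collecting the futures in submission order preserves the original order of the
    // data regardless of which thread finishes first.
    std::vector<std::string> results;
    for (auto & future : futures)
        results.push_back(future.get());
    return results;
}

int main()
{
    for (const auto & part : parseInParallel("aaaa bbbb cccc dddd", 5, 3))
        std::cout << part << '\n';
}
```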


@@ -124,12 +124,12 @@ BlockInputStreamPtr FormatFactory::getInput(
     row_input_format_params.max_execution_time = settings.max_execution_time;
     row_input_format_params.timeout_overflow_mode = settings.timeout_overflow_mode;
-    size_t max_threads_to_use = settings.max_threads_for_parallel_reading;
+    size_t max_threads_to_use = settings.max_threads_for_parallel_parsing;
     if (!max_threads_to_use)
         max_threads_to_use = settings.max_threads;
     auto params = ParallelParsingBlockInputStream::InputCreatorParams{sample, context, row_input_format_params, format_settings};
-    ParallelParsingBlockInputStream::Builder builder{buf, input_getter, params, file_segmentation_engine, max_threads_to_use, settings.min_chunk_size_for_parallel_reading};
+    ParallelParsingBlockInputStream::Builder builder{buf, input_getter, params, file_segmentation_engine, max_threads_to_use, settings.min_chunk_size_for_parallel_parsing};
     return std::make_shared<ParallelParsingBlockInputStream>(builder);
 }


@@ -980,4 +980,25 @@ Lower values mean higher priority. Threads with low `nice` priority values are e
 Default value: 0.
+
+## input_format_parallel_parsing
+
+- Type: bool
+- Default value: True
+
+Enables order-preserving parallel parsing of data formats such as JSONEachRow, TSV, TSKV and CSV. Reading stays single-threaded, while parsing is performed by multiple threads.
+
+## max_threads_for_parallel_parsing
+
+- Type: unsigned int
+- Default value: 10
+
+The maximum number of threads for order-preserving parallel parsing of data formats.
+
+## min_chunk_size_for_parallel_parsing
+
+- Type: unsigned int
+- Default value: 1024 * 1024
+
+The minimum chunk size in bytes that each thread parses in parallel. By default it equals one megabyte.
+
 [Original article](https://clickhouse.yandex/docs/en/operations/settings/settings/) <!-- hide -->
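To make the interplay of the three settings documented above concrete, here is a small, hypothetical back-of-the-envelope calculation in standalone C++ (not ClickHouse code; the input size and the chunk arithmetic are illustrative assumptions, not the parser's actual scheduling):

```cpp
// Hypothetical illustration of the documented defaults; not ClickHouse code.
#include <algorithm>
#include <cstdint>
#include <iostream>

int main()
{
    // Defaults taken from the settings documented above.
    const bool     input_format_parallel_parsing       = true;
    const uint64_t max_threads_for_parallel_parsing    = 10;
    const uint64_t min_chunk_size_for_parallel_parsing = 1024 * 1024; // 1 MiB

    const uint64_t input_size = 8 * 1024 * 1024; // e.g. an 8 MiB TSV file (assumed)

    if (!input_format_parallel_parsing)
    {
        std::cout << "parsing stays single-threaded\n";
        return 0;
    }

    // At most one chunk per min_chunk_size bytes, and no more parsing threads
    // than max_threads_for_parallel_parsing.
    const uint64_t chunks  = (input_size + min_chunk_size_for_parallel_parsing - 1)
                             / min_chunk_size_for_parallel_parsing;
    const uint64_t threads = std::min(chunks, max_threads_for_parallel_parsing);

    std::cout << "chunks: " << chunks << ", parsing threads: " << threads << '\n';
    // Prints: chunks: 8, parsing threads: 8
}
```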