commit 6c9f5e4991 (parent 39c73cb448)
Author: Nikita Mikhailov
Date:   2021-02-10 17:16:27 +03:00

5 changed files with 32 additions and 3 deletions

@@ -15,6 +15,12 @@ std::pair<bool, size_t> fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size)
     while (loadAtPosition(in, memory, pos) && (balance || memory.size() + static_cast<size_t>(pos - in.position()) < min_chunk_size))
     {
+        const auto current_object_size = memory.size() + static_cast<size_t>(pos - in.position());
+        if (current_object_size > 10 * min_chunk_size)
+            throw ParsingException("Size of JSON object is extremely large. Expected not greater than " +
+                std::to_string(min_chunk_size) + " bytes, but current is " + std::to_string(current_object_size) +
+                " bytes. Increase the value setting 'min_chunk_bytes_for_parallel_parsing' or check your data manually", ErrorCodes::INCORRECT_DATA);
         if (quotes)
         {
             pos = find_first_symbols<'\\', '"'>(pos, in.buffer().end());
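The segmentation engine keeps extending the current chunk until it has seen at least min_chunk_size bytes ending on a brace-balanced object boundary, so a single gigantic JSON object used to grow `memory` without limit. The added check throws once the object under accumulation exceeds 10 * min_chunk_size. A minimal standalone sketch of that pattern, assuming simplified quote/brace tracking over an in-memory string rather than ClickHouse's ReadBuffer (segmentOneChunk and its toy inputs are illustrative, not the real API):

// Standalone sketch of the guard above, not the ClickHouse sources.
#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <string>
#include <string_view>

size_t segmentOneChunk(std::string_view in, size_t min_chunk_size)
{
    size_t pos = 0;
    int balance = 0;      // currently unmatched '{'
    bool quotes = false;  // inside a string literal

    while (pos < in.size() && (balance || pos < min_chunk_size))
    {
        // The new check: give up once one object outgrows 10 * min_chunk_size.
        if (pos > 10 * min_chunk_size)
            throw std::runtime_error(
                "Size of JSON object is extremely large. Expected not greater than "
                + std::to_string(min_chunk_size) + " bytes, but current is "
                + std::to_string(pos) + " bytes.");

        const char c = in[pos++];
        if (quotes)
        {
            if (c == '\\')
                ++pos;                 // skip the escaped character
            else if (c == '"')
                quotes = false;
        }
        else if (c == '"')
            quotes = true;
        else if (c == '{')
            ++balance;
        else if (c == '}')
            --balance;
    }
    return pos;  // bytes consumed for this chunk
}

int main()
{
    // Two small objects: the chunk ends at the first balanced object
    // boundary after at least min_chunk_size bytes were read.
    std::cout << segmentOneChunk(R"({"a":"x"} {"b":"y"})", 4) << " bytes\n";

    // One object far beyond 10 * min_chunk_size triggers the guard.
    const std::string huge = "{\"a\":\"" + std::string(1000, 'x') + "\"}";
    try { segmentOneChunk(huge, 8); }
    catch (const std::exception & e) { std::cout << e.what() << '\n'; }
}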

@@ -1104,9 +1104,9 @@ void saveUpToPosition(ReadBuffer & in, DB::Memory<> & memory, char * current)
     assert(current >= in.position());
     assert(current <= in.buffer().end());
-    const int old_bytes = memory.size();
-    const int additional_bytes = current - in.position();
-    const int new_bytes = old_bytes + additional_bytes;
+    const size_t old_bytes = memory.size();
+    const size_t additional_bytes = current - in.position();
+    const size_t new_bytes = old_bytes + additional_bytes;
     /// There are no new bytes to add to memory.
     /// No need to do extra stuff.
     if (new_bytes == 0)
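Switching these counters from int to size_t matters because, after the segmentation change above, a single buffered object can legitimately exceed INT_MAX bytes; `current - in.position()` is a ptrdiff_t, and squeezing multi-gigabyte sizes through a 32-bit int wraps them negative. A small sketch of the failure mode (the values are illustrative):

// Why int -> size_t matters here: sizes beyond INT_MAX no longer fit in int.
#include <climits>
#include <cstddef>
#include <iostream>

int main()
{
    // A single buffered JSON object may exceed 2 GiB (see the previous hunk).
    const size_t additional_bytes = 3'000'000'000;  // ~3 GB

    // Out-of-range conversion to a signed 32-bit int wraps around
    // (implementation-defined before C++20), producing a negative "size".
    const int as_int = static_cast<int>(additional_bytes);

    std::cout << "INT_MAX:   " << INT_MAX << '\n';           // 2147483647
    std::cout << "as int:    " << as_int << '\n';            // -1294967296 on typical platforms
    std::cout << "as size_t: " << additional_bytes << '\n';  // 3000000000
}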

@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+import os
+import sys
+import random
+import pandas as pd
+import numpy as np
+CURDIR = os.path.dirname(os.path.realpath(__file__))
+sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
+from pure_http_client import ClickHouseClient

@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CURDIR"/../shell_config.sh
+python3 -c "print('{{\"a\":\"{}\", \"b\":\"{}\"}}'.format('clickhouse'* 10000000, 'dbms' * 100000000))" > big_json.json
+clickhouse-local --input_format_parallel_parsing=1 --max_memory_usage=0 -q "select count() from file('big_json.json', 'JSONEachRow', 'a String, b String')" 2>&1 | grep -q "min_chunk_bytes_for_parallel_parsing" && echo "Ok." || echo "FAIL" ||:
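The repro writes a single JSON row whose two string fields come to roughly half a gigabyte, far beyond ten times any reasonable min_chunk_bytes_for_parallel_parsing, so the new ParsingException fires and grep finds the setting name in the error output. A quick check of that arithmetic, using the repetition counts from the script above:

// Size of the generated test row: 'clickhouse' * 10000000 and 'dbms' * 100000000.
#include <cstddef>
#include <iostream>

int main()
{
    const size_t a = sizeof("clickhouse") - 1;  // 10 bytes per repetition
    const size_t b = sizeof("dbms") - 1;        // 4 bytes per repetition
    std::cout << "field a: " << a * 10'000'000 << " bytes\n";   // 100000000
    std::cout << "field b: " << b * 100'000'000 << " bytes\n";  // 400000000
}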