diff --git a/src/Formats/JSONEachRowUtils.cpp b/src/Formats/JSONEachRowUtils.cpp
index 6017f3983c6..980512c72d7 100644
--- a/src/Formats/JSONEachRowUtils.cpp
+++ b/src/Formats/JSONEachRowUtils.cpp
@@ -15,6 +15,12 @@ std::pair<bool, size_t> fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size)
 
     while (loadAtPosition(in, memory, pos) && (balance || memory.size() + static_cast<size_t>(pos - in.position()) < min_chunk_size))
     {
+        const auto current_object_size = memory.size() + static_cast<size_t>(pos - in.position());
+        if (current_object_size > 10 * min_chunk_size)
+            throw ParsingException("Size of JSON object is extremely large. Expected not greater than " +
+                std::to_string(min_chunk_size) + " bytes, but current is " + std::to_string(current_object_size) +
+                " bytes. Increase the value setting 'min_chunk_bytes_for_parallel_parsing' or check your data manually", ErrorCodes::INCORRECT_DATA);
+
         if (quotes)
         {
             pos = find_first_symbols<'\\', '"'>(pos, in.buffer().end());
diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp
index 5a159defe06..baa12297718 100644
--- a/src/IO/ReadHelpers.cpp
+++ b/src/IO/ReadHelpers.cpp
@@ -1104,9 +1104,9 @@ void saveUpToPosition(ReadBuffer & in, DB::Memory<> & memory, char * current)
     assert(current >= in.position());
     assert(current <= in.buffer().end());
 
-    const int old_bytes = memory.size();
-    const int additional_bytes = current - in.position();
-    const int new_bytes = old_bytes + additional_bytes;
+    const size_t old_bytes = memory.size();
+    const size_t additional_bytes = current - in.position();
+    const size_t new_bytes = old_bytes + additional_bytes;
     /// There are no new bytes to add to memory.
     /// No need to do extra stuff.
     if (new_bytes == 0)
diff --git a/tests/queries/0_stateless/01654_geometry_functions_benchmark.python b/tests/queries/0_stateless/01654_geometry_functions_benchmark.python
new file mode 100644
index 00000000000..d1fe971af28
--- /dev/null
+++ b/tests/queries/0_stateless/01654_geometry_functions_benchmark.python
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+import os
+import sys
+import random
+import pandas as pd
+import numpy as np
+
+CURDIR = os.path.dirname(os.path.realpath(__file__))
+sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
+
+from pure_http_client import ClickHouseClient
+
+
diff --git a/tests/queries/0_stateless/01701_parallel_parsing_infinite_segmentation.reference b/tests/queries/0_stateless/01701_parallel_parsing_infinite_segmentation.reference
new file mode 100644
index 00000000000..587579af915
--- /dev/null
+++ b/tests/queries/0_stateless/01701_parallel_parsing_infinite_segmentation.reference
@@ -0,0 +1 @@
+Ok.
diff --git a/tests/queries/0_stateless/01701_parallel_parsing_infinite_segmentation.sh b/tests/queries/0_stateless/01701_parallel_parsing_infinite_segmentation.sh
new file mode 100755
index 00000000000..2fea04c6abe
--- /dev/null
+++ b/tests/queries/0_stateless/01701_parallel_parsing_infinite_segmentation.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CURDIR"/../shell_config.sh
+
+python3 -c "print('{{\"a\":\"{}\", \"b\":\"{}\"}}'.format('clickhouse'* 10000000, 'dbms' * 100000000))" > big_json.json
+
+clickhouse-local --input_format_parallel_parsing=1 --max_memory_usage=0 -q "select count() from file('big_json.json', 'JSONEachRow', 'a String, b String')" 2>&1 | grep -q "min_chunk_bytes_for_parallel_parsing" && echo "Ok." || echo "FAIL" ||:
\ No newline at end of file
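
For context (not part of the patch): the new guard fires when a single JSON row grows past 10x min_chunk_size, and the error text points the user at the 'min_chunk_bytes_for_parallel_parsing' setting. A minimal sketch of the suggested workaround, assuming the ~500 MB big_json.json produced by the test above; the exact value is illustrative, it only needs to keep the row below ten times the chunk size:

# Hypothetical follow-up to the failing test query: raise the chunk-size setting so the
# 10x guard no longer trips and the oversized row is buffered as a single chunk instead.
clickhouse-local --input_format_parallel_parsing=1 --max_memory_usage=0 \
    --min_chunk_bytes_for_parallel_parsing=1000000000 \
    -q "select count() from file('big_json.json', 'JSONEachRow', 'a String, b String')"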