Fix single carriage return processing in TSV file segmentation engine

This commit is contained in:
avogar 2023-08-14 11:33:41 +00:00
parent 47b551180b
commit 4c12914406
3 changed files with 18 additions and 5 deletions

View File

@ -411,11 +411,6 @@ static std::pair<bool, size_t> fileSegmentationEngineTabSeparatedImpl(ReadBuffer
continue;
}
++number_of_rows;
if ((number_of_rows >= min_rows)
&& ((memory.size() + static_cast<size_t>(pos - in.position()) >= min_bytes) || (number_of_rows == max_rows)))
need_more_data = false;
if (*pos == '\n')
{
++pos;
@ -427,7 +422,14 @@ static std::pair<bool, size_t> fileSegmentationEngineTabSeparatedImpl(ReadBuffer
++pos;
if (loadAtPosition(in, memory, pos) && *pos == '\n')
++pos;
else
continue;
}
++number_of_rows;
if ((number_of_rows >= min_rows)
&& ((memory.size() + static_cast<size_t>(pos - in.position()) >= min_bytes) || (number_of_rows == max_rows)))
need_more_data = false;
}
saveUpToPosition(in, memory, pos);

View File

@ -0,0 +1,10 @@
#!/usr/bin/env bash
# Regression test for TSV file segmentation when a value contains a carriage
# return that is not followed by a line feed ('Hello\rWorld').
# Writes 1000000 such rows in TSVRaw format to a file, reads the file back
# through the file() table function, and prints the resulting row count.

CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh

# Build the data file name once and keep every use quoted, so a test name with
# spaces or glob characters cannot be word-split or expanded by the shell.
DATA_FILE="$CLICKHOUSE_TEST_UNIQUE_NAME.tsv"

# NOTE(review): $CLICKHOUSE_LOCAL is left unquoted on purpose; it presumably
# expands to a command plus arguments (defined in shell_config.sh - confirm).
$CLICKHOUSE_LOCAL -q "select 'Hello\rWorld' from numbers(1000000) format TSVRaw" > "$DATA_FILE"
$CLICKHOUSE_LOCAL -q "select count() from file('$DATA_FILE')"

# Remove the temporary data file created above.
rm "$DATA_FILE"