Improve JSON reading by ignoring key case

This commit is contained in:
kevinyhzou 2024-03-22 15:32:43 +08:00
parent 2ff562ebe2
commit 7c9dbdbd9c
8 changed files with 43 additions and 1 deletions

View File

@ -1057,6 +1057,7 @@ class IColumn;
M(Bool, input_format_json_defaults_for_missing_elements_in_named_tuple, true, "Insert default value in named tuple element if it's missing in json object", 0) \
M(Bool, input_format_json_throw_on_bad_escape_sequence, true, "Throw an exception if JSON string contains bad escape sequence in JSON input formats. If disabled, bad escape sequences will remain as is in the data", 0) \
M(Bool, input_format_json_ignore_unnecessary_fields, true, "Ignore unnecessary fields and not parse them. Enabling this may not throw exceptions on json strings of invalid format or with duplicated fields", 0) \
M(Bool, input_format_json_ignore_key_case, false, "Ignore json key case while read json field from string", 0) \
M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \
M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \
M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \

View File

@ -145,6 +145,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
format_settings.json.try_infer_objects_as_tuples = settings.input_format_json_try_infer_named_tuples_from_objects;
format_settings.json.throw_on_bad_escape_sequence = settings.input_format_json_throw_on_bad_escape_sequence;
format_settings.json.ignore_unnecessary_fields = settings.input_format_json_ignore_unnecessary_fields;
format_settings.json.ignore_key_case = settings.input_format_json_ignore_key_case;
format_settings.null_as_default = settings.input_format_null_as_default;
format_settings.decimal_trailing_zeros = settings.output_format_decimal_trailing_zeros;
format_settings.parquet.row_group_rows = settings.output_format_parquet_row_group_size;

View File

@ -224,6 +224,7 @@ struct FormatSettings
bool infer_incomplete_types_as_strings = true;
bool throw_on_bad_escape_sequence = true;
bool ignore_unnecessary_fields = true;
bool ignore_key_case = false;
} json{};
struct

View File

@ -45,6 +45,14 @@ JSONEachRowRowInputFormat::JSONEachRowRowInputFormat(
{
const auto & header = getPort().getHeader();
name_map = header.getNamesToIndexesMap();
if (format_settings_.json.ignore_key_case)
{
for (auto it = name_map.begin(); it != name_map.end(); ++it)
{
StringRef key = it->first;
transformFieldNameToLowerCase(key);
}
}
if (format_settings_.import_nested_json)
{
for (size_t i = 0; i != header.columns(); ++i)
@ -170,6 +178,10 @@ void JSONEachRowRowInputFormat::readJSONObject(MutableColumns & columns)
skipUnknownField(name_ref);
continue;
}
if (format_settings.json.ignore_key_case)
{
transformFieldNameToLowerCase(name_ref);
}
const size_t column_index = columnIndex(name_ref, key_index);
if (unlikely(ssize_t(column_index) < 0))

View File

@ -55,7 +55,13 @@ private:
/// Default implementation is a no-op; virtual so that derived formats can override it.
/// NOTE(review): presumably invoked before the fields of a row are read — confirm against callers.
virtual void readRowStart(MutableColumns &) {}
/// Default implementation is a no-op; virtual so that derived formats can override it.
/// NOTE(review): presumably the counterpart of readRowStart used when a row is skipped — confirm against callers.
virtual void skipRowStart() {}
/// Lower-cases, in place, the bytes referenced by `field_name` using std::tolower
/// (single-byte conversion; multi-byte UTF-8 sequences are not case-folded).
/// The function writes through the reference via const_cast, so the referenced buffer
/// must be writable and must not be shared with anything relying on the original case.
/// Does nothing if the reference holds a null data pointer.
void transformFieldNameToLowerCase(StringRef & field_name)
{
    if (!field_name.data)
        return;

    char * name_data = const_cast<char *>(field_name.data);
    for (size_t i = 0; i < field_name.size; ++i)
    {
        /// std::tolower has undefined behaviour for arguments that are neither EOF nor
        /// representable as unsigned char, so bytes >= 0x80 (negative when char is signed)
        /// must be converted to unsigned char first.
        name_data[i] = static_cast<char>(std::tolower(static_cast<unsigned char>(name_data[i])));
    }
}
/// Buffer for the read from the stream field name. Used when you have to copy it.
/// Also, if processing of Nested data is in progress, it holds the common prefix
/// of the nested column names (so that appending the field name to it produces

View File

@ -0,0 +1,3 @@
1 77328912 Ben
2 77328913 Jim
3 77328914 Bill

View File

@ -0,0 +1,18 @@
#!/usr/bin/env bash
# Functional test: JSONEachRow input must match JSON keys to column names
# case-insensitively when input_format_json_ignore_key_case is enabled.
# NOTE: this sh wrapper is required because of shell_config
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
# Discover the server's user_files directory: query a file that does not exist and
# strip the file name from the path reported in the FILE_DOESNT_EXIST (code 107) error.
USER_FILES_PATH=$($CLICKHOUSE_CLIENT --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep -E '^Code: 107.*FILE_DOESNT_EXIST' | head -1 | awk '{gsub("/nonexist.txt","",$9); print $9}')
# Quote the destination so a path containing spaces or glob characters is not split/expanded.
cp "$CURDIR"/data_json/key_ignore_case.json "$USER_FILES_PATH"/
$CLICKHOUSE_CLIENT -q "drop table if exists test_tbl"
$CLICKHOUSE_CLIENT -q "create table test_tbl (id UInt16, reqid UInt32, name String) engine=MergeTree order by id"
$CLICKHOUSE_CLIENT -q "INSERT INTO test_tbl SELECT * FROM file('key_ignore_case.json', 'JSONEachRow') SETTINGS input_format_json_ignore_key_case=true"
$CLICKHOUSE_CLIENT -q "select * from test_tbl"
$CLICKHOUSE_CLIENT -q "drop table test_tbl"
# Remove the fixture copied into user_files so the test leaves no files behind.
rm -f "$USER_FILES_PATH"/key_ignore_case.json