From 7c9dbdbd9c01167a49767ab31cc700f3a91421ee Mon Sep 17 00:00:00 2001 From: kevinyhzou Date: Fri, 22 Mar 2024 15:32:43 +0800 Subject: [PATCH] Improve json read by ignore key case --- src/Core/Settings.h | 1 + src/Formats/FormatFactory.cpp | 1 + src/Formats/FormatSettings.h | 1 + .../Impl/JSONEachRowRowInputFormat.cpp | 12 ++++++++++++ .../Formats/Impl/JSONEachRowRowInputFormat.h | 8 +++++++- .../03013_json_key_ignore_case.reference | 3 +++ .../0_stateless/03013_json_key_ignore_case.sh | 18 ++++++++++++++++++ .../data_json/key_ignore_case.json | Bin 0 -> 123 bytes 8 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/03013_json_key_ignore_case.reference create mode 100644 tests/queries/0_stateless/03013_json_key_ignore_case.sh create mode 100644 tests/queries/0_stateless/data_json/key_ignore_case.json diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 375bdb1c516..777bed7b7ed 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1057,6 +1057,7 @@ class IColumn; M(Bool, input_format_json_defaults_for_missing_elements_in_named_tuple, true, "Insert default value in named tuple element if it's missing in json object", 0) \ M(Bool, input_format_json_throw_on_bad_escape_sequence, true, "Throw an exception if JSON string contains bad escape sequence in JSON input formats. If disabled, bad escape sequences will remain as is in the data", 0) \ M(Bool, input_format_json_ignore_unnecessary_fields, true, "Ignore unnecessary fields and not parse them. 
Enabling this may not throw exceptions on json strings of invalid format or with duplicated fields", 0) \ + M(Bool, input_format_json_ignore_key_case, false, "Ignore json key case while read json field from string", 0) \ M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \ M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \ M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \ diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 7d7a2b17a8a..ed324515697 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -145,6 +145,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se format_settings.json.try_infer_objects_as_tuples = settings.input_format_json_try_infer_named_tuples_from_objects; format_settings.json.throw_on_bad_escape_sequence = settings.input_format_json_throw_on_bad_escape_sequence; format_settings.json.ignore_unnecessary_fields = settings.input_format_json_ignore_unnecessary_fields; + format_settings.json.ignore_key_case = settings.input_format_json_ignore_key_case; format_settings.null_as_default = settings.input_format_null_as_default; format_settings.decimal_trailing_zeros = settings.output_format_decimal_trailing_zeros; format_settings.parquet.row_group_rows = settings.output_format_parquet_row_group_size; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index da225a39ec9..136381deff9 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -224,6 +224,7 @@ struct FormatSettings bool infer_incomplete_types_as_strings = true; bool throw_on_bad_escape_sequence = true; bool ignore_unnecessary_fields = true; + bool ignore_key_case = false; } json{}; struct diff --git 
a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp index a78d8d016cd..8570fb2e75b 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp @@ -45,6 +45,14 @@ JSONEachRowRowInputFormat::JSONEachRowRowInputFormat( { const auto & header = getPort().getHeader(); name_map = header.getNamesToIndexesMap(); + if (format_settings_.json.ignore_key_case) + { + for (auto it = name_map.begin(); it != name_map.end(); ++it) + { + StringRef key = it->first; + transformFieldNameToLowerCase(key); + } + } if (format_settings_.import_nested_json) { for (size_t i = 0; i != header.columns(); ++i) @@ -169,6 +177,10 @@ void JSONEachRowRowInputFormat::readJSONObject(MutableColumns & columns) JSONUtils::skipColon(*in); skipUnknownField(name_ref); continue; + } + if (format_settings.json.ignore_key_case) + { + transformFieldNameToLowerCase(name_ref); } const size_t column_index = columnIndex(name_ref, key_index); diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h index d97aa2dad8d..24d585d07bc 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h @@ -55,7 +55,15 @@ private: virtual void readRowStart(MutableColumns &) {} virtual void skipRowStart() {} - + /// Lower-cases, in place, the field_name.size bytes that field_name.data points to (one std::tolower call per byte); a null data pointer is a no-op. + /// NOTE(review): this writes through const_cast, so the referenced buffer must be writable. The constructor applies it to keys already stored in name_map; confirm that rewriting key bytes after insertion does not break lookups. + void transformFieldNameToLowerCase(StringRef & field_name) + { + if (!field_name.data) return; + char * name_data = const_cast<char *>(field_name.data); + for (size_t i = 0; i < field_name.size; ++i) + name_data[i] = static_cast<char>(std::tolower(static_cast<unsigned char>(name_data[i]))); + } /// Buffer for the read from the stream field name. Used when you have to copy it.
/// Also, if processing of Nested data is in progress, it holds the common prefix /// of the nested column names (so that appending the field name to it produces diff --git a/tests/queries/0_stateless/03013_json_key_ignore_case.reference b/tests/queries/0_stateless/03013_json_key_ignore_case.reference new file mode 100644 index 00000000000..54683d8fbc5 --- /dev/null +++ b/tests/queries/0_stateless/03013_json_key_ignore_case.reference @@ -0,0 +1,3 @@ +1 77328912 Ben +2 77328913 Jim +3 77328914 Bill diff --git a/tests/queries/0_stateless/03013_json_key_ignore_case.sh b/tests/queries/0_stateless/03013_json_key_ignore_case.sh new file mode 100644 index 00000000000..807e743b22a --- /dev/null +++ b/tests/queries/0_stateless/03013_json_key_ignore_case.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +# NOTE: this sh wrapper is required because of shell_config + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + + +USER_FILES_PATH=$($CLICKHOUSE_CLIENT --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep -E '^Code: 107.*FILE_DOESNT_EXIST' | head -1 | awk '{gsub("/nonexist.txt","",$9); print $9}') + +cp "$CURDIR"/data_json/key_ignore_case.json $USER_FILES_PATH/ + +$CLICKHOUSE_CLIENT -q "drop table if exists test_tbl" +$CLICKHOUSE_CLIENT -q "create table test_tbl (id UInt16, reqid UInt32, name String) engine=MergeTree order by id" +$CLICKHOUSE_CLIENT -q "INSERT INTO test_tbl SELECT * FROM file('key_ignore_case.json', 'JSONEachRow') SETTINGS input_format_json_ignore_key_case=true" +$CLICKHOUSE_CLIENT -q "select * from test_tbl" +$CLICKHOUSE_CLIENT -q "drop table test_tbl" \ No newline at end of file diff --git a/tests/queries/0_stateless/data_json/key_ignore_case.json b/tests/queries/0_stateless/data_json/key_ignore_case.json new file mode 100644 index 0000000000000000000000000000000000000000..ad8f7cb450780891d64ac8cbbc19de17b92e7db5 GIT binary patch literal 123 
zcmbzd5O8HN>)lvsd-AZT-9KOAYoL6#t4O8 VnYnO