From 47eb0e28b39e563ae6fe6d4184cbd03e4ff8fa23 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Fri, 14 Sep 2018 12:15:32 +0000 Subject: [PATCH] Nested JSON data is mapped to nested table data --- .../src/Formats/JSONEachRowRowInputStream.cpp | 63 +++++++++++++------ dbms/src/Formats/JSONEachRowRowInputStream.h | 15 ++++- dbms/src/IO/ReadHelpers.cpp | 1 + ...00715_json_each_row_input_nested.reference | 5 ++ .../00715_json_each_row_input_nested.sh | 12 ++++ 5 files changed, 75 insertions(+), 21 deletions(-) diff --git a/dbms/src/Formats/JSONEachRowRowInputStream.cpp b/dbms/src/Formats/JSONEachRowRowInputStream.cpp index 717103f379b..6e916b5fa10 100644 --- a/dbms/src/Formats/JSONEachRowRowInputStream.cpp +++ b/dbms/src/Formats/JSONEachRowRowInputStream.cpp @@ -3,7 +3,7 @@ #include #include #include - +#include namespace DB { @@ -14,6 +14,17 @@ namespace ErrorCodes extern const int CANNOT_READ_ALL_DATA; } +namespace +{ + +enum +{ + UNKNOWN_FIELD = size_t(-1), + NESTED_FIELD = size_t(-2) +}; + +} // unnamed namespace + JSONEachRowRowInputStream::JSONEachRowRowInputStream(ReadBuffer & istr_, const Block & header_, const FormatSettings & format_settings) : istr(istr_), header(header_), format_settings(format_settings), name_map(header.columns()) @@ -23,7 +34,16 @@ JSONEachRowRowInputStream::JSONEachRowRowInputStream(ReadBuffer & istr_, const B size_t num_columns = header.columns(); for (size_t i = 0; i < num_columns; ++i) - name_map[columnName(i)] = i; /// NOTE You could place names more cache-locally. + { + const String& colname = columnName(i); + name_map[colname] = i; /// NOTE You could place names more cache-locally. + const auto splitted = Nested::splitName(colname); + if ( ! splitted.second.empty() ) + { + const StringRef table_name(colname.data(), splitted.first.size()); + name_map[table_name] = NESTED_FIELD; + } + } } const String& JSONEachRowRowInputStream::columnName(size_t i) const @@ -31,16 +51,6 @@ const String& JSONEachRowRowInputStream::columnName(size_t i) const return header.safeGetByPosition(i).name; } -namespace -{ - -enum -{ - UNKNOWN_FIELD = size_t(-1) -}; - -} // unnamed namespace - size_t JSONEachRowRowInputStream::columnIndex(const StringRef& name) const { /// NOTE Optimization is possible by caching the order of fields (which is almost always the same) @@ -50,13 +60,13 @@ size_t JSONEachRowRowInputStream::columnIndex(const StringRef& name) const return name_map.end() == it ? UNKNOWN_FIELD : it->second; } -/** Read the field name in JSON format. - * A reference to the field name will be written to ref. - * You can also use temporary `tmp` buffer to copy field name there. +/** Read the field name and convert it to column name + * (taking into account the current nested name prefix) */ -static StringRef readName(ReadBuffer & buf, String & tmp) +StringRef JSONEachRowRowInputStream::readColumnName(ReadBuffer & buf) { - if (buf.position() + 1 < buf.buffer().end()) + // This is just an optimization: try to avoid copying the name into current_column_name + if (nested_prefix_length == 0 && buf.position() + 1 < buf.buffer().end()) { const char * next_pos = find_first_symbols<'\\', '"'>(buf.position() + 1, buf.buffer().end()); @@ -71,8 +81,9 @@ static StringRef readName(ReadBuffer & buf, String & tmp) } } - readJSONString(tmp, buf); - return tmp; + current_column_name.resize(nested_prefix_length); + readJSONStringInto(current_column_name, buf); + return current_column_name; } @@ -135,18 +146,29 @@ void JSONEachRowRowInputStream::readJSONObject(MutableColumns & columns) for ( size_t key_index = 0 ; advanceToNextKey(key_index) ; ++key_index ) { - StringRef name_ref = readName(istr, name_buf); + StringRef name_ref = readColumnName(istr); skipColonDelimeter(istr); const size_t column_index = columnIndex(name_ref); if ( column_index == UNKNOWN_FIELD ) skipUnknownField(name_ref); + else if ( column_index == NESTED_FIELD ) + readNestedData(name_ref.toString(), columns); else readField(column_index, columns); } } +void JSONEachRowRowInputStream::readNestedData(const String& name, MutableColumns & columns) +{ + current_column_name = name; + current_column_name.push_back('.'); + nested_prefix_length = current_column_name.size(); + readJSONObject(columns); + nested_prefix_length = 0; +} + bool JSONEachRowRowInputStream::read(MutableColumns & columns) { skipWhitespaceIfAny(istr); @@ -170,6 +192,7 @@ bool JSONEachRowRowInputStream::read(MutableColumns & columns) /// TODO Ability to provide your DEFAULTs. read_columns.assign(num_columns, false); + nested_prefix_length = 0; readJSONObject(columns); /// Fill non-visited columns with the default values. diff --git a/dbms/src/Formats/JSONEachRowRowInputStream.h b/dbms/src/Formats/JSONEachRowRowInputStream.h index f1cd0f3b41e..c23d398400a 100644 --- a/dbms/src/Formats/JSONEachRowRowInputStream.h +++ b/dbms/src/Formats/JSONEachRowRowInputStream.h @@ -31,8 +31,10 @@ private: size_t columnIndex(const StringRef& name) const; bool advanceToNextKey(size_t key_index); void skipUnknownField(const StringRef& name_ref); + StringRef readColumnName(ReadBuffer & buf); void readField(size_t index, MutableColumns & columns); void readJSONObject(MutableColumns & columns); + void readNestedData(const String& name, MutableColumns & columns); private: ReadBuffer & istr; @@ -41,7 +43,18 @@ private: const FormatSettings format_settings; /// Buffer for the read from the stream field name. Used when you have to copy it. - String name_buf; + /// Also, if processing of Nested data is in progress, it holds the common prefix + /// of the nested column names (so that appending the field name to it produces + /// the full column name) + String current_column_name; + + /// If processing Nested data, holds the length of the common prefix + /// of the names of related nested columns. For example, for a table + /// created as follows + /// CREATE TABLE t (n Nested (i Int32, s String)) + /// the nested column names are 'n.i' and 'n.s' and the nested prefix is 'n.' + size_t nested_prefix_length = 0; + std::vector read_columns; /// Hash table match `field name -> position in the block`. NOTE You can use perfect hash map. diff --git a/dbms/src/IO/ReadHelpers.cpp b/dbms/src/IO/ReadHelpers.cpp index 9b8b28f5def..fb5b89babe2 100644 --- a/dbms/src/IO/ReadHelpers.cpp +++ b/dbms/src/IO/ReadHelpers.cpp @@ -667,6 +667,7 @@ void readJSONString(String & s, ReadBuffer & buf) template void readJSONStringInto, void>(PaddedPODArray & s, ReadBuffer & buf); template bool readJSONStringInto, bool>(PaddedPODArray & s, ReadBuffer & buf); template void readJSONStringInto(NullSink & s, ReadBuffer & buf); +template void readJSONStringInto(String & s, ReadBuffer & buf); template diff --git a/dbms/tests/queries/0_stateless/00715_json_each_row_input_nested.reference b/dbms/tests/queries/0_stateless/00715_json_each_row_input_nested.reference index 73f7522fb37..c50bdb7769a 100644 --- a/dbms/tests/queries/0_stateless/00715_json_each_row_input_nested.reference +++ b/dbms/tests/queries/0_stateless/00715_json_each_row_input_nested.reference @@ -3,3 +3,8 @@ 0 [] [45,67,8] 1 ok ['dog','cat','pig'] [3,3,3] 1 ok ['zero','negative one'] [0,-1] +1 ok ['abc','def'] [1,23] +0 [] [] +0 [] [45,67,8] +1 ok ['dog','cat','pig'] [3,3,3] +1 ok ['zero','negative one'] [0,-1] diff --git a/dbms/tests/queries/0_stateless/00715_json_each_row_input_nested.sh b/dbms/tests/queries/0_stateless/00715_json_each_row_input_nested.sh index 4d7514bdf4c..685386a51bf 100755 --- a/dbms/tests/queries/0_stateless/00715_json_each_row_input_nested.sh +++ b/dbms/tests/queries/0_stateless/00715_json_each_row_input_nested.sh @@ -17,3 +17,15 @@ echo '{"d1" : 1, "d2" : "ok", "n.s" : ["abc", "def"], "n.i" : [1, 23]} $CLICKHOUSE_CLIENT --max_threads=1 -q "SELECT * FROM test.json_each_row_nested" $CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS test.json_each_row_nested" + +$CLICKHOUSE_CLIENT -q "CREATE TABLE test.json_each_row_nested (d1 UInt8, d2 String, n Nested (s String, i Int32) ) ENGINE = Memory" + +echo '{"d1" : 1, "d2" : "ok", "n" : { "s" : ["abc", "def"], "i" : [1, 23]} } +{ } +{"t1" : 0, "n.t2":true,"n" : {"i":[45, 67, 8]}, "t4":null,"t5":[],"t6":"trash" } +{"d2":"ok","n" : {"s":["dog", "cat", "pig"], "x":[["1","2"]]}, "d1":"1", "n.i":[3, 3, 3]} +{"t0" : -0.1, "n": {"s" : ["zero","negative one"], "i" : [0, -1]}, "d2" : "ok", "d1" : 1}' \ +| $CLICKHOUSE_CLIENT --input_format_skip_unknown_fields=1 -q "INSERT INTO test.json_each_row_nested FORMAT JSONEachRow" + +$CLICKHOUSE_CLIENT --max_threads=1 -q "SELECT * FROM test.json_each_row_nested" +$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS test.json_each_row_nested"