Nested JSON data is mapped to nested table data

This commit is contained in:
Veloman Yunkan 2018-09-14 12:15:32 +00:00
parent 8d427dd09d
commit 47eb0e28b3
5 changed files with 75 additions and 21 deletions

View File

@ -3,7 +3,7 @@
#include <Formats/JSONEachRowRowInputStream.h>
#include <Formats/FormatFactory.h>
#include <Formats/BlockInputStreamFromRowInputStream.h>
#include <DataTypes/NestedUtils.h>
namespace DB
{
@ -14,6 +14,17 @@ namespace ErrorCodes
extern const int CANNOT_READ_ALL_DATA;
}
namespace
{

/// Special values used by the field-name lookup in place of a real column position.
enum
{
    /// The field name does not match any entry of the name map.
    UNKNOWN_FIELD = static_cast<size_t>(-1),
    /// The field name is the table part of Nested column names (e.g. `n` for `n.i`),
    /// so its value is expected to be a JSON object holding the nested columns.
    NESTED_FIELD = static_cast<size_t>(-2)
};

} // unnamed namespace
JSONEachRowRowInputStream::JSONEachRowRowInputStream(ReadBuffer & istr_, const Block & header_, const FormatSettings & format_settings)
: istr(istr_), header(header_), format_settings(format_settings), name_map(header.columns())
@ -23,7 +34,16 @@ JSONEachRowRowInputStream::JSONEachRowRowInputStream(ReadBuffer & istr_, const B
size_t num_columns = header.columns();
for (size_t i = 0; i < num_columns; ++i)
name_map[columnName(i)] = i; /// NOTE You could place names more cache-locally.
{
const String& colname = columnName(i);
name_map[colname] = i; /// NOTE You could place names more cache-locally.
const auto splitted = Nested::splitName(colname);
if ( ! splitted.second.empty() )
{
const StringRef table_name(colname.data(), splitted.first.size());
name_map[table_name] = NESTED_FIELD;
}
}
}
const String& JSONEachRowRowInputStream::columnName(size_t i) const
@ -31,16 +51,6 @@ const String& JSONEachRowRowInputStream::columnName(size_t i) const
return header.safeGetByPosition(i).name;
}
namespace
{

/// Special value returned by the field-name lookup instead of a column position.
enum
{
    /// No column with the requested name was found in the name map.
    UNKNOWN_FIELD = static_cast<size_t>(-1)
};

} // unnamed namespace
size_t JSONEachRowRowInputStream::columnIndex(const StringRef& name) const
{
/// NOTE Optimization is possible by caching the order of fields (which is almost always the same)
@ -50,13 +60,13 @@ size_t JSONEachRowRowInputStream::columnIndex(const StringRef& name) const
return name_map.end() == it ? UNKNOWN_FIELD : it->second;
}
/** Read the field name in JSON format.
* A reference to the field name will be written to ref.
* You can also use temporary `tmp` buffer to copy field name there.
/** Read the field name and convert it to column name
* (taking into account the current nested name prefix)
*/
static StringRef readName(ReadBuffer & buf, String & tmp)
StringRef JSONEachRowRowInputStream::readColumnName(ReadBuffer & buf)
{
if (buf.position() + 1 < buf.buffer().end())
// This is just an optimization: try to avoid copying the name into current_column_name
if (nested_prefix_length == 0 && buf.position() + 1 < buf.buffer().end())
{
const char * next_pos = find_first_symbols<'\\', '"'>(buf.position() + 1, buf.buffer().end());
@ -71,8 +81,9 @@ static StringRef readName(ReadBuffer & buf, String & tmp)
}
}
readJSONString(tmp, buf);
return tmp;
current_column_name.resize(nested_prefix_length);
readJSONStringInto(current_column_name, buf);
return current_column_name;
}
@ -135,18 +146,29 @@ void JSONEachRowRowInputStream::readJSONObject(MutableColumns & columns)
for ( size_t key_index = 0 ; advanceToNextKey(key_index) ; ++key_index )
{
StringRef name_ref = readName(istr, name_buf);
StringRef name_ref = readColumnName(istr);
skipColonDelimeter(istr);
const size_t column_index = columnIndex(name_ref);
if ( column_index == UNKNOWN_FIELD )
skipUnknownField(name_ref);
else if ( column_index == NESTED_FIELD )
readNestedData(name_ref.toString(), columns);
else
readField(column_index, columns);
}
}
/** Read a JSON object that holds the fields of the Nested structure `name`.
  *
  * While the object is being parsed, `current_column_name` starts with the
  * prefix "<name>." and `nested_prefix_length` is the length of that prefix,
  * so every key read inside the object is resolved as "<name>.<key>".
  * After the object has been consumed the prefix length is reset to zero.
  *
  * `name` is the table part of the nested column names (without the trailing dot);
  * `columns` are the destination columns, passed through to readJSONObject.
  */
void JSONEachRowRowInputStream::readNestedData(const String& name, MutableColumns & columns)
{
    /// Build the common prefix "<name>." shared by all columns of this Nested structure.
    current_column_name.assign(name);
    current_column_name += '.';
    nested_prefix_length = current_column_name.length();

    /// Keys of the inner object are read with the prefix prepended.
    readJSONObject(columns);

    /// Nested object is finished: subsequent keys are plain column names again.
    nested_prefix_length = 0;
}
bool JSONEachRowRowInputStream::read(MutableColumns & columns)
{
skipWhitespaceIfAny(istr);
@ -170,6 +192,7 @@ bool JSONEachRowRowInputStream::read(MutableColumns & columns)
/// TODO Ability to provide your DEFAULTs.
read_columns.assign(num_columns, false);
nested_prefix_length = 0;
readJSONObject(columns);
/// Fill non-visited columns with the default values.

View File

@ -31,8 +31,10 @@ private:
size_t columnIndex(const StringRef& name) const;
bool advanceToNextKey(size_t key_index);
void skipUnknownField(const StringRef& name_ref);
StringRef readColumnName(ReadBuffer & buf);
void readField(size_t index, MutableColumns & columns);
void readJSONObject(MutableColumns & columns);
void readNestedData(const String& name, MutableColumns & columns);
private:
ReadBuffer & istr;
@ -41,7 +43,18 @@ private:
const FormatSettings format_settings;
/// Buffer for the field name read from the stream. Used when the name has to be copied.
String name_buf;
/// Also, if processing of Nested data is in progress, it holds the common prefix
/// of the nested column names (so that appending the field name to it produces
/// the full column name)
String current_column_name;
/// If processing Nested data, holds the length of the common prefix
/// of the names of related nested columns. For example, for a table
/// created as follows
/// CREATE TABLE t (n Nested (i Int32, s String))
/// the nested column names are 'n.i' and 'n.s' and the nested prefix is 'n.'
size_t nested_prefix_length = 0;
std::vector<bool> read_columns;
/// Hash table that maps `field name -> position in the block`. NOTE A perfect hash map could be used here.

View File

@ -667,6 +667,7 @@ void readJSONString(String & s, ReadBuffer & buf)
/// Explicit instantiations of readJSONStringInto for the destination types listed below:
/// PaddedPODArray<UInt8> (with both void and bool return types), NullSink and String.
template void readJSONStringInto<PaddedPODArray<UInt8>, void>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
template bool readJSONStringInto<PaddedPODArray<UInt8>, bool>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
template void readJSONStringInto<NullSink>(NullSink & s, ReadBuffer & buf);
template void readJSONStringInto<String>(String & s, ReadBuffer & buf);
template <typename ReturnType>

View File

@ -3,3 +3,8 @@
0 [] [45,67,8]
1 ok ['dog','cat','pig'] [3,3,3]
1 ok ['zero','negative one'] [0,-1]
1 ok ['abc','def'] [1,23]
0 [] []
0 [] [45,67,8]
1 ok ['dog','cat','pig'] [3,3,3]
1 ok ['zero','negative one'] [0,-1]

View File

@ -17,3 +17,15 @@ echo '{"d1" : 1, "d2" : "ok", "n.s" : ["abc", "def"], "n.i" : [1, 23]}
$CLICKHOUSE_CLIENT --max_threads=1 -q "SELECT * FROM test.json_each_row_nested"
$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS test.json_each_row_nested"
# Scenario: the Nested structure `n` is supplied as a nested JSON object ("n" : {...})
# instead of flat "n.s" / "n.i" keys. The rows also contain fields that are not columns
# of the table (the insert runs with input_format_skip_unknown_fields=1) and, in one row,
# a flat "n.i" key next to the nested "n" object.
$CLICKHOUSE_CLIENT -q "CREATE TABLE test.json_each_row_nested (d1 UInt8, d2 String, n Nested (s String, i Int32) ) ENGINE = Memory"
echo '{"d1" : 1, "d2" : "ok", "n" : { "s" : ["abc", "def"], "i" : [1, 23]} }
{ }
{"t1" : 0, "n.t2":true,"n" : {"i":[45, 67, 8]}, "t4":null,"t5":[],"t6":"trash" }
{"d2":"ok","n" : {"s":["dog", "cat", "pig"], "x":[["1","2"]]}, "d1":"1", "n.i":[3, 3, 3]}
{"t0" : -0.1, "n": {"s" : ["zero","negative one"], "i" : [0, -1]}, "d2" : "ok", "d1" : 1}' \
| $CLICKHOUSE_CLIENT --input_format_skip_unknown_fields=1 -q "INSERT INTO test.json_each_row_nested FORMAT JSONEachRow"
# Print the inserted rows.
$CLICKHOUSE_CLIENT --max_threads=1 -q "SELECT * FROM test.json_each_row_nested"
# Clean up the test table.
$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS test.json_each_row_nested"