Remove JSONEachRowRowInputStream.

2024-11-12 10:34:21 +00:00 · 2019-08-02 18:36:36 +03:00 · 2019-08-02 18:36:36 +03:00 · ae50ab6907
commit ae50ab6907
parent cd299bb162
4 changed files with 1 additions and 343 deletions
--- a/dbms/src/Formats/FormatFactory.cpp
+++ b/dbms/src/Formats/FormatFactory.cpp
@ -207,7 +207,6 @@ void registerInputFormatCSV(FormatFactory & factory);
 void registerOutputFormatCSV(FormatFactory & factory);
 void registerInputFormatTSKV(FormatFactory & factory);
 void registerOutputFormatTSKV(FormatFactory & factory);
-void registerInputFormatJSONEachRow(FormatFactory & factory);
 void registerOutputFormatJSONEachRow(FormatFactory & factory);
 void registerInputFormatParquet(FormatFactory & factory);
 void registerOutputFormatParquet(FormatFactory & factory);
@ -273,7 +272,6 @@ FormatFactory::FormatFactory()
    registerOutputFormatCSV(*this);
    registerInputFormatTSKV(*this);
    registerOutputFormatTSKV(*this);
-    registerInputFormatJSONEachRow(*this);
    registerOutputFormatJSONEachRow(*this);
    registerInputFormatProtobuf(*this);
    registerOutputFormatProtobuf(*this);
--- a/dbms/src/Formats/JSONEachRowRowInputStream.cpp
+++ b/dbms/src/Formats/JSONEachRowRowInputStream.cpp
@ -1,272 +0,0 @@
-#include <IO/ReadHelpers.h>
-
-#include <Formats/JSONEachRowRowInputStream.h>
-#include <Formats/FormatFactory.h>
-#include <Formats/BlockInputStreamFromRowInputStream.h>
-#include <DataTypes/NestedUtils.h>
-
-namespace DB
-{
-
-namespace ErrorCodes
-{
-    extern const int INCORRECT_DATA;
-    extern const int CANNOT_READ_ALL_DATA;
-    extern const int LOGICAL_ERROR;
-}
-
-namespace
-{
-
-enum
-{
-    UNKNOWN_FIELD = size_t(-1),
-    NESTED_FIELD = size_t(-2)
-};
-
-}
-
-
-JSONEachRowRowInputStream::JSONEachRowRowInputStream(ReadBuffer & istr_, const Block & header_, const FormatSettings & format_settings)
-    : istr(istr_), header(header_), format_settings(format_settings), name_map(header.columns())
-{
-    /// In this format, BOM at beginning of stream cannot be confused with value, so it is safe to skip it.
-    skipBOMIfExists(istr);
-
-    size_t num_columns = header.columns();
-    for (size_t i = 0; i < num_columns; ++i)
-    {
-        const String & colname = columnName(i);
-        name_map[colname] = i;        /// NOTE You could place names more cache-locally.
-        if (format_settings.import_nested_json)
-        {
-            const auto splitted = Nested::splitName(colname);
-            if (!splitted.second.empty())
-            {
-                const StringRef table_name(colname.data(), splitted.first.size());
-                name_map[table_name] = NESTED_FIELD;
-            }
-        }
-    }
-
-    prev_positions.assign(num_columns, name_map.end());
-}
-
-const String & JSONEachRowRowInputStream::columnName(size_t i) const
-{
-    return header.getByPosition(i).name;
-}
-
-inline size_t JSONEachRowRowInputStream::columnIndex(const StringRef & name, size_t key_index)
-{
-    /// Optimization by caching the order of fields (which is almost always the same)
-    /// and a quick check to match the next expected field, instead of searching the hash table.
-
-    if (prev_positions.size() > key_index
-        && prev_positions[key_index] != name_map.end()
-        && name == prev_positions[key_index]->getFirst())
-    {
-        return prev_positions[key_index]->getSecond();
-    }
-    else
-    {
-        const auto it = name_map.find(name);
-
-        if (name_map.end() != it)
-        {
-            if (key_index < prev_positions.size())
-                prev_positions[key_index] = it;
-
-            return it->getSecond();
-        }
-        else
-            return UNKNOWN_FIELD;
-    }
-}
-
-/** Read the field name and convert it to column name
-  *  (taking into account the current nested name prefix)
-  * Resulting StringRef is valid only before next read from buf.
-  */
-StringRef JSONEachRowRowInputStream::readColumnName(ReadBuffer & buf)
-{
-    // This is just an optimization: try to avoid copying the name into current_column_name
-
-    if (nested_prefix_length == 0 && buf.position() + 1 < buf.buffer().end())
-    {
-        char * next_pos = find_first_symbols<'\\', '"'>(buf.position() + 1, buf.buffer().end());
-
-        if (next_pos != buf.buffer().end() && *next_pos != '\\')
-        {
-            /// The most likely option is that there is no escape sequence in the key name, and the entire name is placed in the buffer.
-            assertChar('"', buf);
-            StringRef res(buf.position(), next_pos - buf.position());
-            buf.position() = next_pos + 1;
-            return res;
-        }
-    }
-
-    current_column_name.resize(nested_prefix_length);
-    readJSONStringInto(current_column_name, buf);
-    return current_column_name;
-}
-
-
-static inline void skipColonDelimeter(ReadBuffer & istr)
-{
-    skipWhitespaceIfAny(istr);
-    assertChar(':', istr);
-    skipWhitespaceIfAny(istr);
-}
-
-void JSONEachRowRowInputStream::skipUnknownField(const StringRef & name_ref)
-{
-    if (!format_settings.skip_unknown_fields)
-        throw Exception("Unknown field found while parsing JSONEachRow format: " + name_ref.toString(), ErrorCodes::INCORRECT_DATA);
-
-    skipJSONField(istr, name_ref);
-}
-
-void JSONEachRowRowInputStream::readField(size_t index, MutableColumns & columns)
-{
-    if (read_columns[index])
-        throw Exception("Duplicate field found while parsing JSONEachRow format: " + columnName(index), ErrorCodes::INCORRECT_DATA);
-
-    try
-    {
-        header.getByPosition(index).type->deserializeAsTextJSON(*columns[index], istr, format_settings);
-    }
-    catch (Exception & e)
-    {
-        e.addMessage("(while read the value of key " + columnName(index) + ")");
-        throw;
-    }
-
-    read_columns[index] = true;
-}
-
-inline bool JSONEachRowRowInputStream::advanceToNextKey(size_t key_index)
-{
-    skipWhitespaceIfAny(istr);
-
-    if (istr.eof())
-        throw Exception("Unexpected end of stream while parsing JSONEachRow format", ErrorCodes::CANNOT_READ_ALL_DATA);
-    else if (*istr.position() == '}')
-    {
-        ++istr.position();
-        return false;
-    }
-
-    if (key_index > 0)
-    {
-        assertChar(',', istr);
-        skipWhitespaceIfAny(istr);
-    }
-    return true;
-}
-
-void JSONEachRowRowInputStream::readJSONObject(MutableColumns & columns)
-{
-    assertChar('{', istr);
-
-    for (size_t key_index = 0; advanceToNextKey(key_index); ++key_index)
-    {
-        StringRef name_ref = readColumnName(istr);
-        const size_t column_index = columnIndex(name_ref, key_index);
-
-        if (unlikely(ssize_t(column_index) < 0))
-        {
-            /// name_ref may point directly to the input buffer
-            /// and input buffer may be filled with new data on next read
-            /// If we want to use name_ref after another reads from buffer, we must copy it to temporary string.
-
-            current_column_name.assign(name_ref.data, name_ref.size);
-            name_ref = StringRef(current_column_name);
-
-            skipColonDelimeter(istr);
-
-            if (column_index == UNKNOWN_FIELD)
-                skipUnknownField(name_ref);
-            else if (column_index == NESTED_FIELD)
-                readNestedData(name_ref.toString(), columns);
-            else
-                throw Exception("Logical error: illegal value of column_index", ErrorCodes::LOGICAL_ERROR);
-        }
-        else
-        {
-            skipColonDelimeter(istr);
-            readField(column_index, columns);
-        }
-    }
-}
-
-void JSONEachRowRowInputStream::readNestedData(const String & name, MutableColumns & columns)
-{
-    current_column_name = name;
-    current_column_name.push_back('.');
-    nested_prefix_length = current_column_name.size();
-    readJSONObject(columns);
-    nested_prefix_length = 0;
-}
-
-
-bool JSONEachRowRowInputStream::read(MutableColumns & columns, RowReadExtension & ext)
-{
-    skipWhitespaceIfAny(istr);
-
-    /// We consume ;, or \n before scanning a new row, instead scanning to next row at the end.
-    /// The reason is that if we want an exact number of rows read with LIMIT x
-    /// from a streaming table engine with text data format, like File or Kafka
-    /// then seeking to next ;, or \n would trigger reading of an extra row at the end.
-
-    /// Semicolon is added for convenience as it could be used at end of INSERT query.
-    if (!istr.eof() && (*istr.position() == ',' || *istr.position() == ';'))
-        ++istr.position();
-
-    skipWhitespaceIfAny(istr);
-    if (istr.eof())
-        return false;
-
-    size_t num_columns = columns.size();
-
-    /// Set of columns for which the values were read. The rest will be filled with default values.
-    read_columns.assign(num_columns, false);
-
-    nested_prefix_length = 0;
-    readJSONObject(columns);
-
-    /// Fill non-visited columns with the default values.
-    for (size_t i = 0; i < num_columns; ++i)
-        if (!read_columns[i])
-            header.getByPosition(i).type->insertDefaultInto(*columns[i]);
-
-    /// return info about defaults set
-    ext.read_columns = read_columns;
-    return true;
-}
-
-
-void JSONEachRowRowInputStream::syncAfterError()
-{
-    skipToUnescapedNextLineOrEOF(istr);
-}
-
-
-void registerInputFormatJSONEachRow(FormatFactory & factory)
-{
-    factory.registerInputFormat("JSONEachRow", [](
-        ReadBuffer & buf,
-        const Block & sample,
-        const Context &,
-        UInt64 max_block_size,
-        UInt64 rows_portion_size,
-        FormatFactory::ReadCallback callback,
-        const FormatSettings & settings)
-    {
-        return std::make_shared<BlockInputStreamFromRowInputStream>(
-            std::make_shared<JSONEachRowRowInputStream>(buf, sample, settings),
-            sample, max_block_size, rows_portion_size, callback, settings);
-    });
-}
-
-}
--- a/dbms/src/Formats/JSONEachRowRowInputStream.h
+++ b/dbms/src/Formats/JSONEachRowRowInputStream.h
@ -1,68 +0,0 @@
-#pragma once
-
-#include <Core/Block.h>
-#include <Formats/IRowInputStream.h>
-#include <Formats/FormatSettings.h>
-#include <Common/HashTable/HashMap.h>
-
-
-namespace DB
-{
-
-class ReadBuffer;
-
-
-/** A stream for reading data in JSON format, where each row is represented by a separate JSON object.
-  * Objects can be separated by line feed, other whitespace characters in any number and possibly a comma.
-  * Fields can be listed in any order (including, in different lines there may be different order),
-  *  and some fields may be missing.
-  */
-class JSONEachRowRowInputStream : public IRowInputStream
-{
-public:
-    JSONEachRowRowInputStream(ReadBuffer & istr_, const Block & header_, const FormatSettings & format_settings);
-
-    bool read(MutableColumns & columns, RowReadExtension & ext) override;
-    bool allowSyncAfterError() const override { return true; }
-    void syncAfterError() override;
-
-private:
-    const String & columnName(size_t i) const;
-    size_t columnIndex(const StringRef & name, size_t key_index);
-    bool advanceToNextKey(size_t key_index);
-    void skipUnknownField(const StringRef & name_ref);
-    StringRef readColumnName(ReadBuffer & buf);
-    void readField(size_t index, MutableColumns & columns);
-    void readJSONObject(MutableColumns & columns);
-    void readNestedData(const String & name, MutableColumns & columns);
-
-private:
-    ReadBuffer & istr;
-    Block header;
-
-    const FormatSettings format_settings;
-
-    /// Buffer for the read from the stream field name. Used when you have to copy it.
-    /// Also, if processing of Nested data is in progress, it holds the common prefix
-    /// of the nested column names (so that appending the field name to it produces
-    /// the full column name)
-    String current_column_name;
-
-    /// If processing Nested data, holds the length of the common prefix
-    /// of the names of related nested columns. For example, for a table
-    /// created as follows
-    ///        CREATE TABLE t (n Nested (i Int32, s String))
-    /// the nested column names are 'n.i' and 'n.s' and the nested prefix is 'n.'
-    size_t nested_prefix_length = 0;
-
-    std::vector<UInt8> read_columns;
-
-    /// Hash table match `field name -> position in the block`. NOTE You can use perfect hash map.
-    using NameMap = HashMap<StringRef, size_t, StringRefHash>;
-    NameMap name_map;
-
-    /// Cached search results for previous row (keyed as index in JSON object) - used as a hint.
-    std::vector<NameMap::iterator> prev_positions;
-};
-
-}
--- a/dbms/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h
+++ b/dbms/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h
@ -13,7 +13,7 @@ class ReadBuffer;


 /** A stream for reading data in JSON format, where each row is represented by a separate JSON object.
-  * Objects can be separated by feed return, other whitespace characters in any number and possibly a comma.
+  * Objects can be separated by line feed, other whitespace characters in any number and possibly a comma.
  * Fields can be listed in any order (including, in different lines there may be different order),
  *  and some fields may be missing.
  */