added read and parse impl

2024-12-04 13:32:13 +00:00 · 2023-10-13 15:16:07 +00:00 · 2023-10-13 15:16:07 +00:00 · cb08da617f
commit cb08da617f
parent 9ae025d7e6
2 changed files with 89 additions and 321 deletions
--- a/src/Processors/Formats/Impl/NpyRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/NpyRowInputFormat.cpp
@ -1,8 +1,9 @@
 #include <IO/ReadHelpers.h>
 #include <iterator>
 #include <memory>
-#include <regex>
 #include <string>
+#include <vector>
+#include <type_traits>
 #include <unordered_map>
 #include <Processors/Formats/Impl/NpyRowInputFormat.h>
 #include <Formats/FormatFactory.h>
@ -13,8 +14,10 @@
 #include "Columns/IColumn.h"
 #include "Core/Field.h"
 #include "DataTypes/DataTypesNumber.h"
+#include "DataTypes/IDataType.h"
 #include "DataTypes/Serializations/ISerialization.h"
 #include "IO/ReadBuffer.h"
+#include "IO/WriteHelpers.h"
 #include "Processors/Formats/IRowInputFormat.h"
 #include "base/types.h"

@ -37,163 +40,111 @@ NpyRowInputFormat::NpyRowInputFormat(ReadBuffer & in_, Block header_, Params par
    const auto & sample_block = getPort().getHeader();
    size_t num_columns = sample_block.columns();
    for (size_t i = 0; i < num_columns; ++i)
-        name_map[sample_block.getByPosition(i).name] = i;        /// NOTE You could place names more cache-locally.
-}
-
-/** Read the field name in the `Npy` format.
-  * Return true if the field is followed by an equal sign,
-  *  otherwise (field with no value) return false.
-  * The reference to the field name will be written to `ref`.
-  * A temporary `tmp` buffer can also be used to copy the field name to it.
-  * When reading, skips the name and the equal sign after it.
-  */
-static bool readName(ReadBuffer & buf, StringRef & ref, String & tmp)
-{
-    tmp.clear();
-
-    while (!buf.eof())
-    {
-        const char * next_pos = find_first_symbols<'\t', '\n', '\\', '='>(buf.position(), buf.buffer().end());
-
-        if(*next_pos == '*')
-            break;
-
-        if (next_pos == buf.buffer().end())
-        {
-            tmp.append(buf.position(), next_pos - buf.position());
-            buf.position() = buf.buffer().end();
-            buf.next();
-            continue;
-        }
-
-        /// Came to the end of the name.
-        if (*next_pos != '\\')
-        {
-            bool have_value = *next_pos == '=';
-            if (tmp.empty())
-            {
-                /// No need to copy data, you can refer directly to the `buf`.
-                ref = StringRef(buf.position(), next_pos - buf.position());
-                buf.position() += next_pos + have_value - buf.position();
-            }
-            else
-            {
-                /// Copy the data to a temporary string and return a reference to it.
-                tmp.append(buf.position(), next_pos - buf.position());
-                buf.position() += next_pos + have_value - buf.position();
-                ref = StringRef(tmp);
-            }
-            return have_value;
-        }
-        /// The name has an escape sequence.
-        else
-        {
-            tmp.append(buf.position(), next_pos - buf.position());
-            buf.position() += next_pos + 1 - buf.position();
-            if (buf.eof())
-                throw Exception(ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE, "Cannot parse escape sequence");
-
-            tmp.push_back(parseEscapeSequence(*buf.position()));
-            ++buf.position();
-            continue;
-        }
-    }
-
-    throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Unexpected end of stream while reading key name from Npy format");
+        name_map[sample_block.getByPosition(i).name] = i;
 }

 template <typename T>
-T readItem(T &value, ReadBuffer &in)
+void readFromBuffer(ReadBuffer &in, MutableColumns &  /*columns*/, std::vector<int> shape)
 {
-    readBinaryLittleEndian(value, in);
-    return value;
+    while (*in.position() != '\n')
+        ++in.position();
+    ++in.position();
+    size_t total_size = 1;
+    for (int dim_size : shape)
+        total_size *= dim_size;
+
+    for (size_t i = 0; i < total_size; i++)
+    {
+        if (in.eof())
+        {
+            throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Unexpected end of stream in Npy format");
+        }
+        else if (*in.position() == '\t')
+        {
+            ++in.position();
+            continue;
+        }
+        else if (*in.position() == '\n')
+        {
+            ++in.position();
+            break;
        }
        
-void read(String type, ReadBuffer &in)
+        T value;
+        readBinaryLittleEndian(value, in);
+    }
+}
+
+/// Advances `in` to the next '\n' (which is left unconsumed), then performs one loop
+/// iteration per element, where the element count is the product of the dimensions
+/// in `shape`. An iteration that finds a '\t' at the current position skips that byte
+/// instead of reading a value, but still counts towards the total. Values are read
+/// with readStringBinary() into a `T` and discarded.
+///
+/// Throws ParsingException (CANNOT_READ_ALL_DATA) if the stream ends before all
+/// iterations have run.
+///
+/// NOTE(review): unlike readFromBuffer(), the '\n' found by the initial scan is not
+/// stepped over and '\n' is not treated as a terminator in the loop - confirm this
+/// difference is intended.
+template <typename T>
+void readStringFromBuffer(ReadBuffer &in, std::vector<int> shape)
+{
+    /// Check eof() before every dereference: the newline may not be in the currently
+    /// loaded part of the buffer, or may be missing altogether.
+    while (!in.eof() && *in.position() != '\n')
+        ++in.position();
+    size_t total_size = 1;
+    for (int dim_size : shape)
+        total_size *= dim_size;
+
+    for (size_t i = 0; i < total_size; i++)
+    {
+        if (in.eof())
+        {
+            throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Unexpected end of stream in Npy format");
+        }
+        else if (*in.position() == '\t')
+        {
+            ++in.position();
+            continue;
+        }
+
+        T value;
+        readStringBinary(value, in);
+    }
+}
+
+/// Dispatches on the NumPy type descriptor string `type` and consumes the array
+/// payload from `in` with the matching element type: readFromBuffer<T>() for the
+/// numeric and boolean descriptors, readStringFromBuffer<String>() for the listed
+/// unicode descriptors. `columns` and `shape` are forwarded unchanged.
+///
+/// Throws Exception (BAD_ARGUMENTS) for half-float, complex and object descriptors
+/// and for any descriptor that is not listed here.
+void readAndParseType(String type, ReadBuffer &in, MutableColumns & columns, std::vector<int> shape)
 {
    if (type == "<i1")
-    {
-        Int8 value;
-        readItem(value, in);
-    }
+        readFromBuffer<Int8>(in, columns, shape);
    else if (type == "<i2")
-    {
-        Int16 value;
-        readItem(value, in);
-    }
+        readFromBuffer<Int16>(in, columns, shape);
    else if (type == "<i4")
-    {
-        Int32 value;
-        readItem(value, in);
-    }
+        readFromBuffer<Int32>(in, columns, shape);
    else if (type == "<i8")
-    {
-        Int64 value;
-        readItem(value, in);
-    }
+        readFromBuffer<Int64>(in, columns, shape);
    else if (type == "<u1")
-    {
-        UInt8 value;
-        readItem(value, in);
-    }
+        readFromBuffer<UInt8>(in, columns, shape);
    else if (type == "<u2")
-    {
-        UInt16 value;
-        readItem(value, in);
-    }
+        readFromBuffer<UInt16>(in, columns, shape);
    else if (type == "<u4")
-    {
-        UInt32 value;
-        readItem(value, in);
-    }
+        readFromBuffer<UInt32>(in, columns, shape);
    else if (type == "<u8")
-    {
-        UInt64 value;
-        readItem(value, in);
-    }
+        readFromBuffer<UInt64>(in, columns, shape);
    else if (type == "<f2")
-    {
-        Float32 value;
-        readItem(value, in);
-    }
+        /// A "<f2" element is 2 bytes wide; reading it as a 4-byte Float32 would consume
+        /// two elements at a time and misread the payload, so reject it explicitly.
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "ClickHouse doesn't support float16 numeric type");
    else if (type == "<f4")
-    {
-        Float32 value;
-        readItem(value, in);
-    }
+        readFromBuffer<Float32>(in, columns, shape);
    else if (type == "<f8")
-    {
-        Float64 value;
-        readItem(value, in);
-    }
+        readFromBuffer<Float64>(in, columns, shape);
    else if (type == "<c8" || type == "<c16")
        throw Exception(ErrorCodes::BAD_ARGUMENTS, "ClickHouse doesn't support complex numeric type");
    else if (type == "|b1")
-    {
-        Int8 value;
-        readItem(value, in);
-    }
-    else if (type == "<U10" || type == "<U20")
-    {
-        String value;
-        readItem(value, in);
-    }
+        readFromBuffer<Int8>(in, columns, shape);
+    /// NOTE(review): only these three fixed unicode widths are recognised - confirm
+    /// whether arbitrary "<U<n>" descriptors should be accepted.
+    else if (type == "<U10" || type == "<U20" || type == "<U21")
+        readStringFromBuffer<String>(in, shape);
    else if (type == "O")
        throw Exception(ErrorCodes::BAD_ARGUMENTS, "ClickHouse doesn't support object types");
    else
-        throw Exception(ErrorCodes::BAD_ARGUMENTS, "ClickHouse doesn't support this type of data");
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Error while parsing data type");
 }

-bool NpyRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext)
+bool NpyRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &  /*ext*/)
 {
-    // while (!in.eof())
-    // {
-    //     [[maybe_unused]] auto *pos = in.position();
-    //     Float64 value_float;
-    //     readBinaryLittleEndian(value_float, in);
-    //     std::cout << "value: " << value_float << std::endl;
-    // }
-
    if (in->eof())
        return false;

@ -201,113 +152,22 @@ bool NpyRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext
        ++in->position();
    ++in->position();

-    const auto & header_local = getPort().getHeader();
-    size_t num_columns = columns.size();
-
-    /// Set of columns for which the values were read. The rest will be filled with default values.
-    read_columns.assign(num_columns, false);
-    seen_columns.assign(num_columns, false);
-
    if (unlikely(*in->position() == '\n'))
    {
        /// An empty string. It is permissible, but it is unclear why.
        ++in->position();
    }
    else
-    {
-        while (true)
-        {
-            StringRef name_ref;
-            bool has_value = readName(*in, name_ref, name_buf);
-            ssize_t index = -1;
-
-            if (has_value)
-            {
-                /// NOTE Optimization is possible by caching the order of fields (which is almost always the same)
-                /// and quickly checking for the next expected field, instead of searching the hash table.
-
-                auto * it = name_map.find(name_ref);
-                if (!it)
-                {
-                    if (!format_settings.skip_unknown_fields)
-                        throw Exception(ErrorCodes::INCORRECT_DATA, "Unknown field found while parsing Npy format: {}", name_ref.toString());
-
-                    /// If the key is not found, skip the value.
-                    NullOutput sink;
-                    readEscapedStringInto(sink, *in);
-                }
-                else
-                {
-                    index = it->getMapped();
-
-                    if (seen_columns[index])
-                        throw Exception(ErrorCodes::INCORRECT_DATA, "Duplicate field found while parsing Npy format: {}", name_ref.toString());
-
-                    seen_columns[index] = read_columns[index] = true;
-                    const auto & type = getPort().getHeader().getByPosition(index).type;
-                    const auto & serialization = serializations[index];
-                    if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type))
-                        read_columns[index] = SerializationNullable::deserializeTextEscapedImpl(*columns[index], *in, format_settings, serialization);
-                    else
-                        serialization->deserializeTextEscaped(*columns[index], *in, format_settings);
-                }
-            }
-            else
-            {
-                /// The only thing that can go without value is `Npy` fragment that is ignored.
-                if (!(name_ref.size == 4 && 0 == memcmp(name_ref.data, "npy", 4)))
-                    throw Exception(ErrorCodes::INCORRECT_DATA, "Found field without value while parsing Npy format: {}", name_ref.toString());
-            }
-
-            if (in->eof())
-            {
-                throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Unexpected end of stream after field in Npy format: {}", name_ref.toString());
-            }
-            else if (*in->position() == '\t')
-            {
-                ++in->position();
-                continue;
-            }
-            else if (*in->position() == '\n')
-            {
-                ++in->position();
-                break;
-            }
-            else
-            {
-                /// Possibly a garbage was written into column, remove it
-                if (index >= 0)
-                {
-                    columns[index]->popBack(1);
-                    seen_columns[index] = read_columns[index] = false;
-                }
-
-                throw Exception(ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED, "Found garbage after field in Npy format: {}", name_ref.toString());
-            }
-        }
-    }
-
-    /// Fill in the not met columns with default values.
-    for (size_t i = 0; i < num_columns; ++i)
-        if (!seen_columns[i])
-            header_local.getByPosition(i).type->insertDefaultInto(*columns[i]);
-
-    /// return info about defaults set
-    if (format_settings.defaults_for_omitted_fields)
-        ext.read_columns = read_columns;
-    else
-        ext.read_columns.assign(num_columns, true);
+        readAndParseType(header["descr"], *in, columns, shape);

    return true;
 }

-
 /// Delegates to skipToUnescapedNextLineOrEOF() on the input buffer.
 /// NOTE(review): this looks inherited from a line-oriented text format - confirm that
 /// resynchronising on line boundaries is meaningful for binary Npy data.
 void NpyRowInputFormat::syncAfterError()
 {
    skipToUnescapedNextLineOrEOF(*in);
 }

-
 void NpyRowInputFormat::resetParser()
 {
    IRowInputFormat::resetParser();
@ -347,43 +207,7 @@ NpySchemaReader::NpySchemaReader(ReadBuffer & in_, const FormatSettings & format
    return header;
 }

-DataTypePtr parseType(String type) //is ok
-{
-    if (type == "<i1")
-        return std::make_shared<DataTypeInt8>();
-    else if (type == "<i2")
-        return std::make_shared<DataTypeInt16>();
-    else if (type == "<i4")
-        return std::make_shared<DataTypeInt32>();
-    else if (type == "<i8")
-        return std::make_shared<DataTypeInt64>();
-    else if (type == "<u1")
-        return std::make_shared<DataTypeUInt8>();
-    else if (type == "<u2")
-        return std::make_shared<DataTypeUInt16>();
-    else if (type == "<u4")
-        return std::make_shared<DataTypeUInt32>();
-    else if (type == "<u8")
-        return std::make_shared<DataTypeUInt64>();
-    else if (type == "<f2")
-        return std::make_shared<DataTypeFloat32>();
-    else if (type == "<f4")
-        return std::make_shared<DataTypeFloat32>();
-    else if (type == "<f8")
-        return std::make_shared<DataTypeFloat64>();
-    else if (type == "<c8" || type == "<c16")
-        throw Exception(ErrorCodes::BAD_ARGUMENTS, "ClickHouse doesn't support complex numeric type");
-    else if (type == "|b1")
-        return std::make_shared<DataTypeInt8>();
-    else if (type == "<U10" || type == "<U20")
-        return std::make_shared<DataTypeString>();
-    else if (type == "O")
-        throw Exception(ErrorCodes::BAD_ARGUMENTS, "ClickHouse doesn't support object types");
-    else
-        throw Exception(ErrorCodes::BAD_ARGUMENTS, "ClickHouse doesn't support this type of data");
-}
-
-Tuple parseShape(String shapeString)
+std::vector<int> parseShape(String shapeString)
 {
    shapeString.erase(std::remove(shapeString.begin(), shapeString.end(), '('), shapeString.end());
    shapeString.erase(std::remove(shapeString.begin(), shapeString.end(), ')'), shapeString.end());
@ -393,7 +217,7 @@ Tuple parseShape(String shapeString)
    int value;
    char comma; // to handle commas between values

-    Tuple shape;
+    std::vector<int> shape;

    while (ss >> value) {
        shape.push_back(value);
@ -402,7 +226,7 @@ Tuple parseShape(String shapeString)
    return shape;
 }

-void NpyRowInputFormat::readPrefix() //is ok
+void NpyRowInputFormat::readPrefix()
 {
    const char * begin_pos = find_first_symbols<'\''>(in->position(), in->buffer().end());
    String text(begin_pos);
@ -430,7 +254,6 @@ void NpyRowInputFormat::readPrefix() //is ok
        throw Exception(ErrorCodes::INCORRECT_DATA, "failed to find header keyword 'descr'");
    header_map["descr"] = (text.substr(loc1+9, loc2 - loc1 - 9));

-    data_type = parseType(header_map["descr"]);
    header = header_map;
    shape = parseShape(header_map["shape"]);
 }
@ -455,62 +278,7 @@ NamesAndTypesList NpySchemaReader::readRowAndGetNamesAndDataTypes(bool & eof)
        return {};
    }

-    NamesAndTypesList names_and_types;
-    String name_buf;
-    String value;
-    DataTypePtr data_type;
-    {
-        const char * begin_pos = find_first_symbols<'\''>(in.position(), in.buffer().end());
-        String text(begin_pos);
-        std::unordered_map<String, String> header_map;
-
-        // Finding fortran_order
-        size_t loc1 = text.find("fortran_order");
-        if (loc1 == std::string::npos)
-            throw Exception(ErrorCodes::INCORRECT_DATA, "failed to find header keyword 'fortran_order'");
-        header_map["fortran_order"] = (text.substr(loc1+16, 4) == "True" ? "true" : "false");
-
-        // Finding shape
-        loc1 = text.find('(');
-        size_t loc2 = text.find(')');
-        if (loc1 == std::string::npos || loc2 == std::string::npos)
-            throw Exception(ErrorCodes::INCORRECT_DATA, "failed to find header keyword '(' or ')'");
-        header_map["shape"] = text.substr(loc1, loc2 - loc1 + 1);
-
-        // Finding descr
-        loc1 = text.find("descr");
-        loc2 = loc1 + 9;
-        while (text[loc2] != '\'')
-            loc2++;
-        if (loc1 == std::string::npos)
-            throw Exception(ErrorCodes::INCORRECT_DATA, "failed to find header keyword 'descr'");
-        header_map["descr"] = (text.substr(loc1+9, loc2 - loc1 - 9));
-
-        data_type = parseType(header_map["descr"]);
-        Tuple shape_tuple = parseShape(header_map["shape"]);
-        [[maybe_unused]] std::unordered_map<String, String> header = header_map;
-    }
-    while (*in.position() != '\n')
-    {
-        [[maybe_unused]] auto *pos = in.position();
-        in.ignore(1);
-    }
-    in.ignore(1);
-    while (!in.eof())
-    {
-        [[maybe_unused]] auto *pos = in.position();
-        Float64 value_float;
-        readBinaryLittleEndian(value_float, in);
-        std::cout << "value: " << value_float << std::endl;
-    }
-
-    const char * begin_pos = find_first_symbols<'\''>(in.position(), in.buffer().end());
-    String header(begin_pos);
-    std::unordered_map<String, String> header_map;
-
-    // Make getting headers without searching everytime
-
-    return names_and_types;
+    return {};
 }

 size_t nthSubstr(int n, const String& s,
@ -551,7 +319,7 @@ size_t nthSubstr(int n, const String& s,
 //     // NamesAndTypesList names_and_types;
 //     StringRef name_ref;
 //     String name_buf;
-//     readName(in, name_ref, name_buf);
+//     // readName(in, name_ref, name_buf);
 //     String text = String(name_ref);
 //     String res;

--- a/src/Processors/Formats/Impl/NpyRowInputFormat.h
+++ b/src/Processors/Formats/Impl/NpyRowInputFormat.h
@ -56,7 +56,7 @@ private:

    std::unordered_map<String, String> header;
    DataTypePtr data_type;
-    Tuple shape;
+    std::vector<int> shape;

    /// Set of columns for which the values were read. The rest will be filled with default values.
    std::vector<UInt8> read_columns;