diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index e98f19b2a65..54b3a8fa99d 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -74,6 +74,7 @@ The supported formats are: | [ArrowStream](#data-format-arrow-stream) | ✔ | ✔ | | [ORC](#data-format-orc) | ✔ | ✔ | | [One](#data-format-one) | ✔ | ✗ | +| [Npy](#data-format-npy) | ✔ | ✗ | | [RowBinary](#rowbinary) | ✔ | ✔ | | [RowBinaryWithNames](#rowbinarywithnamesandtypes) | ✔ | ✔ | | [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ | @@ -2445,6 +2446,26 @@ Result: └──────────────┘ ``` +## Npy {#data-format-npy} + +This format is designed to load a NumPy array from a .npy file into ClickHouse. The NumPy file format is a binary format used for efficiently storing arrays of numerical data. On import, each element along the array's top-level dimension is loaded as a separate row of a single column. + +**Example** + +Query: +```sql +SELECT * +FROM file('example_array.npy', Npy) +``` + +Result: +``` +┌─array─────────┐ +│ [[1],[2],[3]] │ +│ [[4],[5],[6]] │ +└───────────────┘ +``` + ## LineAsString {#lineasstring} In this format, every line of input data is interpreted as a single string value. This format can only be parsed for table with a single field of type [String](/docs/en/sql-reference/data-types/string.md). The remaining columns must be set to [DEFAULT](/docs/en/sql-reference/statements/create/table.md/#default) or [MATERIALIZED](/docs/en/sql-reference/statements/create/table.md/#materialized), or omitted. 
diff --git a/src/Processors/Formats/Impl/NpyRowInputFormat.cpp b/src/Processors/Formats/Impl/NpyRowInputFormat.cpp index 45e456ca526..c5a04e31fa9 100644 --- a/src/Processors/Formats/Impl/NpyRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/NpyRowInputFormat.cpp @@ -37,9 +37,8 @@ namespace DB namespace ErrorCodes { extern const int INCORRECT_DATA; - extern const int CANNOT_PARSE_ESCAPE_SEQUENCE; - extern const int CANNOT_READ_ALL_DATA; - extern const int CANNOT_PARSE_INPUT_ASSERTION_FAILED; + extern const int BAD_ARGUMENTS; + extern const int LOGICAL_ERROR; } @@ -119,16 +118,21 @@ std::vector parseShape(String shapeString) shapeString.erase(std::remove(shapeString.begin(), shapeString.end(), ')'), shapeString.end()); // Use a string stream to extract integers - std::istringstream ss(shapeString); - int value; - char comma; // to handle commas between values + String value; std::vector shape; - while (ss >> value) { - shape.push_back(value); - ss >> comma; // read the comma + size_t start = 0, end = 0; + + while ((end = shapeString.find(',', start)) != std::string::npos) + { + shape.push_back(std::stoi(shapeString.substr(start, end - start))); + start = end + 1; } + + // Add the last token (or the only token if no delimiter is found) + if (start != shapeString.length()) + shape.push_back(std::stoi(shapeString.substr(start))); return shape; } diff --git a/tests/queries/0_stateless/02895_npy_format.reference b/tests/queries/0_stateless/02895_npy_format.reference new file mode 100644 index 00000000000..3813c3b2fcf --- /dev/null +++ b/tests/queries/0_stateless/02895_npy_format.reference @@ -0,0 +1,28 @@ +1 +2 +3 +1.1 +2.2 +3.3 +1 +a +c +1 +a +c +[1,2,3] +[4,5,6] +[1.1,2.22,3.33] +[4.4,5.5,6.6] +['a','b','c'] +['e','f','g'] +['a','b','c'] +['e','f','g'] +[1] +[0] +[1] +[0] +[0,0,0] +[0,0,0] +[[1,2],[3,4]] +[[5,6],[7,8]] diff --git a/tests/queries/0_stateless/02895_npy_format.sh b/tests/queries/0_stateless/02895_npy_format.sh new file mode 100755 index 
00000000000..644c42cb332 --- /dev/null +++ b/tests/queries/0_stateless/02895_npy_format.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim.npy')" +$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim_float.npy')" +$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim_str.npy')" +$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim_unicode.npy')" +$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/two_dim.npy')" +$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/two_dim_float.npy')" +$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/two_dim_str.npy')" +$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/two_dim_unicode.npy')" +$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/two_dim_bool.npy')" +$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/two_dim_null.npy')" +$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/three_dim.npy')" diff --git a/tests/queries/0_stateless/data_npy/one_dim.npy b/tests/queries/0_stateless/data_npy/one_dim.npy new file mode 100644 index 00000000000..80c6ff86eba Binary files /dev/null and b/tests/queries/0_stateless/data_npy/one_dim.npy differ diff --git a/tests/queries/0_stateless/data_npy/one_dim_float.npy b/tests/queries/0_stateless/data_npy/one_dim_float.npy new file mode 100644 index 00000000000..0e7fc14e59e Binary files /dev/null and b/tests/queries/0_stateless/data_npy/one_dim_float.npy differ diff --git a/tests/queries/0_stateless/data_npy/one_dim_str.npy b/tests/queries/0_stateless/data_npy/one_dim_str.npy new file mode 100644 index 00000000000..010cb52787e Binary files /dev/null and b/tests/queries/0_stateless/data_npy/one_dim_str.npy differ diff --git a/tests/queries/0_stateless/data_npy/one_dim_unicode.npy 
b/tests/queries/0_stateless/data_npy/one_dim_unicode.npy new file mode 100644 index 00000000000..d0245df237c Binary files /dev/null and b/tests/queries/0_stateless/data_npy/one_dim_unicode.npy differ diff --git a/tests/queries/0_stateless/data_npy/three_dim.npy b/tests/queries/0_stateless/data_npy/three_dim.npy new file mode 100644 index 00000000000..07a6b2bec4a Binary files /dev/null and b/tests/queries/0_stateless/data_npy/three_dim.npy differ diff --git a/tests/queries/0_stateless/data_npy/two_dim.npy b/tests/queries/0_stateless/data_npy/two_dim.npy new file mode 100644 index 00000000000..70f4ed9ec17 Binary files /dev/null and b/tests/queries/0_stateless/data_npy/two_dim.npy differ diff --git a/tests/queries/0_stateless/data_npy/two_dim_bool.npy b/tests/queries/0_stateless/data_npy/two_dim_bool.npy new file mode 100644 index 00000000000..d2f28ce6fa3 Binary files /dev/null and b/tests/queries/0_stateless/data_npy/two_dim_bool.npy differ diff --git a/tests/queries/0_stateless/data_npy/two_dim_float.npy b/tests/queries/0_stateless/data_npy/two_dim_float.npy new file mode 100644 index 00000000000..2e2b58513cd Binary files /dev/null and b/tests/queries/0_stateless/data_npy/two_dim_float.npy differ diff --git a/tests/queries/0_stateless/data_npy/two_dim_null.npy b/tests/queries/0_stateless/data_npy/two_dim_null.npy new file mode 100644 index 00000000000..b78b95af03e Binary files /dev/null and b/tests/queries/0_stateless/data_npy/two_dim_null.npy differ diff --git a/tests/queries/0_stateless/data_npy/two_dim_str.npy b/tests/queries/0_stateless/data_npy/two_dim_str.npy new file mode 100644 index 00000000000..bda150c5f46 Binary files /dev/null and b/tests/queries/0_stateless/data_npy/two_dim_str.npy differ diff --git a/tests/queries/0_stateless/data_npy/two_dim_unicode.npy b/tests/queries/0_stateless/data_npy/two_dim_unicode.npy new file mode 100644 index 00000000000..f8558d810c5 Binary files /dev/null and b/tests/queries/0_stateless/data_npy/two_dim_unicode.npy differ