From 2ab1ae42c10cf1797dc0fd753957dd1129926f87 Mon Sep 17 00:00:00 2001 From: yariks5s Date: Wed, 25 Oct 2023 10:37:05 +0000 Subject: [PATCH] added docs and tests, style check --- docs/en/interfaces/formats.md | 21 +++++++++++++ .../Formats/Impl/NpyRowInputFormat.cpp | 22 ++++++++------ .../0_stateless/02895_npy_format.reference | 28 ++++++++++++++++++ tests/queries/0_stateless/02895_npy_format.sh | 18 +++++++++++ .../queries/0_stateless/data_npy/one_dim.npy | Bin 0 -> 152 bytes .../0_stateless/data_npy/one_dim_float.npy | Bin 0 -> 152 bytes .../0_stateless/data_npy/one_dim_str.npy | Bin 0 -> 131 bytes .../0_stateless/data_npy/one_dim_unicode.npy | Bin 0 -> 140 bytes .../0_stateless/data_npy/three_dim.npy | Bin 0 -> 192 bytes .../queries/0_stateless/data_npy/two_dim.npy | Bin 0 -> 176 bytes .../0_stateless/data_npy/two_dim_bool.npy | Bin 0 -> 132 bytes .../0_stateless/data_npy/two_dim_float.npy | Bin 0 -> 176 bytes .../0_stateless/data_npy/two_dim_null.npy | Bin 0 -> 176 bytes .../0_stateless/data_npy/two_dim_str.npy | Bin 0 -> 134 bytes .../0_stateless/data_npy/two_dim_unicode.npy | Bin 0 -> 152 bytes 15 files changed, 80 insertions(+), 9 deletions(-) create mode 100644 tests/queries/0_stateless/02895_npy_format.reference create mode 100755 tests/queries/0_stateless/02895_npy_format.sh create mode 100644 tests/queries/0_stateless/data_npy/one_dim.npy create mode 100644 tests/queries/0_stateless/data_npy/one_dim_float.npy create mode 100644 tests/queries/0_stateless/data_npy/one_dim_str.npy create mode 100644 tests/queries/0_stateless/data_npy/one_dim_unicode.npy create mode 100644 tests/queries/0_stateless/data_npy/three_dim.npy create mode 100644 tests/queries/0_stateless/data_npy/two_dim.npy create mode 100644 tests/queries/0_stateless/data_npy/two_dim_bool.npy create mode 100644 tests/queries/0_stateless/data_npy/two_dim_float.npy create mode 100644 tests/queries/0_stateless/data_npy/two_dim_null.npy create mode 100644 
tests/queries/0_stateless/data_npy/two_dim_str.npy create mode 100644 tests/queries/0_stateless/data_npy/two_dim_unicode.npy diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index e98f19b2a65..54b3a8fa99d 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -74,6 +74,7 @@ The supported formats are: | [ArrowStream](#data-format-arrow-stream) | ✔ | ✔ | | [ORC](#data-format-orc) | ✔ | ✔ | | [One](#data-format-one) | ✔ | ✗ | +| [Npy](#data-format-npy) | ✔ | ✗ | | [RowBinary](#rowbinary) | ✔ | ✔ | | [RowBinaryWithNames](#rowbinarywithnamesandtypes) | ✔ | ✔ | | [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ | @@ -2445,6 +2446,26 @@ Result: └──────────────┘ ``` +## Npy {#data-format-npy} + +This format is designed to load a NumPy array from a .npy file into ClickHouse. The NumPy file format is a binary format used for efficiently storing arrays of numerical data. During import, each element along the top-level dimension of the array becomes a separate row of a single column. + +**Example** + +Query: +```sql +SELECT * +FROM file('example_array.npy', Npy) +``` + +Result: +``` +┌─array─────────┐ +│ [[1],[2],[3]] │ +│ [[4],[5],[6]] │ +└───────────────┘ +``` + ## LineAsString {#lineasstring} In this format, every line of input data is interpreted as a single string value. This format can only be parsed for table with a single field of type [String](/docs/en/sql-reference/data-types/string.md). The remaining columns must be set to [DEFAULT](/docs/en/sql-reference/statements/create/table.md/#default) or [MATERIALIZED](/docs/en/sql-reference/statements/create/table.md/#materialized), or omitted. 
diff --git a/src/Processors/Formats/Impl/NpyRowInputFormat.cpp b/src/Processors/Formats/Impl/NpyRowInputFormat.cpp index 45e456ca526..c5a04e31fa9 100644 --- a/src/Processors/Formats/Impl/NpyRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/NpyRowInputFormat.cpp @@ -37,9 +37,8 @@ namespace DB namespace ErrorCodes { extern const int INCORRECT_DATA; - extern const int CANNOT_PARSE_ESCAPE_SEQUENCE; - extern const int CANNOT_READ_ALL_DATA; - extern const int CANNOT_PARSE_INPUT_ASSERTION_FAILED; + extern const int BAD_ARGUMENTS; + extern const int LOGICAL_ERROR; } @@ -119,16 +118,21 @@ std::vector parseShape(String shapeString) shapeString.erase(std::remove(shapeString.begin(), shapeString.end(), ')'), shapeString.end()); // Use a string stream to extract integers - std::istringstream ss(shapeString); - int value; - char comma; // to handle commas between values + String value; std::vector shape; - while (ss >> value) { - shape.push_back(value); - ss >> comma; // read the comma + size_t start = 0, end = 0; + + while ((end = shapeString.find(',', start)) != std::string::npos) + { + shape.push_back(std::stoi(shapeString.substr(start, end - start))); + start = end + 1; } + + // Add the last token (or the only token if no delimiter is found) + if (start != shapeString.length()) + shape.push_back(std::stoi(shapeString.substr(start))); return shape; } diff --git a/tests/queries/0_stateless/02895_npy_format.reference b/tests/queries/0_stateless/02895_npy_format.reference new file mode 100644 index 00000000000..3813c3b2fcf --- /dev/null +++ b/tests/queries/0_stateless/02895_npy_format.reference @@ -0,0 +1,28 @@ +1 +2 +3 +1.1 +2.2 +3.3 +1 +a +c +1 +a +c +[1,2,3] +[4,5,6] +[1.1,2.22,3.33] +[4.4,5.5,6.6] +['a','b','c'] +['e','f','g'] +['a','b','c'] +['e','f','g'] +[1] +[0] +[1] +[0] +[0,0,0] +[0,0,0] +[[1,2],[3,4]] +[[5,6],[7,8]] diff --git a/tests/queries/0_stateless/02895_npy_format.sh b/tests/queries/0_stateless/02895_npy_format.sh new file mode 100755 index 
00000000000..644c42cb332 --- /dev/null +++ b/tests/queries/0_stateless/02895_npy_format.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim.npy')" +$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim_float.npy')" +$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim_str.npy')" +$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim_unicode.npy')" +$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/two_dim.npy')" +$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/two_dim_float.npy')" +$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/two_dim_str.npy')" +$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/two_dim_unicode.npy')" +$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/two_dim_bool.npy')" +$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/two_dim_null.npy')" +$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/three_dim.npy')" diff --git a/tests/queries/0_stateless/data_npy/one_dim.npy b/tests/queries/0_stateless/data_npy/one_dim.npy new file mode 100644 index 0000000000000000000000000000000000000000..80c6ff86eba9352a6bb1b027bcde26b38dc749fc GIT binary patch literal 152 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlWC!@qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= eXCxM+0{I%oI+{8PwF(pfE=C3jV1m-jP#OT|KpSTO literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_npy/one_dim_float.npy b/tests/queries/0_stateless/data_npy/one_dim_float.npy new file mode 100644 index 0000000000000000000000000000000000000000..0e7fc14e59e069e932be5b7fa14d37896493396a GIT binary patch literal 152 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= jXCxM+0{I%oI+{8PwF(pfu30ld;G;c+W^_mc11<*uhzKUD literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_npy/one_dim_str.npy 
b/tests/queries/0_stateless/data_npy/one_dim_str.npy new file mode 100644 index 0000000000000000000000000000000000000000..010cb52787e8012c7d29beffdeebec8b3d8ba7f2 GIT binary patch literal 131 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1J6KtriqoAIaUsO_*m=~X4l#&V(cT3DEP6dh= YXCxM+0{I%oI+{8PwF(pfF2lrR0Jy^(c>n+a literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_npy/one_dim_unicode.npy b/tests/queries/0_stateless/data_npy/one_dim_unicode.npy new file mode 100644 index 0000000000000000000000000000000000000000..d0245df237cb5e322703e95ca86fbec89eb695f9 GIT binary patch literal 140 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1Z6KbfgqoAIaUsO_*m=~X4l#&V(cT3DEP6dh= eXCxM+0{I%oI+{8PwF(pfE<*+ehD0Dv24Vo1${afY literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_npy/three_dim.npy b/tests/queries/0_stateless/data_npy/three_dim.npy new file mode 100644 index 0000000000000000000000000000000000000000..07a6b2bec4abac9c6658ca408675f4db224be576 GIT binary patch literal 192 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlWC!@qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= uXCxM+0{I$7Itn19siRP``ItsN4WCJcn1_)q+(#%kr1xm9*X*MVg0D$frhyVZp literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_npy/two_dim_bool.npy b/tests/queries/0_stateless/data_npy/two_dim_bool.npy new file mode 100644 index 0000000000000000000000000000000000000000..d2f28ce6fa37c13f521c330fb82f19614761c336 GIT binary patch literal 132 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1JlVqr_qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= bXCxM+0{I#yItqrGItsN4WCJcn21W(|-0mBv literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_npy/two_dim_float.npy b/tests/queries/0_stateless/data_npy/two_dim_float.npy new file mode 100644 index 0000000000000000000000000000000000000000..2e2b58513cd908f489caa50d51b10e6ac13e381a GIT binary patch literal 176 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= zXCxM+0{I$7Its>`ItsN4WCN~QGeO{^{o$_~G5v=a9hMZ>a$R4{EbYK7jF^4oT Hka7S3o-HmQ literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_npy/two_dim_null.npy 
b/tests/queries/0_stateless/data_npy/two_dim_null.npy new file mode 100644 index 0000000000000000000000000000000000000000..b78b95af03ea608ca2d61fc3e30b8171de297a4f GIT binary patch literal 176 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlWC!@qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= aXCxM+0{I$7Its>`ItsN4WCJb+5&-~zE*o`ItsN4WCO0mq~z4JbO05}9qs@C literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_npy/two_dim_unicode.npy b/tests/queries/0_stateless/data_npy/two_dim_unicode.npy new file mode 100644 index 0000000000000000000000000000000000000000..f8558d810c5766caa8c931434403d833114b0bf1 GIT binary patch literal 152 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1Z6KbfgqoAIaUsO_*m=~X4l#&V(cT3DEP6dh= oXCxM+0{I$7Its>`ItsN4WCN~51_p*CAWjD2R3J_R;&dPe0RI9VumAu6 literal 0 HcmV?d00001