From 7b43608e1ff290073f1486dd773bde1164c13cf8 Mon Sep 17 00:00:00 2001 From: yandd <982724342@qq.com> Date: Wed, 24 Nov 2021 21:18:38 +0800 Subject: [PATCH 1/2] Fixed function ngrams --- .../functions/splitting-merging-functions.md | 2 +- .../functions/splitting-merging-functions.md | 2 +- src/Functions/tokenExtractors.cpp | 20 ++++++------------- 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/docs/en/sql-reference/functions/splitting-merging-functions.md b/docs/en/sql-reference/functions/splitting-merging-functions.md index b8ec276c7f9..7a4e04bbf6c 100644 --- a/docs/en/sql-reference/functions/splitting-merging-functions.md +++ b/docs/en/sql-reference/functions/splitting-merging-functions.md @@ -290,7 +290,7 @@ ngrams(string, ngramsize) - Array with n-grams. -Type: [Array](../../sql-reference/data-types/array.md)([FixedString](../../sql-reference/data-types/fixedstring.md)). +Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). **Example** diff --git a/docs/ru/sql-reference/functions/splitting-merging-functions.md b/docs/ru/sql-reference/functions/splitting-merging-functions.md index b3f91077dfa..2b3911b0c3e 100644 --- a/docs/ru/sql-reference/functions/splitting-merging-functions.md +++ b/docs/ru/sql-reference/functions/splitting-merging-functions.md @@ -252,7 +252,7 @@ ngrams(string, ngramsize) - Массив с n-граммами. -Тип: [Array](../../sql-reference/data-types/array.md)([FixedString](../../sql-reference/data-types/fixedstring.md)). +Тип: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). **Пример** diff --git a/src/Functions/tokenExtractors.cpp b/src/Functions/tokenExtractors.cpp index be232c3b742..5902cb5f3e5 100644 --- a/src/Functions/tokenExtractors.cpp +++ b/src/Functions/tokenExtractors.cpp @@ -68,17 +68,9 @@ public: "Function {} second argument type should be constant UInt. Actual {}", getName(), arguments[1].type->getName()); - - Field ngram_argument_value; - ngram_argument_column->get(0, ngram_argument_value); - auto ngram_value = ngram_argument_value.safeGet(); - - return std::make_shared(std::make_shared(ngram_value)); - } - else - { - return std::make_shared(std::make_shared()); } + + return std::make_shared(std::make_shared()); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override @@ -93,16 +85,16 @@ public: NgramTokenExtractor extractor(ngram_value); - auto result_column_fixed_string = ColumnFixedString::create(ngram_value); + auto result_column_string = ColumnString::create(); auto input_column = arguments[0].column; if (const auto * column_string = checkAndGetColumn(input_column.get())) - executeImpl(extractor, *column_string, *result_column_fixed_string, *column_offsets); + executeImpl(extractor, *column_string, *result_column_string, *column_offsets); else if (const auto * column_fixed_string = checkAndGetColumn(input_column.get())) - executeImpl(extractor, *column_fixed_string, *result_column_fixed_string, *column_offsets); + executeImpl(extractor, *column_fixed_string, *result_column_string, *column_offsets); - return ColumnArray::create(std::move(result_column_fixed_string), std::move(column_offsets)); + return ColumnArray::create(std::move(result_column_string), std::move(column_offsets)); } else { From f1914bb4232ca8717263369f8d80050405019d4b Mon Sep 17 00:00:00 2001 From: yandd <982724342@qq.com> Date: Thu, 25 Nov 2021 10:18:47 +0800 Subject: [PATCH 2/2] Added function ngrams test for utf8 characters --- .../0_stateless/02027_ngrams.reference | 28 ++++++++++++++++--- tests/queries/0_stateless/02027_ngrams.sql | 20 +++++++++++++ 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/02027_ngrams.reference b/tests/queries/0_stateless/02027_ngrams.reference index 4c6afcdf02c..b9b4cc1b098 100644 --- a/tests/queries/0_stateless/02027_ngrams.reference +++ b/tests/queries/0_stateless/02027_ngrams.reference @@ -3,18 +3,38 @@ ['Tes','est'] ['Test'] [] +['😁','😈','😁','😈'] +['😁😈','😈😁','😁😈'] +['😁😈😁','😈😁😈'] +['😁😈😁😈'] +[] ['T','e','s','t'] ['Te','es','st'] ['Tes','est'] ['Test'] [] -['T','e','s','t'] -['Te','es','st'] -['Tes','est'] -['Test'] +['😁','😈','😁','😈'] +['😁😈','😈😁','😁😈'] +['😁😈😁','😈😁😈'] +['😁😈😁😈'] [] ['T','e','s','t'] ['Te','es','st'] ['Tes','est'] ['Test'] [] +['😁','😈','😁','😈'] +['😁😈','😈😁','😁😈'] +['😁😈😁','😈😁😈'] +['😁😈😁😈'] +[] +['T','e','s','t'] +['Te','es','st'] +['Tes','est'] +['Test'] +[] +['😁','😈','😁','😈'] +['😁😈','😈😁','😁😈'] +['😁😈😁','😈😁😈'] +['😁😈😁😈'] +[] diff --git a/tests/queries/0_stateless/02027_ngrams.sql b/tests/queries/0_stateless/02027_ngrams.sql index b9ce36272d8..f7a6550e63c 100644 --- a/tests/queries/0_stateless/02027_ngrams.sql +++ b/tests/queries/0_stateless/02027_ngrams.sql @@ -3,21 +3,41 @@ SELECT ngrams('Test', 2); SELECT ngrams('Test', 3); SELECT ngrams('Test', 4); SELECT ngrams('Test', 5); +SELECT ngrams('😁😈😁😈', 1); +SELECT ngrams('😁😈😁😈', 2); +SELECT ngrams('😁😈😁😈', 3); +SELECT ngrams('😁😈😁😈', 4); +SELECT ngrams('😁😈😁😈', 5); SELECT ngrams(materialize('Test'), 1); SELECT ngrams(materialize('Test'), 2); SELECT ngrams(materialize('Test'), 3); SELECT ngrams(materialize('Test'), 4); SELECT ngrams(materialize('Test'), 5); +SELECT ngrams(materialize('😁😈😁😈'), 1); +SELECT ngrams(materialize('😁😈😁😈'), 2); +SELECT ngrams(materialize('😁😈😁😈'), 3); +SELECT ngrams(materialize('😁😈😁😈'), 4); +SELECT ngrams(materialize('😁😈😁😈'), 5); SELECT ngrams(toFixedString('Test', 4), 1); SELECT ngrams(toFixedString('Test', 4), 2); SELECT ngrams(toFixedString('Test', 4), 3); SELECT ngrams(toFixedString('Test', 4), 4); SELECT ngrams(toFixedString('Test', 4), 5); +SELECT ngrams(toFixedString('😁😈😁😈', 16), 1); +SELECT ngrams(toFixedString('😁😈😁😈', 16), 2); +SELECT ngrams(toFixedString('😁😈😁😈', 16), 3); +SELECT ngrams(toFixedString('😁😈😁😈', 16), 4); +SELECT ngrams(toFixedString('😁😈😁😈', 16), 5); SELECT ngrams(materialize(toFixedString('Test', 4)), 1); SELECT ngrams(materialize(toFixedString('Test', 4)), 2); SELECT ngrams(materialize(toFixedString('Test', 4)), 3); SELECT ngrams(materialize(toFixedString('Test', 4)), 4); SELECT ngrams(materialize(toFixedString('Test', 4)), 5); +SELECT ngrams(materialize(toFixedString('😁😈😁😈', 16)), 1); +SELECT ngrams(materialize(toFixedString('😁😈😁😈', 16)), 2); +SELECT ngrams(materialize(toFixedString('😁😈😁😈', 16)), 3); +SELECT ngrams(materialize(toFixedString('😁😈😁😈', 16)), 4); +SELECT ngrams(materialize(toFixedString('😁😈😁😈', 16)), 5); \ No newline at end of file