diff --git a/.github/workflows/tags_stable.yml b/.github/workflows/tags_stable.yml index 05ffc1df56f..01b75039963 100644 --- a/.github/workflows/tags_stable.yml +++ b/.github/workflows/tags_stable.yml @@ -35,8 +35,9 @@ jobs: GID=$(id -g "${UID}") docker run -u "${UID}:${GID}" -e PYTHONUNBUFFERED=1 \ --volume="${GITHUB_WORKSPACE}:/ClickHouse" clickhouse/style-test \ - /ClickHouse/utils/changelog/changelog.py -vv --gh-user-or-token="$GITHUB_TOKEN" \ - --output="/ClickHouse/docs/changelogs/${GITHUB_TAG}.md" --jobs=5 "${GITHUB_TAG}" + /ClickHouse/utils/changelog/changelog.py -v --debug-helpers \ + --gh-user-or-token="$GITHUB_TOKEN" --jobs=5 \ + --output="/ClickHouse/docs/changelogs/${GITHUB_TAG}.md" "${GITHUB_TAG}" git add "./docs/changelogs/${GITHUB_TAG}.md" git diff HEAD - name: Create Pull Request diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 6b01ee31501..6597cd2d6bf 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -1822,10 +1822,13 @@ Result: Evaluate external model. Accepts a model name and model arguments. Returns Float64. -## throwIf(x\[, custom_message\]) +## throwIf(x\[, message\[, error_code\]\]) Throw an exception if the argument is non zero. -custom_message - is an optional parameter: a constant string, provides an error message +`message` - is an optional parameter: a constant string providing a custom error message +`error_code` - is an optional parameter: a constant integer providing a custom error code + +To use the `error_code` argument, configuration parameter `allow_custom_error_code_in_throwif` must be enabled. 
``` sql SELECT throwIf(number = 3, 'Too many') FROM numbers(10); diff --git a/docs/ru/sql-reference/functions/other-functions.md b/docs/ru/sql-reference/functions/other-functions.md index 30343c85c5a..bfe0d3d463b 100644 --- a/docs/ru/sql-reference/functions/other-functions.md +++ b/docs/ru/sql-reference/functions/other-functions.md @@ -1727,10 +1727,13 @@ SELECT joinGet(db_test.id_val,'val',toUInt32(number)) from numbers(4) SETTINGS j Принимает на вход имя и аргументы модели. Возвращает Float64. -## throwIf(x\[, custom_message\]) {#throwifx-custom-message} +## throwIf(x\[, message\[, error_code\]\]) {#throwifx-custom-message} Бросает исключение, если аргумент не равен нулю. -custom_message - необязательный параметр, константная строка, задает текст сообщения об ошибке. +`message` - необязательный параметр, константная строка, задает текст сообщения об ошибке. +`error_code` - необязательный параметр, константное число, задает код ошибки. + +Чтобы использовать аргумент `error_code`, должен быть включен параметр конфигурации `allow_custom_error_code_in_throwif`. ``` sql SELECT throwIf(number = 3, 'Too many') FROM numbers(10); diff --git a/src/Core/Settings.h b/src/Core/Settings.h index a905595b77c..75062ce9c94 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -408,6 +408,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(UInt64, low_cardinality_max_dictionary_size, 8192, "Maximum size (in rows) of shared global dictionary for LowCardinality type.", 0) \ M(Bool, low_cardinality_use_single_dictionary_for_part, false, "LowCardinality type serialization setting. If is true, than will use additional keys when global dictionary overflows. Otherwise, will create several shared dictionaries.", 0) \ M(Bool, decimal_check_overflow, true, "Check overflow of decimal arithmetic/comparison operations", 0) \ + M(Bool, allow_custom_error_code_in_throwif, false, "Enable custom error code in function throwIf(). 
If true, thrown exceptions may have unexpected error codes.", 0) \ \ M(Bool, prefer_localhost_replica, true, "If it's true then queries will be always sent to local replica (if it exists). If it's false then replica to send a query will be chosen between local and remote ones according to load_balancing", 0) \ M(UInt64, max_fetch_partition_retries_count, 5, "Amount of retries while fetching partition from another host.", 0) \ @@ -713,7 +714,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, input_format_orc_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format ORC", 0) \ M(Bool, input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format Arrow", 0) \ M(String, column_names_for_schema_inference, "", "The list of column names to use in schema inference for formats without column names. The format: 'column1,column2,column3,...'", 0) \ - M(String, schema_inference_hints, "", "The list of column names and types to use in schema inference for formats without column names. The format: 'column1,column2,column3,...'", 0) \ + M(String, schema_inference_hints, "", "The list of column names and types to use in schema inference for formats without column names. 
The format: 'column_name1 column_type1, column_name2 column_type2, ...'", 0) \ M(Bool, input_format_json_read_bools_as_numbers, true, "Allow to parse bools as numbers in JSON input formats", 0) \ M(Bool, input_format_json_try_infer_numbers_from_strings, true, "Try to infer numbers from string fields while schema inference", 0) \ M(Bool, input_format_try_infer_integers, true, "Try to infer numbers from string fields while schema inference in text formats", 0) \ diff --git a/src/Functions/FunctionDateOrDateTimeAddInterval.h b/src/Functions/FunctionDateOrDateTimeAddInterval.h index fbfc9e9bc1f..c0c000452fc 100644 --- a/src/Functions/FunctionDateOrDateTimeAddInterval.h +++ b/src/Functions/FunctionDateOrDateTimeAddInterval.h @@ -611,7 +611,7 @@ public: ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); if (!isNativeNumber(arguments[1].type)) - throw Exception("Second argument for function " + getName() + " (delta) must be number", + throw Exception("Second argument for function " + getName() + " (delta) must be a number", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); if (arguments.size() == 2) @@ -627,7 +627,7 @@ public: { throw Exception( "Function " + getName() + " supports 2 or 3 arguments. The 1st argument " - "must be of type Date or DateTime. The 2nd argument must be number. " + "must be of type Date or DateTime. The 2nd argument must be a number. " "The 3rd argument (optional) must be " "a constant string with timezone name. 
The timezone argument is allowed " "only when the 1st argument has the type DateTime", diff --git a/src/Functions/FunctionNumericPredicate.h b/src/Functions/FunctionNumericPredicate.h index 9d98f0c929d..496564b0b55 100644 --- a/src/Functions/FunctionNumericPredicate.h +++ b/src/Functions/FunctionNumericPredicate.h @@ -46,7 +46,7 @@ public: DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { if (!isNativeNumber(arguments.front())) - throw Exception{"Argument for function " + getName() + " must be number", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; + throw Exception{"Argument for function " + getName() + " must be a number", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; return std::make_shared(); } diff --git a/src/Functions/makeDate.cpp b/src/Functions/makeDate.cpp index 5b04ed05a3c..34c681736f0 100644 --- a/src/Functions/makeDate.cpp +++ b/src/Functions/makeDate.cpp @@ -62,7 +62,7 @@ protected: DataTypePtr argument_type = arguments[i].type; if (!isNumber(argument_type)) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Argument '{}' for function {} must be number", std::string(argument_names[i]), getName()); + "Argument '{}' for function {} must be a number", std::string(argument_names[i]), getName()); } } @@ -322,7 +322,7 @@ public: const auto& fraction_argument = arguments[argument_names.size()]; if (!isNumber(fraction_argument.type)) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Argument 'fraction' for function {} must be number", getName()); + "Argument 'fraction' for function {} must be a number", getName()); } /// Optional precision argument diff --git a/src/Functions/throwIf.cpp b/src/Functions/throwIf.cpp index bda8426150e..692faf1883c 100644 --- a/src/Functions/throwIf.cpp +++ b/src/Functions/throwIf.cpp @@ -4,9 +4,10 @@ #include #include #include +#include #include #include - +#include namespace DB { @@ -21,55 +22,56 @@ namespace ErrorCodes namespace { +/// The regex-based code style check script in CI complains when it sees 
"ErrorCodes:: ErrorCode" (space added to avoid another match). +/// Because this expression is only used in this file, don't add some suppression mechanism to the already complex style checker, instead +/// work around by creating a namespace alias. +namespace ErrorCodeAlias = ErrorCodes; + /// Throw an exception if the argument is non zero. class FunctionThrowIf : public IFunction { public: static constexpr auto name = "throwIf"; - static FunctionPtr create(ContextPtr) - { - return std::make_shared(); - } - String getName() const override - { - return name; - } + static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } + explicit FunctionThrowIf(ContextPtr context_) : allow_custom_error_code_argument(context_->getSettingsRef().allow_custom_error_code_in_throwif) {} + String getName() const override { return name; } bool isVariadic() const override { return true; } bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } - size_t getNumberOfArguments() const override - { - return 0; - } + size_t getNumberOfArguments() const override { return 0; } DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { const size_t number_of_arguments = arguments.size(); - if (number_of_arguments < 1 || number_of_arguments > 2) + if (number_of_arguments < 1 || number_of_arguments > (allow_custom_error_code_argument ? 3 : 2)) throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Number of arguments for function {} doesn't match: passed {}, should be 1 or 2", - getName(), - toString(number_of_arguments)); + "Number of arguments for function {} doesn't match: passed {}, should be {}", + getName(), toString(number_of_arguments), allow_custom_error_code_argument ? 
"1 or 2 or 3" : "1 or 2"); if (!isNativeNumber(arguments[0])) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Argument for function {} must be number", - getName()); + "First argument of function {} must be a number (passed: {})", getName(), arguments[0]->getName()); if (number_of_arguments > 1 && !isString(arguments[1])) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of argument of function {}", - arguments[1]->getName(), - getName()); + "Second argument of function {} must be a string (passed: {})", getName(), arguments[1]->getName()); + + if (allow_custom_error_code_argument && number_of_arguments > 2) + { + WhichDataType which(arguments[2]); + if (!(which.isInt8() || which.isInt16() || which.isInt32())) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Third argument of function {} must be Int8, Int16 or Int32 (passed: {})", getName(), arguments[2]->getName()); + } return std::make_shared(); } bool useDefaultImplementationForConstants() const override { return false; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2}; } /** Prevent constant folding for FunctionThrowIf because for short circuit evaluation * it is unsafe to evaluate this function during DAG analysis. 
@@ -86,36 +88,44 @@ public: { const auto * message_column = checkAndGetColumnConst(arguments[1].column.get()); if (!message_column) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, - "Second argument for function {} must be constant String", - getName()); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Second argument for function {} must be constant String", getName()); custom_message = message_column->getValue(); } + std::optional custom_error_code; + if (allow_custom_error_code_argument && arguments.size() == 3) + { + if (!isColumnConst(*(arguments[2].column))) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Third argument for function {} must be constant number", getName()); + + custom_error_code = arguments[2].column->getInt(0); + } + auto first_argument_column = arguments.front().column; const auto * in = first_argument_column.get(); ColumnPtr res; - if (!((res = execute(in, custom_message)) - || (res = execute(in, custom_message)) - || (res = execute(in, custom_message)) - || (res = execute(in, custom_message)) - || (res = execute(in, custom_message)) - || (res = execute(in, custom_message)) - || (res = execute(in, custom_message)) - || (res = execute(in, custom_message)) - || (res = execute(in, custom_message)) - || (res = execute(in, custom_message)))) + if (!((res = execute(in, custom_message, custom_error_code)) + || (res = execute(in, custom_message, custom_error_code)) + || (res = execute(in, custom_message, custom_error_code)) + || (res = execute(in, custom_message, custom_error_code)) + || (res = execute(in, custom_message, custom_error_code)) + || (res = execute(in, custom_message, custom_error_code)) + || (res = execute(in, custom_message, custom_error_code)) + || (res = execute(in, custom_message, custom_error_code)) + || (res = execute(in, custom_message, custom_error_code)) + || (res = execute(in, custom_message, custom_error_code)))) { - throw Exception{"Illegal column " + in->getName() + " of first argument of function " + getName(), 
ErrorCodes::ILLEGAL_COLUMN}; + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", in->getName(), getName()); } return res; } +private: template - ColumnPtr execute(const IColumn * in_untyped, const std::optional & message) const + ColumnPtr execute(const IColumn * in_untyped, const std::optional & message, const std::optional & error_code) const { const auto * in = checkAndGetColumn>(in_untyped); @@ -127,8 +137,9 @@ public: const auto & in_data = in->getData(); if (!memoryIsZero(in_data.data(), 0, in_data.size() * sizeof(in_data[0]))) { - throw Exception(ErrorCodes::FUNCTION_THROW_IF_VALUE_IS_NON_ZERO, - message.value_or("Value passed to '" + getName() + "' function is non zero")); + throw Exception( + error_code.value_or(ErrorCodes::FUNCTION_THROW_IF_VALUE_IS_NON_ZERO), + message.value_or("Value passed to '" + getName() + "' function is non-zero")); } size_t result_size = in_untyped->size(); @@ -139,6 +150,8 @@ public: return nullptr; } + + bool allow_custom_error_code_argument; }; } diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index a6ca5bcf916..23b07116df3 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -70,6 +70,7 @@ namespace ErrorCodes extern const int THERE_IS_NO_COLUMN; extern const int UNKNOWN_EXCEPTION; extern const int INCORRECT_NUMBER_OF_COLUMNS; + extern const int INCORRECT_DATA; } /// Inserts numeric data right into internal column data to reduce an overhead @@ -266,6 +267,9 @@ static ColumnWithTypeAndName readColumnWithDecimalData(std::shared_ptr & arrow_column) { + if (!arrow_column->null_count()) + return ColumnUInt8::create(arrow_column->length(), 0); + auto nullmap_column = ColumnUInt8::create(); PaddedPODArray & bytemap_data = assert_cast &>(*nullmap_column).getData(); bytemap_data.reserve(arrow_column->length()); @@ -298,14 +302,121 @@ 
static ColumnPtr readOffsetsFromArrowListColumn(std::shared_ptr & arrow_column) +/* + * Arrow Dictionary and ClickHouse LowCardinality types are a bit different. + * Dictionary(Nullable(X)) in ArrowColumn format is composed of a nullmap, dictionary and an index. + * It doesn't have the concept of null or default values. + * An empty string is just a regular value appended at any position of the dictionary. + * Null values have an index of 0, but it should be ignored since the nullmap will return null. + * In ClickHouse LowCardinality, it's different. The dictionary contains null (if dictionary type is Nullable) + * and default values at the beginning. [default, ...] when default values have index of 0 or [null, default, ...] + * when null values have an index of 0 and default values have an index of 1. + * So, we should remap indexes while converting Arrow Dictionary to ClickHouse LowCardinality + * */ +template > +static ColumnWithTypeAndName readColumnWithIndexesDataImpl(std::shared_ptr & arrow_column, const String & column_name, Int64 default_value_index, NumericType dict_size, bool is_nullable) +{ + auto internal_type = std::make_shared>(); + auto internal_column = internal_type->createColumn(); + auto & column_data = static_cast(*internal_column).getData(); + column_data.reserve(arrow_column->length()); + NumericType shift = is_nullable ? 
2 : 1; + + for (size_t chunk_i = 0, num_chunks = static_cast(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i) + { + std::shared_ptr chunk = arrow_column->chunk(chunk_i); + if (chunk->length() == 0) + continue; + + /// buffers[0] is a null bitmap and buffers[1] are actual values + std::shared_ptr buffer = chunk->data()->buffers[1]; + const auto * data = reinterpret_cast(buffer->data()); + + /// Check that indexes are correct (protection against corrupted files) + for (int64_t i = 0; i != chunk->length(); ++i) + { + if (data[i] < 0 || data[i] >= dict_size) + throw Exception(ErrorCodes::INCORRECT_DATA, "Index {} in Dictionary column is out of bounds, dictionary size is {}", Int64(data[i]), UInt64(dict_size)); + } + + /// If dictionary type is not nullable and arrow dictionary contains default type + /// at 0 index, we don't need to remap anything (it's the case when this data + /// was generated by ClickHouse) + if (!is_nullable && default_value_index == 0) + { + column_data.insert_assume_reserved(data, data + chunk->length()); + } + /// If dictionary doesn't contain default value, we should move all indexes + /// to the right one or two (if dictionary is Nullable) positions + /// Example: + /// Dictionary: + /// dict: ["one", "two"] + /// indexes: [0, 1, 0] + /// LowCardinality: + /// dict: ["", "one", "two"] + /// indexes: [1, 2, 1] + /// LowCardinality(Nullable): + /// dict: [null, "", "one", "two"] + /// indexes: [2, 3, 2] + else if (default_value_index == -1) + { + for (int64_t i = 0; i != chunk->length(); ++i) + { + if (chunk->IsNull(i)) + column_data.push_back(0); + else + column_data.push_back(data[i] + shift); + } + } + /// If dictionary contains default value, we change all indexes of it to + /// 0 or 1 (if dictionary type is Nullable) and move all indexes + /// that are less than default value index to the right one or two + /// (if dictionary is Nullable) position and all indexes that are + /// greater than default value index zero or one (if 
dictionary is Nullable) + /// positions. + /// Example: + /// Dictionary: + /// dict: ["one", "two", "", "three"] + /// indexes: [0, 1, 2, 3, 0] + /// LowCardinality : + /// dict: ["", "one", "two", "three"] + /// indexes: [1, 2, 0, 3, 1] + /// LowCardinality(Nullable): + /// dict: [null, "", "one", "two", "three"] + /// indexes: [2, 3, 1, 4, 2] + else + { + NumericType new_default_index = is_nullable ? 1 : 0; + NumericType default_index = NumericType(default_value_index); + for (int64_t i = 0; i != chunk->length(); ++i) + { + if (chunk->IsNull(i)) + column_data.push_back(0); + else + { + NumericType value = data[i]; + if (value == default_index) + value = new_default_index; + else if (value < default_index) + value += shift; + else + value += shift - 1; + column_data.push_back(value); + } + } + } + } + return {std::move(internal_column), std::move(internal_type), column_name}; +} + +static ColumnPtr readColumnWithIndexesData(std::shared_ptr & arrow_column, Int64 default_value_index, UInt64 dict_size, bool is_nullable) { switch (arrow_column->type()->id()) { # define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \ case ARROW_NUMERIC_TYPE: \ { \ - return readColumnWithNumericData(arrow_column, "").column; \ + return readColumnWithIndexesDataImpl(arrow_column, "", default_value_index, dict_size, is_nullable).column; \ } FOR_ARROW_INDEXES_TYPES(DISPATCH) # undef DISPATCH @@ -327,85 +438,25 @@ static std::shared_ptr getNestedArrowColumn(std::shared_ptr return std::make_shared(array_vector); } -static ColumnWithTypeAndName createLCColumnFromArrowDictionaryValues( - const std::shared_ptr & dict_values, - const ColumnPtr & indexes_column, - const String & column_name -) -{ - auto lc_type = std::make_shared(dict_values->type); - - auto lc_column = lc_type->createColumn(); - - for (auto i = 0u; i < indexes_column->size(); i++) - { - Field f; - dict_values->column->get(indexes_column->getUInt(i), f); - lc_column->insert(f); - } - - return {std::move(lc_column), 
std::move(lc_type), column_name}; -} - -/* - * Dictionary(Nullable(X)) in ArrowColumn format is composed of a nullmap, dictionary and an index. - * It doesn't have the concept of null or default values. - * An empty string is just a regular value appended at any position of the dictionary. - * Null values have an index of 0, but it should be ignored since the nullmap will return null. - * In ClickHouse LowCardinality, it's different. The dictionary contains null and default values at the beginning. - * [null, default, ...]. Therefore, null values have an index of 0 and default values have an index of 1. - * No nullmap is used. - * */ -static ColumnWithTypeAndName createLCOfNullableColumnFromArrowDictionaryValues( - const std::shared_ptr & dict_values, - const ColumnPtr & indexes_column, - const ColumnPtr & nullmap_column, - const String & column_name -) -{ - /* - * ArrowColumn format handles nulls by maintaining a nullmap column, there is no nullable type. - * Therefore, dict_values->type is the actual data type/ non-nullable. It needs to be transformed into nullable - * so LC column is created from nullable type and a null value at the beginning of the collection - * is automatically added. 
- * */ - auto lc_type = std::make_shared(makeNullable(dict_values->type)); - - auto lc_column = lc_type->createColumn(); - - for (auto i = 0u; i < indexes_column->size(); i++) - { - if (nullmap_column && nullmap_column->getBool(i)) - { - lc_column->insertDefault(); - } - else - { - Field f; - dict_values->column->get(indexes_column->getUInt(i), f); - lc_column->insert(f); - } - } - - return {std::move(lc_column), std::move(lc_type), column_name}; -} - static ColumnWithTypeAndName readColumnFromArrowColumn( std::shared_ptr & arrow_column, const std::string & column_name, const std::string & format_name, bool is_nullable, - std::unordered_map> & dictionary_values, - bool read_ints_as_dates, + std::unordered_map & dictionary_infos, bool allow_null_type, bool skip_columns_with_unsupported_types, - bool & skipped) + bool & skipped, + DataTypePtr type_hint = nullptr) { - if (!is_nullable && arrow_column->null_count() && arrow_column->type()->id() != arrow::Type::LIST + if (!is_nullable && (arrow_column->null_count() || (type_hint && type_hint->isNullable())) && arrow_column->type()->id() != arrow::Type::LIST && arrow_column->type()->id() != arrow::Type::MAP && arrow_column->type()->id() != arrow::Type::STRUCT && arrow_column->type()->id() != arrow::Type::DICTIONARY) { - auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped); + DataTypePtr nested_type_hint; + if (type_hint) + nested_type_hint = removeNullable(type_hint); + auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped, nested_type_hint); if (skipped) return {}; auto nullmap_column = readByteMapFromArrowColumn(arrow_column); @@ -435,14 +486,14 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( case arrow::Type::UINT16: { auto column = 
readColumnWithNumericData(arrow_column, column_name); - if (read_ints_as_dates) + if (type_hint && (isDateOrDate32(type_hint) || isDateTime(type_hint) || isDateTime64(type_hint))) column.type = std::make_shared(); return column; } case arrow::Type::UINT32: { auto column = readColumnWithNumericData(arrow_column, column_name); - if (read_ints_as_dates) + if (type_hint && (isDateOrDate32(type_hint) || isDateTime(type_hint) || isDateTime64(type_hint))) column.type = std::make_shared(); return column; } @@ -454,8 +505,15 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( return readColumnWithDecimalData(arrow_column, column_name); case arrow::Type::MAP: { + DataTypePtr nested_type_hint; + if (type_hint) + { + const auto * map_type_hint = typeid_cast(type_hint.get()); + if (map_type_hint) + nested_type_hint = assert_cast(map_type_hint->getNestedType().get())->getNestedType(); + } auto arrow_nested_column = getNestedArrowColumn(arrow_column); - auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped); + auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped, nested_type_hint); if (skipped) return {}; @@ -469,8 +527,15 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( } case arrow::Type::LIST: { + DataTypePtr nested_type_hint; + if (type_hint) + { + const auto * array_type_hint = typeid_cast(type_hint.get()); + if (array_type_hint) + nested_type_hint = array_type_hint->getNestedType(); + } auto arrow_nested_column = getNestedArrowColumn(arrow_column); - auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped); + auto nested_column = 
readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped, nested_type_hint); if (skipped) return {}; auto offsets_column = readOffsetsFromArrowListColumn(arrow_column); @@ -493,11 +558,25 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( Columns tuple_elements; DataTypes tuple_types; std::vector tuple_names; + const auto * tuple_type_hint = type_hint ? typeid_cast(type_hint.get()) : nullptr; for (int i = 0; i != arrow_struct_type->num_fields(); ++i) { + auto field_name = arrow_struct_type->field(i)->name(); + DataTypePtr nested_type_hint; + if (tuple_type_hint) + { + if (tuple_type_hint->haveExplicitNames()) + { + auto pos = tuple_type_hint->tryGetPositionByName(field_name); + if (pos) + nested_type_hint = tuple_type_hint->getElement(*pos); + } + else if (size_t(i) < tuple_type_hint->getElements().size()) + nested_type_hint = tuple_type_hint->getElement(i); + } auto nested_arrow_column = std::make_shared(nested_arrow_columns[i]); - auto element = readColumnFromArrowColumn(nested_arrow_column, arrow_struct_type->field(i)->name(), format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped); + auto element = readColumnFromArrowColumn(nested_arrow_column, field_name, format_name, false, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped, nested_type_hint); if (skipped) return {}; tuple_elements.emplace_back(std::move(element.column)); @@ -511,9 +590,11 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( } case arrow::Type::DICTIONARY: { - auto & dict_values = dictionary_values[column_name]; + auto & dict_info = dictionary_infos[column_name]; + const auto is_lc_nullable = arrow_column->null_count() > 0 || (type_hint && type_hint->isLowCardinalityNullable()); + /// Load dictionary values only once and reuse it. 
- if (!dict_values) + if (!dict_info.values) { arrow::ArrayVector dict_array; for (size_t chunk_i = 0, num_chunks = static_cast(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i) @@ -522,8 +603,22 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( dict_array.emplace_back(dict_chunk.dictionary()); } auto arrow_dict_column = std::make_shared(dict_array); - auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped); - dict_values = std::make_shared(std::move(dict_column)); + auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped); + for (size_t i = 0; i != dict_column.column->size(); ++i) + { + if (dict_column.column->isDefaultAt(i)) + { + dict_info.default_value_index = i; + break; + } + } + auto lc_type = std::make_shared(is_lc_nullable ? 
makeNullable(dict_column.type) : dict_column.type); + auto tmp_lc_column = lc_type->createColumn(); + auto tmp_dict_column = IColumn::mutate(assert_cast(tmp_lc_column.get())->getDictionaryPtr()); + dynamic_cast(tmp_dict_column.get())->uniqueInsertRangeFrom(*dict_column.column, 0, dict_column.column->size()); + dict_column.column = std::move(tmp_dict_column); + dict_info.values = std::make_shared(std::move(dict_column)); + dict_info.dictionary_size = arrow_dict_column->length(); } arrow::ArrayVector indexes_array; @@ -534,20 +629,10 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( } auto arrow_indexes_column = std::make_shared(indexes_array); - auto indexes_column = readColumnWithIndexesData(arrow_indexes_column); - - const auto contains_null = arrow_column->null_count() > 0; - - if (contains_null) - { - auto nullmap_column = readByteMapFromArrowColumn(arrow_column); - - return createLCOfNullableColumnFromArrowDictionaryValues(dict_values, indexes_column, nullmap_column, column_name); - } - else - { - return createLCColumnFromArrowDictionaryValues(dict_values, indexes_column, column_name); - } + auto indexes_column = readColumnWithIndexesData(arrow_indexes_column, dict_info.default_value_index, dict_info.dictionary_size, is_lc_nullable); + auto lc_column = ColumnLowCardinality::create(dict_info.values->column, indexes_column); + auto lc_type = std::make_shared(is_lc_nullable ? 
makeNullable(dict_info.values->type) : dict_info.values->type); + return {std::move(lc_column), std::move(lc_type), column_name}; } # define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \ case ARROW_NUMERIC_TYPE: \ @@ -623,13 +708,13 @@ Block ArrowColumnToCHColumn::arrowSchemaToCHHeader( arrow::ArrayVector array_vector = {arrow_array}; auto arrow_column = std::make_shared(array_vector); - std::unordered_map> dict_values; + std::unordered_map dict_infos; bool skipped = false; bool allow_null_type = false; if (hint_header && hint_header->has(field->name()) && hint_header->getByName(field->name()).type->isNullable()) allow_null_type = true; ColumnWithTypeAndName sample_column = readColumnFromArrowColumn( - arrow_column, field->name(), format_name, false, dict_values, false, allow_null_type, skip_columns_with_unsupported_types, skipped); + arrow_column, field->name(), format_name, false, dict_infos, allow_null_type, skip_columns_with_unsupported_types, skipped); if (!skipped) sample_columns.emplace_back(std::move(sample_column)); } @@ -700,9 +785,17 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & { if (!nested_tables.contains(search_nested_table_name)) { + NamesAndTypesList nested_columns; + for (const auto & name_and_type : header.getNamesAndTypesList()) + { + if (name_and_type.name.starts_with(nested_table_name + ".")) + nested_columns.push_back(name_and_type); + } + auto nested_table_type = Nested::collect(nested_columns).front().type; + std::shared_ptr arrow_column = name_to_column_ptr[search_nested_table_name]; ColumnsWithTypeAndName cols = {readColumnFromArrowColumn( - arrow_column, nested_table_name, format_name, false, dictionary_values, true, true, false, skipped)}; + arrow_column, nested_table_name, format_name, false, dictionary_infos, true, false, skipped, nested_table_type)}; BlockPtr block_ptr = std::make_shared(cols); auto column_extractor = std::make_shared(*block_ptr, case_insensitive_matching); 
nested_tables[search_nested_table_name] = {block_ptr, column_extractor}; @@ -735,7 +828,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & { auto arrow_column = name_to_column_ptr[search_column_name]; column = readColumnFromArrowColumn( - arrow_column, header_column.name, format_name, false, dictionary_values, true, true, false, skipped); + arrow_column, header_column.name, format_name, false, dictionary_infos, true, false, skipped, header_column.type); } try diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h index 695e14b7bba..092ed65d61a 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h @@ -44,6 +44,14 @@ public: const Block * hint_header = nullptr, bool ignore_case = false); + struct DictionaryInfo + { + std::shared_ptr values; + Int64 default_value_index = -1; + UInt64 dictionary_size; + }; + + private: const Block & header; const std::string format_name; @@ -55,7 +63,7 @@ private: /// Map {column name : dictionary column}. /// To avoid converting dictionary from Arrow Dictionary /// to LowCardinality every chunk we save it and reuse. 
- std::unordered_map> dictionary_values; + std::unordered_map dictionary_infos; }; } diff --git a/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp b/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp index 28f51b9b922..9e423f8a96b 100644 --- a/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp +++ b/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp @@ -235,27 +235,30 @@ namespace DB } template - static PaddedPODArray extractIndexesImpl(ColumnPtr column, size_t start, size_t end) + static PaddedPODArray extractIndexesImpl(ColumnPtr column, size_t start, size_t end, bool shift) { const PaddedPODArray & data = assert_cast *>(column.get())->getData(); PaddedPODArray result; result.reserve(end - start); - std::transform(data.begin() + start, data.begin() + end, std::back_inserter(result), [](T value) { return Int64(value); }); + if (shift) + std::transform(data.begin() + start, data.begin() + end, std::back_inserter(result), [](T value) { return Int64(value) - 1; }); + else + std::transform(data.begin() + start, data.begin() + end, std::back_inserter(result), [](T value) { return Int64(value); }); return result; } - static PaddedPODArray extractIndexesImpl(ColumnPtr column, size_t start, size_t end) + static PaddedPODArray extractIndexesImpl(ColumnPtr column, size_t start, size_t end, bool shift) { switch (column->getDataType()) { case TypeIndex::UInt8: - return extractIndexesImpl(column, start, end); + return extractIndexesImpl(column, start, end, shift); case TypeIndex::UInt16: - return extractIndexesImpl(column, start, end); + return extractIndexesImpl(column, start, end, shift); case TypeIndex::UInt32: - return extractIndexesImpl(column, start, end); + return extractIndexesImpl(column, start, end, shift); case TypeIndex::UInt64: - return extractIndexesImpl(column, start, end); + return extractIndexesImpl(column, start, end, shift); default: throw Exception(fmt::format("Indexes column must be ColumnUInt, got {}.", column->getName()), 
ErrorCodes::LOGICAL_ERROR); @@ -267,7 +270,7 @@ namespace DB const String & column_name, ColumnPtr & column, const std::shared_ptr & column_type, - const PaddedPODArray * null_bytemap, + const PaddedPODArray *, arrow::ArrayBuilder * array_builder, String format_name, size_t start, @@ -278,6 +281,7 @@ namespace DB const auto * column_lc = assert_cast(column.get()); arrow::DictionaryBuilder * builder = assert_cast *>(array_builder); auto & dict_values = dictionary_values[column_name]; + bool is_nullable = column_type->isLowCardinalityNullable(); /// Convert dictionary from LowCardinality to Arrow dictionary only once and then reuse it. if (!dict_values) @@ -288,9 +292,9 @@ namespace DB arrow::Status status = MakeBuilder(pool, value_type, &values_builder); checkStatus(status, column->getName(), format_name); - auto dict_column = column_lc->getDictionary().getNestedColumn(); - const auto & dict_type = assert_cast(column_type.get())->getDictionaryType(); - fillArrowArray(column_name, dict_column, dict_type, nullptr, values_builder.get(), format_name, 0, dict_column->size(), output_string_as_string, dictionary_values); + auto dict_column = column_lc->getDictionary().getNestedNotNullableColumn(); + const auto & dict_type = removeNullable(assert_cast(column_type.get())->getDictionaryType()); + fillArrowArray(column_name, dict_column, dict_type, nullptr, values_builder.get(), format_name, is_nullable, dict_column->size(), output_string_as_string, dictionary_values); status = values_builder->Finish(&dict_values); checkStatus(status, column->getName(), format_name); } @@ -300,15 +304,14 @@ namespace DB /// AppendIndices in DictionaryBuilder works only with int64_t data, so we cannot use /// fillArrowArray here and should copy all indexes to int64_t container. 
- auto indexes = extractIndexesImpl(column_lc->getIndexesPtr(), start, end); + auto indexes = extractIndexesImpl(column_lc->getIndexesPtr(), start, end, is_nullable); const uint8_t * arrow_null_bytemap_raw_ptr = nullptr; PaddedPODArray arrow_null_bytemap; - if (null_bytemap) + if (column_type->isLowCardinalityNullable()) { - /// Invert values since Arrow interprets 1 as a non-null value, while CH as a null arrow_null_bytemap.reserve(end - start); for (size_t i = start; i < end; ++i) - arrow_null_bytemap.emplace_back(!(*null_bytemap)[i]); + arrow_null_bytemap.emplace_back(!column_lc->isNullAt(i)); arrow_null_bytemap_raw_ptr = arrow_null_bytemap.data(); } @@ -680,7 +683,7 @@ namespace DB { auto nested_type = assert_cast(column_type.get())->getDictionaryType(); const auto * lc_column = assert_cast(column.get()); - const auto & nested_column = lc_column->getDictionaryPtr(); + const auto & nested_column = lc_column->getDictionary().getNestedColumn(); const auto & indexes_column = lc_column->getIndexesPtr(); return arrow::dictionary( getArrowTypeForLowCardinalityIndexes(indexes_column), diff --git a/tests/ci/cherry_pick.py b/tests/ci/cherry_pick.py index d8e4b8fe3a5..a0a77a2684e 100644 --- a/tests/ci/cherry_pick.py +++ b/tests/ci/cherry_pick.py @@ -211,7 +211,7 @@ Merge it only if you intend to backport changes to the target branch, otherwise "Assing to assignees of the original PR: %s", ", ".join(user.login for user in self.pr.assignees), ) - self.cherrypick_pr.add_to_assignees(self.pr.assignees) + self.cherrypick_pr.add_to_assignees(*self.pr.assignees) logging.info("Assign to the author of the original PR: %s", self.pr.user.login) self.cherrypick_pr.add_to_assignees(self.pr.user) @@ -249,7 +249,7 @@ Merge it only if you intend to backport changes to the target branch, otherwise "Assing to assignees of the original PR: %s", ", ".join(user.login for user in self.pr.assignees), ) - self.cherrypick_pr.add_to_assignees(self.pr.assignees) + 
self.cherrypick_pr.add_to_assignees(*self.pr.assignees) logging.info("Assign to the author of the original PR: %s", self.pr.user.login) self.backport_pr.add_to_assignees(self.pr.user) diff --git a/tests/ci/push_to_artifactory.py b/tests/ci/push_to_artifactory.py index 08d0a67f87b..6b407eb5bd8 100755 --- a/tests/ci/push_to_artifactory.py +++ b/tests/ci/push_to_artifactory.py @@ -98,7 +98,7 @@ class Packages: class S3: template = ( - f"{S3_DOWNLOAD}" + f"{S3_DOWNLOAD}/" # "clickhouse-builds/" f"{S3_BUILDS_BUCKET}/" # "33333/" or "21.11/" from --release, if pull request is omitted diff --git a/tests/performance/arrow_format.xml b/tests/performance/arrow_format.xml new file mode 100644 index 00000000000..e501e38a18e --- /dev/null +++ b/tests/performance/arrow_format.xml @@ -0,0 +1,10 @@ + + CREATE TABLE test (uint32 UInt32, n_uint32 Nullable(UInt32), lc LowCardinality(String)) ENGINE=File(Arrow) SETTINGS output_format_arrow_low_cardinality_as_dictionary=1 + insert into test select number, number, toString(number % 10000) from numbers(10000000) + + SELECT uint32 from test format Null + SELECT n_uint32 from test format Null + SELECT lc from test format Null + + DROP TABLE IF EXISTS test + diff --git a/tests/queries/0_stateless/00602_throw_if.reference b/tests/queries/0_stateless/00602_throw_if.reference index ad5aaee89a8..de9ff2dd625 100644 --- a/tests/queries/0_stateless/00602_throw_if.reference +++ b/tests/queries/0_stateless/00602_throw_if.reference @@ -1,3 +1,13 @@ 1 1 -1000000 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1000 diff --git a/tests/queries/0_stateless/00602_throw_if.sh b/tests/queries/0_stateless/00602_throw_if.sh index 1e8850028c4..7d948fa6816 100755 --- a/tests/queries/0_stateless/00602_throw_if.sh +++ b/tests/queries/0_stateless/00602_throw_if.sh @@ -4,9 +4,43 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . 
"$CURDIR"/../shell_config.sh -default_exception_message="Value passed to 'throwIf' function is non zero" -custom_exception_message="Number equals 1000000" -${CLICKHOUSE_CLIENT} --server_logs_file /dev/null --query="SELECT throwIf(number = 1000000) FROM system.numbers" 2>&1 | grep -cF "$default_exception_message" -${CLICKHOUSE_CLIENT} --server_logs_file /dev/null --query="SELECT throwIf(number = 1000000, '$custom_exception_message') FROM system.numbers" 2>&1 | grep -v '^(query: ' | grep -cF "$custom_exception_message" -${CLICKHOUSE_CLIENT} --server_logs_file /dev/null --query="SELECT sum(x = 0) FROM (SELECT throwIf(number = 1000000) AS x FROM numbers(1000000))" 2>&1 +default_exception_message="Value passed to 'throwIf' function is non-zero" +custom_exception_message="Number equals 1000" + +${CLICKHOUSE_CLIENT} --server_logs_file /dev/null --query="SELECT throwIf(number = 1000) FROM system.numbers" 2>&1 \ + | grep -cF "$default_exception_message" + +${CLICKHOUSE_CLIENT} --server_logs_file /dev/null --query="SELECT throwIf(number = 1000, '$custom_exception_message') FROM system.numbers" 2>&1 \ + | grep -v '^(query: ' | grep -cF "$custom_exception_message" + + +# Custom error code arguments are not enabled via configuration. +${CLICKHOUSE_CLIENT} --server_logs_file /dev/null --query="SELECT throwIf(number = 1000, '$custom_exception_message', 1) FROM system.numbers" 2>&1 \ + | grep -v '^(query: ' | grep -c "Number of arguments for function throwIf doesn't match: passed 3, should be 1 or 2" + +# Custom error code argument enabled but using the wrong type. +${CLICKHOUSE_CLIENT} --server_logs_file /dev/null --query="SELECT throwIf(number = 1000, '$custom_exception_message', 1) FROM system.numbers SETTINGS allow_custom_error_code_in_throwif=true" 2>&1 \ + | grep -v '^(query: ' | grep -c "Third argument of function throwIf must be Int8, Int16 or Int32 (passed: UInt8)" + + +# Normal error code + some weird ones. +# Internal error codes use the upper half of 32-bit int. 
+custom_error_codes=( + "42" + "0" # OK + "101" # UNEXPECTED_PACKET_FROM_CLIENT (interpreted by client) + "102" # UNEXPECTED_PACKET_FROM_SERVER (interpreted by client) + "1001" # STD_EXCEPTION + "1002" # UNKNOWN_EXCEPTION + "999999" # Unused error code. + "-1") # Also unused. Weird but we should allow throwing negative errors. + +for ec in "${custom_error_codes[@]}" +do + ${CLICKHOUSE_CLIENT} --server_logs_file /dev/null --query="SELECT throwIf(number = 1000, '$custom_exception_message', toInt32($ec)) FROM system.numbers SETTINGS allow_custom_error_code_in_throwif=true" 2>&1 \ + | grep -v '^(query: ' | grep -c "Code: $ec.*$custom_exception_message" +done + + +${CLICKHOUSE_CLIENT} --server_logs_file /dev/null --query="SELECT sum(x = 0) FROM (SELECT throwIf(number = 1000) AS x FROM numbers(1000))" 2>&1 diff --git a/tests/queries/0_stateless/00995_exception_while_insert.sh b/tests/queries/0_stateless/00995_exception_while_insert.sh index 28351078df3..e0cd264a2b7 100755 --- a/tests/queries/0_stateless/00995_exception_while_insert.sh +++ b/tests/queries/0_stateless/00995_exception_while_insert.sh @@ -10,6 +10,6 @@ $CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS check;" $CLICKHOUSE_CLIENT --query="CREATE TABLE check (x UInt64, y UInt64 DEFAULT throwIf(x > 1500000)) ENGINE = Memory;" -seq 1 2000000 | $CLICKHOUSE_CLIENT --query="INSERT INTO check(x) FORMAT TSV" 2>&1 | grep -q "Value passed to 'throwIf' function is non zero." && echo 'OK' || echo 'FAIL' ||: +seq 1 2000000 | $CLICKHOUSE_CLIENT --query="INSERT INTO check(x) FORMAT TSV" 2>&1 | grep -q "Value passed to 'throwIf' function is non-zero." 
&& echo 'OK' || echo 'FAIL' ||: $CLICKHOUSE_CLIENT --query="DROP TABLE check;" diff --git a/tests/queries/0_stateless/02383_arrow_dict_special_cases.reference b/tests/queries/0_stateless/02383_arrow_dict_special_cases.reference new file mode 100644 index 00000000000..61b28e429f3 --- /dev/null +++ b/tests/queries/0_stateless/02383_arrow_dict_special_cases.reference @@ -0,0 +1,32 @@ +dict LowCardinality(Nullable(String)) +one +two +three +one +two +dict LowCardinality(Nullable(String)) + +one +two +three + +one +three +dict LowCardinality(Nullable(String)) +one +two + +three +one +two +three +lc LowCardinality(Nullable(String)) +OK +dict LowCardinality(Nullable(String)) +one +two + +three +one +\N +three diff --git a/tests/queries/0_stateless/02383_arrow_dict_special_cases.sh b/tests/queries/0_stateless/02383_arrow_dict_special_cases.sh new file mode 100755 index 00000000000..86f95873f14 --- /dev/null +++ b/tests/queries/0_stateless/02383_arrow_dict_special_cases.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') + +mkdir -p $USER_FILES_PATH/test_02383 +cp $CURDIR/data_arrow/dictionary*.arrow $USER_FILES_PATH/test_02383/ +cp $CURDIR/data_arrow/corrupted.arrow $USER_FILES_PATH/test_02383/ +cp $CURDIR/data_arrow/dict_with_nulls.arrow $USER_FILES_PATH/test_02383/ + +$CLICKHOUSE_CLIENT -q "desc file('test_02383/dictionary1.arrow')" +$CLICKHOUSE_CLIENT -q "select * from file('test_02383/dictionary1.arrow')" +$CLICKHOUSE_CLIENT -q "desc file('test_02383/dictionary2.arrow')" +$CLICKHOUSE_CLIENT -q "select * from file('test_02383/dictionary2.arrow')" +$CLICKHOUSE_CLIENT -q "desc file('test_02383/dictionary3.arrow')" +$CLICKHOUSE_CLIENT -q "select * from file('test_02383/dictionary3.arrow')" + +$CLICKHOUSE_CLIENT -q "desc file('test_02383/corrupted.arrow')" +$CLICKHOUSE_CLIENT -q "select * from file('test_02383/corrupted.arrow')" 2>&1 | grep -F -q "INCORRECT_DATA" && echo OK || echo FAIL + +$CLICKHOUSE_CLIENT -q "desc file('test_02383/dict_with_nulls.arrow')" +$CLICKHOUSE_CLIENT -q "select * from file('test_02383/dict_with_nulls.arrow')" + + +rm -rf $USER_FILES_PATH/test_02383 diff --git a/tests/queries/0_stateless/02384_nullable_low_cardinality_as_dict_in_arrow.reference b/tests/queries/0_stateless/02384_nullable_low_cardinality_as_dict_in_arrow.reference new file mode 100644 index 00000000000..e88b6d60a6b --- /dev/null +++ b/tests/queries/0_stateless/02384_nullable_low_cardinality_as_dict_in_arrow.reference @@ -0,0 +1,4 @@ +lc LowCardinality(Nullable(String)) +abc +lc LowCardinality(Nullable(String)) +abc diff --git a/tests/queries/0_stateless/02384_nullable_low_cardinality_as_dict_in_arrow.sql b/tests/queries/0_stateless/02384_nullable_low_cardinality_as_dict_in_arrow.sql new file mode 100644 index 00000000000..975e7fb8826 --- /dev/null +++ 
b/tests/queries/0_stateless/02384_nullable_low_cardinality_as_dict_in_arrow.sql @@ -0,0 +1,8 @@ +-- Tags: no-fasttest + +insert into function file(02384_data.arrow) select toLowCardinality(toNullable('abc')) as lc settings output_format_arrow_low_cardinality_as_dictionary=1, output_format_arrow_string_as_string=0, engine_file_truncate_on_insert=1; +desc file(02384_data.arrow); +select * from file(02384_data.arrow); +insert into function file(02384_data.arrow) select toLowCardinality(toNullable('abc')) as lc settings output_format_arrow_low_cardinality_as_dictionary=1, output_format_arrow_string_as_string=1, engine_file_truncate_on_insert=1; +desc file(02384_data.arrow); +select * from file(02384_data.arrow); diff --git a/tests/queries/0_stateless/data_arrow/corrupted.arrow b/tests/queries/0_stateless/data_arrow/corrupted.arrow new file mode 100644 index 00000000000..a31a6eb93dc Binary files /dev/null and b/tests/queries/0_stateless/data_arrow/corrupted.arrow differ diff --git a/tests/queries/0_stateless/data_arrow/dict_with_nulls.arrow b/tests/queries/0_stateless/data_arrow/dict_with_nulls.arrow new file mode 100644 index 00000000000..546deaf97a8 Binary files /dev/null and b/tests/queries/0_stateless/data_arrow/dict_with_nulls.arrow differ diff --git a/tests/queries/0_stateless/data_arrow/dictionary1.arrow b/tests/queries/0_stateless/data_arrow/dictionary1.arrow new file mode 100644 index 00000000000..5fa1ed90729 Binary files /dev/null and b/tests/queries/0_stateless/data_arrow/dictionary1.arrow differ diff --git a/tests/queries/0_stateless/data_arrow/dictionary2.arrow b/tests/queries/0_stateless/data_arrow/dictionary2.arrow new file mode 100644 index 00000000000..1aed5591c9b Binary files /dev/null and b/tests/queries/0_stateless/data_arrow/dictionary2.arrow differ diff --git a/tests/queries/0_stateless/data_arrow/dictionary3.arrow b/tests/queries/0_stateless/data_arrow/dictionary3.arrow new file mode 100644 index 00000000000..55444337999 Binary files /dev/null and 
b/tests/queries/0_stateless/data_arrow/dictionary3.arrow differ