null_as_default for Values

This commit is contained in:
Alexander Tokmakov 2019-10-16 22:52:00 +03:00
parent 102cbfd25a
commit e9e8d055af
7 changed files with 112 additions and 52 deletions

View File

@ -77,7 +77,6 @@ protected:
private:
Params params;
Stopwatch total_stopwatch {CLOCK_MONOTONIC_COARSE};
size_t total_rows = 0;
size_t num_errors = 0;

View File

@ -151,7 +151,7 @@ private:
{
info.special_parser.is_array = true;
info.type = applyVisitor(FieldToDataType(), info.literal->value);
auto nested_type = dynamic_cast<const DataTypeArray &>(*info.type).getNestedType();
auto nested_type = assert_cast<const DataTypeArray &>(*info.type).getNestedType();
/// It can be Array(Nullable(nested_type))
bool array_of_nullable = false;
@ -212,9 +212,9 @@ private:
/// E.g. template of "position('some string', 'other string') != 0" is
/// ["position", "(", DataTypeString, ",", DataTypeString, ")", "!=", DataTypeUInt64]
ConstantExpressionTemplate::TemplateStructure::TemplateStructure(LiteralsInfo & replaced_literals, TokenIterator expression_begin, TokenIterator expression_end,
ASTPtr & expression, const IDataType & result_type, const Context & context)
ASTPtr & expression, const IDataType & result_type, bool null_as_default_, const Context & context)
{
null_as_default = null_as_default_;
std::sort(replaced_literals.begin(), replaced_literals.end(), [](const LiteralInfo & a, const LiteralInfo & b)
{
@ -252,16 +252,17 @@ ConstantExpressionTemplate::TemplateStructure::TemplateStructure(LiteralsInfo &
++prev_end;
}
addNodesToCastResult(result_type, expression);
result_column_name = expression->getColumnName();
addNodesToCastResult(result_type, expression, null_as_default);
auto syntax_result = SyntaxAnalyzer(context).analyze(expression, literals.getNamesAndTypesList());
result_column_name = expression->getColumnName();
actions_on_literals = ExpressionAnalyzer(expression, syntax_result, context).getActions(false);
}
size_t ConstantExpressionTemplate::TemplateStructure::getTemplateHash(const ASTPtr & expression,
const LiteralsInfo & replaced_literals,
const DataTypePtr & result_column_type,
bool null_as_default,
const String & salt)
{
/// TODO distinguish expressions with the same AST and different tokens (e.g. "CAST(expr, 'Type')" and "CAST(expr AS Type)")
@ -272,6 +273,7 @@ size_t ConstantExpressionTemplate::TemplateStructure::getTemplateHash(const ASTP
for (const auto & info : replaced_literals)
hash_state.update(info.type->getName());
hash_state.update(null_as_default);
/// Allows distinguishing the expression in the last column in Values format
hash_state.update(salt);
@ -288,6 +290,7 @@ size_t ConstantExpressionTemplate::TemplateStructure::getTemplateHash(const ASTP
ConstantExpressionTemplate::TemplateStructurePtr
ConstantExpressionTemplate::Cache::getFromCacheOrConstruct(const DataTypePtr & result_column_type,
bool null_as_default,
TokenIterator expression_begin,
TokenIterator expression_end,
const ASTPtr & expression_,
@ -298,17 +301,18 @@ ConstantExpressionTemplate::Cache::getFromCacheOrConstruct(const DataTypePtr & r
TemplateStructurePtr res;
ASTPtr expression = expression_->clone();
ReplaceLiteralsVisitor visitor(context);
visitor.visit(expression, result_column_type->isNullable());
visitor.visit(expression, result_column_type->isNullable() || null_as_default);
ReplaceQueryParameterVisitor param_visitor(context.getQueryParameters());
param_visitor.visit(expression);
size_t template_hash = TemplateStructure::getTemplateHash(expression, visitor.replaced_literals, result_column_type, salt);
size_t template_hash = TemplateStructure::getTemplateHash(expression, visitor.replaced_literals, result_column_type, null_as_default, salt);
auto iter = cache.find(template_hash);
if (iter == cache.end())
{
if (max_size <= cache.size())
cache.clear();
res = std::make_shared<TemplateStructure>(visitor.replaced_literals, expression_begin, expression_end, expression, *result_column_type, context);
res = std::make_shared<TemplateStructure>(visitor.replaced_literals, expression_begin, expression_end,
expression, *result_column_type, null_as_default, context);
cache.insert({template_hash, res});
if (found_in_cache)
*found_in_cache = false;
@ -416,7 +420,7 @@ bool ConstantExpressionTemplate::parseLiteralAndAssertType(ReadBuffer & istr, co
const Field & array = ast->as<ASTLiteral &>().value;
auto array_type = applyVisitor(FieldToDataType(), array);
auto nested_type = dynamic_cast<const DataTypeArray &>(*array_type).getNestedType();
auto nested_type = assert_cast<const DataTypeArray &>(*array_type).getNestedType();
if (type_info.is_nullable)
if (auto nullable = dynamic_cast<const DataTypeNullable *>(nested_type.get()))
nested_type = nullable->getNestedType();
@ -488,7 +492,7 @@ bool ConstantExpressionTemplate::parseLiteralAndAssertType(ReadBuffer & istr, co
}
}
ColumnPtr ConstantExpressionTemplate::evaluateAll()
ColumnPtr ConstantExpressionTemplate::evaluateAll(BlockMissingValues & nulls, size_t column_idx, size_t offset)
{
Block evaluated = structure->literals.cloneWithColumns(std::move(columns));
columns = structure->literals.cloneEmptyColumns();
@ -506,23 +510,40 @@ ColumnPtr ConstantExpressionTemplate::evaluateAll()
ErrorCodes::LOGICAL_ERROR);
rows_count = 0;
return evaluated.getByName(structure->result_column_name).column->convertToFullColumnIfConst();
ColumnPtr res = evaluated.getByName(structure->result_column_name).column->convertToFullColumnIfConst();
if (!structure->null_as_default)
return res;
/// Extract column with evaluated expression and mask for NULLs
auto & tuple = assert_cast<const ColumnTuple &>(*res);
if (tuple.tupleSize() != 2)
throw Exception("Invalid tuple size, it'a a bug", ErrorCodes::LOGICAL_ERROR);
auto & is_null = assert_cast<const ColumnUInt8 &>(tuple.getColumn(1));
for (size_t i = 0; i < is_null.size(); ++i)
if (is_null.getUInt(i))
nulls.setBit(column_idx, offset + i);
return tuple.getColumnPtr(0);
}
/// Rewrites `expr` in place so that the evaluated template yields a value of the result column type.
/// NOTE(review): this listing is a rendered diff, so the pre-change and post-change lines of this function
/// are interleaved below; the description follows the lines that use `null_as_default`.
///  - null_as_default == false: `expr` becomes CAST(expr, 'TypeName'), where 'TypeName' is result_column_type.getName().
///  - null_as_default == true: `expr` is given the alias `_expression` and wrapped in assumeNotNull before the CAST,
///    and the final expression is tuple(CAST(...), isNull(_expression)).
///    evaluateAll reads the second tuple element as the mask of NULL values and returns the first one as the column.
void ConstantExpressionTemplate::TemplateStructure::addNodesToCastResult(const IDataType & result_column_type, ASTPtr & expr)
void ConstantExpressionTemplate::TemplateStructure::addNodesToCastResult(const IDataType & result_column_type, ASTPtr & expr, bool null_as_default)
{
auto result_type = std::make_shared<ASTLiteral>(result_column_type.getName());
/// Replace "expr" with "CAST(expr, 'TypeName')"
/// or with "(CAST(assumeNotNull(expr as _expression), 'TypeName'), isNull(_expression))" if null_as_default is true
if (null_as_default)
{
/// Name the original expression so that isNull(_expression) below can refer to it.
expr->setAlias("_expression");
expr = makeASTFunction("assumeNotNull", std::move(expr));
}
auto arguments = std::make_shared<ASTExpressionList>();
arguments->children.push_back(std::move(expr));
arguments->children.push_back(std::move(result_type));
expr = makeASTFunction("CAST", std::move(expr), std::make_shared<ASTLiteral>(result_column_type.getName()));
auto cast = std::make_shared<ASTFunction>();
cast->name = "CAST";
cast->arguments = std::move(arguments);
cast->children.push_back(cast->arguments);
expr = std::move(cast);
if (null_as_default)
{
/// Pair the converted value with a flag telling whether the original expression was NULL.
auto is_null = makeASTFunction("isNull", std::make_shared<ASTIdentifier>("_expression"));
expr = makeASTFunction("tuple", std::move(expr), std::move(is_null));
}
}
}

View File

@ -21,10 +21,11 @@ class ConstantExpressionTemplate : boost::noncopyable
struct TemplateStructure : boost::noncopyable
{
TemplateStructure(LiteralsInfo & replaced_literals, TokenIterator expression_begin, TokenIterator expression_end,
ASTPtr & expr, const IDataType & result_type, const Context & context);
ASTPtr & expr, const IDataType & result_type, bool null_as_default_, const Context & context);
static void addNodesToCastResult(const IDataType & result_column_type, ASTPtr & expr);
static size_t getTemplateHash(const ASTPtr & expression, const LiteralsInfo & replaced_literals, const DataTypePtr & result_column_type, const String & salt);
static void addNodesToCastResult(const IDataType & result_column_type, ASTPtr & expr, bool null_as_default);
static size_t getTemplateHash(const ASTPtr & expression, const LiteralsInfo & replaced_literals,
const DataTypePtr & result_column_type, bool null_as_default, const String & salt);
String result_column_name;
@ -35,6 +36,7 @@ class ConstantExpressionTemplate : boost::noncopyable
ExpressionActionsPtr actions_on_literals;
std::vector<SpecialParserType> special_parser;
bool null_as_default;
};
public:
@ -50,6 +52,7 @@ public:
/// Deduce template of expression of type result_column_type and add it to cache (or use template from cache)
TemplateStructurePtr getFromCacheOrConstruct(const DataTypePtr & result_column_type,
bool null_as_default,
TokenIterator expression_begin,
TokenIterator expression_end,
const ASTPtr & expression_,
@ -65,8 +68,9 @@ public:
/// and parse literals into temporary columns
bool parseExpression(ReadBuffer & istr, const FormatSettings & settings);
/// Evaluate batch of expressions were parsed using template
ColumnPtr evaluateAll();
/// Evaluate the batch of expressions that were parsed using the template.
/// If the template was deduced with null_as_default == true, set bits in nulls for NULL values in column_idx, starting from offset.
ColumnPtr evaluateAll(BlockMissingValues & nulls, size_t column_idx, size_t offset = 0);
size_t rowsCount() const { return rows_count; }

View File

@ -11,6 +11,7 @@
#include <Common/typeid_cast.h>
#include <common/find_symbols.h>
#include <Parsers/ASTLiteral.h>
#include <DataTypes/DataTypeNullable.h>
namespace DB
@ -47,6 +48,7 @@ Chunk ValuesBlockInputFormat::generate()
{
const Block & header = getPort().getHeader();
MutableColumns columns = header.cloneEmptyColumns();
block_missing_values.clear();
for (size_t rows_in_block = 0; rows_in_block < params.max_block_size; ++rows_in_block)
{
@ -55,7 +57,7 @@ Chunk ValuesBlockInputFormat::generate()
skipWhitespaceIfAny(buf);
if (buf.eof() || *buf.position() == ';')
break;
readRow(columns);
readRow(columns, rows_in_block);
if (params.callback)
params.callback();
}
@ -73,10 +75,10 @@ Chunk ValuesBlockInputFormat::generate()
if (!templates[i] || !templates[i]->rowsCount())
continue;
if (columns[i]->empty())
columns[i] = std::move(*templates[i]->evaluateAll()).mutate();
columns[i] = std::move(*templates[i]->evaluateAll(block_missing_values, i)).mutate();
else
{
ColumnPtr evaluated = templates[i]->evaluateAll();
ColumnPtr evaluated = templates[i]->evaluateAll(block_missing_values, i, columns[i]->size());
columns[i]->insertRangeFrom(*evaluated, 0, evaluated->size());
}
}
@ -91,7 +93,7 @@ Chunk ValuesBlockInputFormat::generate()
return Chunk{std::move(columns), rows_in_block};
}
void ValuesBlockInputFormat::readRow(MutableColumns & columns)
void ValuesBlockInputFormat::readRow(MutableColumns & columns, size_t row_num)
{
assertChar('(', buf);
@ -99,17 +101,22 @@ void ValuesBlockInputFormat::readRow(MutableColumns & columns)
{
skipWhitespaceIfAny(buf);
PeekableReadBufferCheckpoint checkpoint{buf};
bool read;
/// Parse value using fast streaming parser for literals and slow SQL parser for expressions.
/// If there is an SQL expression in some row, the template of this expression will be deduced,
/// which makes it possible to parse the following rows much faster
/// if expressions in the following rows have the same structure
if (parser_type_for_column[column_idx] == ParserType::Streaming)
tryReadValue(*columns[column_idx], column_idx);
read = tryReadValue(*columns[column_idx], column_idx);
else if (parser_type_for_column[column_idx] == ParserType::BatchTemplate)
tryParseExpressionUsingTemplate(columns[column_idx], column_idx);
read = tryParseExpressionUsingTemplate(columns[column_idx], column_idx);
else /// if (parser_type_for_column[column_idx] == ParserType::SingleExpressionEvaluation)
parseExpression(*columns[column_idx], column_idx);
read = parseExpression(*columns[column_idx], column_idx);
if (!read)
block_missing_values.setBit(column_idx, row_num);
/// Even if read is true, the value may still be missing. The bit mask for such values will be copied from ConstantExpressionTemplate later.
}
skipWhitespaceIfAny(buf);
@ -119,22 +126,22 @@ void ValuesBlockInputFormat::readRow(MutableColumns & columns)
++total_rows;
}
void ValuesBlockInputFormat::tryParseExpressionUsingTemplate(MutableColumnPtr & column, size_t column_idx)
bool ValuesBlockInputFormat::tryParseExpressionUsingTemplate(MutableColumnPtr & column, size_t column_idx)
{
/// Try to parse expression using template if one was successfully deduced while parsing the first row
if (templates[column_idx]->parseExpression(buf, format_settings))
{
++rows_parsed_using_template[column_idx];
return;
return true;
}
/// Expression in the current row does not match the template deduced on the first row.
/// Evaluate expressions, which were parsed using this template.
if (column->empty())
column = std::move(*templates[column_idx]->evaluateAll()).mutate();
column = std::move(*templates[column_idx]->evaluateAll(block_missing_values, column_idx)).mutate();
else
{
ColumnPtr evaluated = templates[column_idx]->evaluateAll();
ColumnPtr evaluated = templates[column_idx]->evaluateAll(block_missing_values, column_idx, column->size());
column->insertRangeFrom(*evaluated, 0, evaluated->size());
}
/// Do not use this template anymore
@ -142,19 +149,25 @@ void ValuesBlockInputFormat::tryParseExpressionUsingTemplate(MutableColumnPtr &
buf.rollbackToCheckpoint();
/// It will deduce new template or fallback to slow SQL parser
parseExpression(*column, column_idx);
return parseExpression(*column, column_idx);
}
void ValuesBlockInputFormat::tryReadValue(IColumn & column, size_t column_idx)
bool ValuesBlockInputFormat::tryReadValue(IColumn & column, size_t column_idx)
{
bool rollback_on_exception = false;
try
{
types[column_idx]->deserializeAsTextQuoted(column, buf, format_settings);
bool read = true;
const auto & type = types[column_idx];
if (format_settings.null_as_default && !type->isNullable())
read = DataTypeNullable::deserializeTextQuoted(column, buf, format_settings, type);
else
type->deserializeAsTextQuoted(column, buf, format_settings);
rollback_on_exception = true;
skipWhitespaceIfAny(buf);
assertDelimiterAfterValue(column_idx);
return read;
}
catch (const Exception & e)
{
@ -166,12 +179,11 @@ void ValuesBlockInputFormat::tryReadValue(IColumn & column, size_t column_idx)
/// Switch to SQL parser and don't try to use streaming parser for complex expressions
/// Note: Throwing exceptions for each expression may be very slow because of stacktraces
buf.rollbackToCheckpoint();
parseExpression(column, column_idx);
return parseExpression(column, column_idx);
}
}
void
ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx)
bool ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx)
{
const Block & header = getPort().getHeader();
const IDataType & type = *header.getByPosition(column_idx).type;
@ -223,7 +235,7 @@ ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx)
if (ok)
{
parser_type_for_column[column_idx] = ParserType::Streaming;
return;
return true;
}
else if (rollback_on_exception)
column.popBack(1);
@ -243,7 +255,8 @@ ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx)
bool found_in_cache = false;
const auto & result_type = header.getByPosition(column_idx).type;
const char * delimiter = (column_idx + 1 == num_columns) ? ")" : ",";
auto structure = templates_cache.getFromCacheOrConstruct(result_type, TokenIterator(tokens), token_iterator,
auto structure = templates_cache.getFromCacheOrConstruct(result_type, format_settings.null_as_default,
TokenIterator(tokens), token_iterator,
ast, *context, &found_in_cache, delimiter);
templates[column_idx].emplace(structure);
if (found_in_cache)
@ -256,7 +269,7 @@ ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx)
{
++rows_parsed_using_template[column_idx];
parser_type_for_column[column_idx] = ParserType::BatchTemplate;
return;
return true;
}
}
catch (...)
@ -290,6 +303,11 @@ ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx)
/// Check that we are indeed allowed to insert a NULL.
if (value.isNull() && !type.isNullable())
{
if (format_settings.null_as_default)
{
type.insertDefaultInto(column);
return false;
}
buf.rollbackToCheckpoint();
throw Exception{"Cannot insert NULL value into a column of type '" + type.getName() + "'"
+ " at: " +
@ -298,6 +316,7 @@ ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx)
}
column.insert(value);
return true;
}
/// Can be used in fileSegmentationEngine for parallel parsing of Values

View File

@ -33,6 +33,8 @@ public:
String getName() const override { return "ValuesBlockInputFormat"; }
const BlockMissingValues & getMissingValues() const override { return block_missing_values; }
private:
enum class ParserType
{
@ -45,11 +47,11 @@ private:
Chunk generate() override;
void readRow(MutableColumns & columns);
void readRow(MutableColumns & columns, size_t row_num);
void tryParseExpressionUsingTemplate(MutableColumnPtr & column, size_t column_idx);
ALWAYS_INLINE inline void tryReadValue(IColumn & column, size_t column_idx);
void parseExpression(IColumn & column, size_t column_idx);
bool tryParseExpressionUsingTemplate(MutableColumnPtr & column, size_t column_idx);
ALWAYS_INLINE inline bool tryReadValue(IColumn & column, size_t column_idx);
bool parseExpression(IColumn & column, size_t column_idx);
ALWAYS_INLINE inline void assertDelimiterAfterValue(size_t column_idx);
ALWAYS_INLINE inline bool checkDelimiterAfterValue(size_t column_idx);
@ -81,6 +83,8 @@ private:
ConstantExpressionTemplate::Cache templates_cache;
DataTypes types;
BlockMissingValues block_missing_values;
};
}

View File

@ -23,3 +23,8 @@ Template (Quoted)
1 world 3 2019-07-23 [1,2,3] ('tuple',3.14)
2 Hello 123 2019-06-19 [] ('test',2.71828)
3 Hello 42 2019-06-19 [1,2,3] ('default',0.75)
Values
0 1 42 2019-07-22 [10,20,30] ('default',0)
1 world 3 2019-07-23 [1,2,3] ('tuple',3.14)
2 Hello 123 2019-06-19 [] ('test',2.71828)
3 Hello 42 2019-06-19 [1,2,3] ('default',0.75)

View File

@ -44,4 +44,12 @@ echo 'NULL, '\''1'\'', null, '\''2019-07-22'\'', [10, 20, 30], NuLl
2, null, 123, null, [], ('\''test'\'', 2.71828)
3, null, null, null, null, null' | $CLICKHOUSE_CLIENT --input_format_null_as_default=1 --format_custom_escaping_rule=Quoted --format_custom_field_delimiter=', ' --query="INSERT INTO null_as_default FORMAT CustomSeparated";
$CLICKHOUSE_CLIENT --query="SELECT * FROM null_as_default ORDER BY i";
$CLICKHOUSE_CLIENT --query="TRUNCATE TABLE null_as_default";
echo 'Values'
echo '(NULL, '\''1'\'', (null), '\''2019-07-22'\'', ([10, 20, 30]), (NuLl)),
(1, '\''world'\'', (3), '\''2019-07-23'\'', (NULL), (('\''tuple'\'', 3.14))),
(2, null, (123), null, ([]), (('\''test'\'', 2.71828))),
(3, null, (null), null, (null), (null))' | $CLICKHOUSE_CLIENT --input_format_null_as_default=1 --input_format_values_deduce_templates_of_expressions=1 --query="INSERT INTO null_as_default VALUES";
$CLICKHOUSE_CLIENT --query="SELECT * FROM null_as_default ORDER BY i";
$CLICKHOUSE_CLIENT --query="DROP TABLE null_as_default";