#include #include #include #include #include #include #include namespace DB { Set::Type Set::chooseMethod(Columns & key_columns, bool & keys_fit_128_bits, Sizes & key_sizes) { size_t keys_size = key_columns.size(); keys_fit_128_bits = true; size_t keys_bytes = 0; key_sizes.resize(keys_size); for (size_t j = 0; j < keys_size; ++j) { if (!key_columns[j]->isNumeric()) { keys_fit_128_bits = false; break; } key_sizes[j] = key_columns[j]->sizeOfField(); keys_bytes += key_sizes[j]; } if (keys_bytes > 16) keys_fit_128_bits = false; /// Если есть один ключ, который помещается в 64 бита, и это не число с плавающей запятой if (keys_size == 1 && key_columns[0]->isNumeric() && !dynamic_cast(&*key_columns[0]) && !dynamic_cast(&*key_columns[0])) return KEY_64; /// Если есть один строковый ключ, то используем хэш-таблицу с ним if (keys_size == 1 && (dynamic_cast(&*key_columns[0]) || dynamic_cast(&*key_columns[0]))) return KEY_STRING; /// Если много ключей - будем строить множество хэшей от них return HASHED; } void Set::create(BlockInputStreamPtr stream) { LOG_TRACE(log, "Creating set"); Stopwatch watch; size_t entries = 0; /// Читаем все данные while (Block block = stream->read()) { size_t keys_size = block.columns(); Row key(keys_size); Columns key_columns(keys_size); data_types.resize(keys_size); /// Запоминаем столбцы, с которыми будем работать for (size_t i = 0; i < keys_size; ++i) { key_columns[i] = block.getByPosition(i).column; data_types[i] = block.getByPosition(i).type; } size_t rows = block.rows(); /// Какую структуру данных для множества использовать? bool keys_fit_128_bits = false; Sizes key_sizes; type = chooseMethod(key_columns, keys_fit_128_bits, key_sizes); if (type == KEY_64) { SetUInt64 & res = key64; const FieldVisitorToUInt64 visitor; IColumn & column = *key_columns[0]; /// Для всех строчек for (size_t i = 0; i < rows; ++i) { /// Строим ключ Field field = column[i]; UInt64 key = boost::apply_visitor(visitor, field); res.insert(key); } entries = res.size(); } else if (type == KEY_STRING) { SetString & res = key_string; IColumn & column = *key_columns[0]; if (const ColumnString * column_string = dynamic_cast(&column)) { const ColumnString::Offsets_t & offsets = column_string->getOffsets(); const ColumnUInt8::Container_t & data = dynamic_cast(column_string->getData()).getData(); /// Для всех строчек for (size_t i = 0; i < rows; ++i) { /// Строим ключ StringRef ref(&data[i == 0 ? 0 : offsets[i - 1]], (i == 0 ? offsets[i] : (offsets[i] - offsets[i - 1])) - 1); SetString::iterator it; bool inserted; res.emplace(ref, it, inserted); if (inserted) it->data = string_pool.insert(ref.data, ref.size); } } else if (const ColumnFixedString * column_string = dynamic_cast(&column)) { size_t n = column_string->getN(); const ColumnUInt8::Container_t & data = dynamic_cast(column_string->getData()).getData(); /// Для всех строчек for (size_t i = 0; i < rows; ++i) { /// Строим ключ StringRef ref(&data[i * n], n); SetString::iterator it; bool inserted; res.emplace(ref, it, inserted); if (inserted) it->data = string_pool.insert(ref.data, ref.size); } } else throw Exception("Illegal type of column when creating set with string key: " + column.getName(), ErrorCodes::ILLEGAL_COLUMN); entries = res.size(); } else if (type == HASHED) { SetHashed & res = hashed; const FieldVisitorToUInt64 to_uint64_visitor; /// Для всех строчек for (size_t i = 0; i < rows; ++i) res.insert(pack128(i, keys_fit_128_bits, keys_size, key, key_columns, key_sizes)); entries = res.size(); } else if (type == GENERIC) { /// Общий способ SetGeneric & res = generic; /// Для всех строчек for (size_t i = 0; i < rows; ++i) { /// Строим ключ for (size_t j = 0; j < keys_size; ++j) key[j] = (*key_columns[j])[i]; res.insert(key); } entries = res.size(); } else throw Exception("Unknown set variant.", ErrorCodes::UNKNOWN_SET_DATA_VARIANT); } logProfileInfo(watch, *stream, entries); } void Set::logProfileInfo(Stopwatch & watch, IBlockInputStream & in, size_t entries) { /// Выведем информацию о том, сколько считано строк и байт. size_t rows = 0; size_t bytes = 0; in.getLeafRowsBytes(rows, bytes); size_t head_rows = 0; if (IProfilingBlockInputStream * profiling_in = dynamic_cast(&in)) head_rows = profiling_in->getInfo().rows; if (rows != 0) { LOG_DEBUG(log, std::fixed << std::setprecision(3) << "Created set with " << entries << " entries from " << head_rows << " rows." << " Read " << rows << " rows, " << bytes / 1048576.0 << " MiB in " << watch.elapsedSeconds() << " sec., " << static_cast(rows / watch.elapsedSeconds()) << " rows/sec., " << bytes / 1048576.0 / watch.elapsedSeconds() << " MiB/sec."); } } void Set::execute(Block & block, const ColumnNumbers & arguments, size_t result, bool negative) const { LOG_TRACE(log, "Checking set membership for block."); ColumnUInt8 * c_res = new ColumnUInt8; block.getByPosition(result).column = c_res; ColumnUInt8::Container_t & vec_res = c_res->getData(); vec_res.resize(block.getByPosition(arguments[0]).column->size()); size_t keys_size = arguments.size(); Row key(keys_size); Columns key_columns(keys_size); /// Запоминаем столбцы, с которыми будем работать. Также проверим, что типы данных правильные. for (size_t i = 0; i < keys_size; ++i) { key_columns[i] = block.getByPosition(arguments[i]).column; if (data_types[i]->getName() != block.getByPosition(arguments[i]).type->getName()) throw Exception("Types in section IN doesn't match.", ErrorCodes::TYPE_MISMATCH); } size_t rows = block.rows(); /// Какую структуру данных для множества использовать? bool keys_fit_128_bits = false; Sizes key_sizes; if (type != chooseMethod(key_columns, keys_fit_128_bits, key_sizes)) throw Exception("Incompatible columns in IN section", ErrorCodes::INCOMPATIBLE_COLUMNS); if (type == KEY_64) { const SetUInt64 & set = key64; const FieldVisitorToUInt64 visitor; IColumn & column = *key_columns[0]; /// Для всех строчек for (size_t i = 0; i < rows; ++i) { /// Строим ключ Field field = column[i]; UInt64 key = boost::apply_visitor(visitor, field); vec_res[i] = negative ^ (set.end() != set.find(key)); } } else if (type == KEY_STRING) { const SetString & set = key_string; IColumn & column = *key_columns[0]; if (const ColumnString * column_string = dynamic_cast(&column)) { const ColumnString::Offsets_t & offsets = column_string->getOffsets(); const ColumnUInt8::Container_t & data = dynamic_cast(column_string->getData()).getData(); /// Для всех строчек for (size_t i = 0; i < rows; ++i) { /// Строим ключ StringRef ref(&data[i == 0 ? 0 : offsets[i - 1]], (i == 0 ? offsets[i] : (offsets[i] - offsets[i - 1])) - 1); vec_res[i] = negative ^ (set.end() != set.find(ref)); } } else if (const ColumnFixedString * column_string = dynamic_cast(&column)) { size_t n = column_string->getN(); const ColumnUInt8::Container_t & data = dynamic_cast(column_string->getData()).getData(); /// Для всех строчек for (size_t i = 0; i < rows; ++i) { /// Строим ключ StringRef ref(&data[i * n], n); vec_res[i] = negative ^ (set.end() != set.find(ref)); } } else throw Exception("Illegal type of column when creating set with string key: " + column.getName(), ErrorCodes::ILLEGAL_COLUMN); } else if (type == HASHED) { const SetHashed & set = hashed; const FieldVisitorToUInt64 to_uint64_visitor; /// Для всех строчек for (size_t i = 0; i < rows; ++i) vec_res[i] = negative ^ (set.end() != set.find(pack128(i, keys_fit_128_bits, keys_size, key, key_columns, key_sizes))); } else if (type == GENERIC) { /// Общий способ const SetGeneric & set = generic; /// Для всех строчек for (size_t i = 0; i < rows; ++i) { /// Строим ключ for (size_t j = 0; j < keys_size; ++j) key[j] = (*key_columns[j])[i]; vec_res[i] = negative ^ (set.end() != set.find(key)); } } else throw Exception("Unknown set variant.", ErrorCodes::UNKNOWN_SET_DATA_VARIANT); LOG_TRACE(log, "Checked set membership for block."); } }