Merge branch 'master' into fix-logging
Commit: 410eb44f0f
@@ -1,6 +0,0 @@
# ARM (AArch64) build works on Amazon Graviton, Oracle Cloud, Huawei Cloud ARM machines.
# The support for AArch64 is pre-production ready.

wget 'https://builds.clickhouse.com/master/aarch64/clickhouse'
chmod a+x ./clickhouse
sudo ./clickhouse install
@@ -1,3 +0,0 @@
fetch 'https://builds.clickhouse.com/master/freebsd/clickhouse'
chmod a+x ./clickhouse
su -m root -c './clickhouse install'
@@ -1,3 +0,0 @@
wget 'https://builds.clickhouse.com/master/macos-aarch64/clickhouse'
chmod a+x ./clickhouse
./clickhouse
@@ -1,3 +0,0 @@
wget 'https://builds.clickhouse.com/master/macos/clickhouse'
chmod a+x ./clickhouse
./clickhouse
@@ -43,7 +43,7 @@ sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test

For other Linux distributions, check the availability of LLVM's [prebuilt packages](https://releases.llvm.org/download.html).

As of April 2023, any version of Clang >= 15 will work.
-GCC as a compiler is not supported
GCC as a compiler is not supported.
To build with a specific Clang version:
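
For example, a minimal sketch (the `-16` version suffix is an assumption; substitute whichever Clang release you actually installed):

```bash
# Point CMake at a specific Clang toolchain before configuring the build.
export CC=clang-16
export CXX=clang++-16
```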

:::tip
@@ -114,18 +114,3 @@ mkdir build
cmake -S . -B build
cmake --build build
```

-## You Don’t Have to Build ClickHouse {#you-dont-have-to-build-clickhouse}

-ClickHouse is available in pre-built binaries and packages. Binaries are portable and can be run on any Linux flavour.

-The CI checks build the binaries on each commit to [ClickHouse](https://github.com/clickhouse/clickhouse/). To download them:

-1. Open the [commits list](https://github.com/ClickHouse/ClickHouse/commits/master)
-1. Choose a **Merge pull request** commit that includes the new feature, or was added after the new feature
-1. Click the status symbol (yellow dot, red x, green check) to open the CI check list
-1. Scroll through the list until you find **ClickHouse build check x/x artifact groups are OK**
-1. Click **Details**
-1. Find the type of package for your operating system that you need and download the files.

-![build artifact check](images/find-build-artifact.png)
@@ -28,23 +28,25 @@ The quickest and easiest way to get up and running with ClickHouse is to create

For production installs of a specific release version see the [installation options](#available-installation-options) down below.
:::

-On Linux and macOS:
On Linux, macOS and FreeBSD:

-1. If you are just getting started and want to see what ClickHouse can do, the simplest way to download ClickHouse locally is to run the following command. It downloads a single binary for your operating system that can be used to run the ClickHouse server, clickhouse-client, clickhouse-local,
-   ClickHouse Keeper, and other tools:
1. If you are just getting started and want to see what ClickHouse can do, the simplest way to download ClickHouse locally is to run the
   following command. It downloads a single binary for your operating system that can be used to run the ClickHouse server,
   clickhouse-client, clickhouse-local, ClickHouse Keeper, and other tools:

   ```bash
   curl https://clickhouse.com/ | sh
   ```

1. Run the following command to start the ClickHouse server:

   ```bash
   ./clickhouse server
   ```

   The first time you run this script, the necessary files and folders are created in the current directory, then the server starts.

-1. Open a new terminal and use the **clickhouse-client** to connect to your service:
1. Open a new terminal and use **./clickhouse client** to connect to your service:

   ```bash
   ./clickhouse client
@@ -330,7 +332,9 @@ For production environments, it’s recommended to use the latest `stable`-versi

To run ClickHouse inside Docker follow the guide on [Docker Hub](https://hub.docker.com/r/clickhouse/clickhouse-server/). Those images use official `deb` packages inside.
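
As a sketch, a minimal invocation of the official image might look like this (the container name and the ulimit value are assumptions; the Docker Hub guide covers ports, volumes, and configuration in detail):

```bash
# Start a detached ClickHouse server container using the official image.
docker run -d --name some-clickhouse-server --ulimit nofile=262144:262144 clickhouse/clickhouse-server
```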

-### From Sources {#from-sources}
## Non-Production Deployments (Advanced)

### Compile From Source {#from-sources}

To manually compile ClickHouse, follow the instructions for [Linux](/docs/en/development/build.md) or [macOS](/docs/en/development/build-osx.md).

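A condensed sketch of the usual flow (prerequisites such as the compiler and CMake setup are covered in the linked build guides; treat this as an outline rather than the authoritative procedure):

```bash
# Fetch the sources including submodules, then configure and build.
git clone --recursive https://github.com/ClickHouse/ClickHouse.git
cd ClickHouse
cmake -S . -B build
cmake --build build
```
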
@@ -346,8 +350,33 @@ You’ll need to create data and metadata folders manually and `chown` them for

On Gentoo, you can just use `emerge clickhouse` to install ClickHouse from sources.

-### From CI checks pre-built binaries
-ClickHouse binaries are built for each [commit](/docs/en/development/build.md#you-dont-have-to-build-clickhouse).

### Install a CI-generated Binary

ClickHouse's continuous integration (CI) infrastructure produces specialized builds for each commit in the [ClickHouse
repository](https://github.com/clickhouse/clickhouse/), e.g. [sanitized](https://github.com/google/sanitizers) builds, unoptimized (Debug)
builds, cross-compiled builds etc. While such builds are normally only useful during development, they can in certain situations also be
interesting for users.

:::note
Since ClickHouse's CI is evolving over time, the exact steps to download CI-generated builds may vary.
Also, CI may delete build artifacts that are too old, making them unavailable for download.
:::

For example, to download an aarch64 binary for ClickHouse v23.4, follow these steps:

- Find the GitHub pull request for release v23.4: [Release pull request for branch 23.4](https://github.com/ClickHouse/ClickHouse/pull/49238)
- Click "Commits", then click a commit similar to "Update autogenerated version to 23.4.2.1 and contributors" for the particular version you would like to install.
- Click the green check / yellow dot / red cross to open the list of CI checks.
- Click "Details" next to "ClickHouse Build Check" in the list; it opens a page similar to [this page](https://s3.amazonaws.com/clickhouse-test-reports/46793/b460eb70bf29b19eadd19a1f959b15d186705394/clickhouse_build_check/report.html)
- Find the rows with compiler = "clang-*-aarch64" (there are multiple rows).
- Download the artifacts for these builds.

To download binaries for very old x86-64 systems without [SSE3](https://en.wikipedia.org/wiki/SSE3) support or old ARM systems without
[ARMv8.1-A](https://en.wikipedia.org/wiki/AArch64#ARMv8.1-A) support, open a [pull
request](https://github.com/ClickHouse/ClickHouse/commits/master) and find the CI check "BuilderBinAmd64Compat" or
"BuilderBinAarch64V80Compat", respectively. Then click "Details", open the "Build" fold, scroll to the end, and find the message "Notice: Build URLs
https://s3.amazonaws.com/clickhouse/builds/PRs/.../.../binary_aarch64_v80compat/clickhouse". You can then click the link to download the
build.
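
As a sketch, once you have copied a build URL from that notice, the download follows the same pattern as the other binaries on this page (the URL placeholder below is hypothetical; paste the real link from the CI log):

```bash
# Replace the placeholder with the link from the "Notice: Build URLs ..." message.
wget '<URL from the "Notice: Build URLs" message>' -O clickhouse
chmod a+x ./clickhouse
./clickhouse local --query 'SELECT version()'
```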

## Launch {#launch}

File diff suppressed because it is too large.
@@ -577,7 +577,7 @@ Default value: 20

**Usage**

-The value of the `number_of_free_entries_in_pool_to_execute_mutation` setting should be less than the value of the [background_pool_size](/docs/en/operations/server-configuration-parameters/settings#background_pool_size) * [background_pool_size](/docs/en/operations/server-configuration-parameters/settings#background_merges_mutations_concurrency_ratio). Otherwise, ClickHouse throws an exception.
The value of the `number_of_free_entries_in_pool_to_execute_mutation` setting should be less than the value of the [background_pool_size](/docs/en/operations/server-configuration-parameters/settings.md/#background_pool_size) * [background_merges_mutations_concurrency_ratio](/docs/en/operations/server-configuration-parameters/settings.md/#background_merges_mutations_concurrency_ratio). Otherwise, ClickHouse throws an exception.
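
For example, assuming values of `background_pool_size = 16` and `background_merges_mutations_concurrency_ratio = 2` (an assumption for illustration), the product is 16 * 2 = 32, so the default of 20 for `number_of_free_entries_in_pool_to_execute_mutation` stays safely below that limit.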

## max_part_loading_threads {#max-part-loading-threads}

@@ -840,4 +840,4 @@ Possible values:

- `Always` or `Never`.

Default value: `Never`
@@ -560,6 +560,77 @@ Result:
└───────────────────────────┘
```

## Entropy-learned hashing (experimental)

Entropy-learned hashing is disabled by default. To enable it: `SET allow_experimental_hash_functions=1`.

Entropy-learned hashing is not a standalone hash function like `metroHash64`, `cityHash64`, `sipHash64` etc. Instead, it aims to preprocess
the data to be hashed in a way that a standalone hash function can be computed more efficiently while not compromising the hash quality,
i.e. the randomness of the hashes. For that, entropy-based hashing chooses a subset of the bytes in a training data set of Strings which has
the same randomness (entropy) as the original Strings. For example, if the Strings are on average 100 bytes long and we pick a subset of 5
bytes, then a hash function will be 95% less expensive to evaluate. For details of the method, refer to [Entropy-Learned Hashing: Constant
Time Hashing with Controllable Uniformity](https://doi.org/10.1145/3514221.3517894).

Entropy-learned hashing has two phases:

1. A training phase on a representative but typically small set of Strings to be hashed. Training consists of two steps:

   - Function `prepareTrainEntropyLearnedHash(data, id)` caches the training data in a global state under a given `id`. It returns dummy
     value `0` on every row.
   - Function `trainEntropyLearnedHash(id)` computes a minimal partial sub-key of the training data stored under `id` in the global
     state. The cached training data in the global state is replaced by the partial key. Dummy value `0` is returned on every row.

2. An evaluation phase where hashes are computed using the previously calculated partial sub-keys. Function `entropyLearnedHash(data, id)`
   hashes `data` using the partial sub-key stored as `id`. CityHash64 is used as the hash function.

The reason that the training phase comprises two steps is that ClickHouse processes data at chunk granularity but entropy-learned hashing
needs to process the entire training set at once.

Since functions `prepareTrainEntropyLearnedHash()` and `trainEntropyLearnedHash()` access global state, they should not be called in
parallel with the same `id`.

**Syntax**

``` sql
prepareTrainEntropyLearnedHash(data, id);
trainEntropyLearnedHash(id);
entropyLearnedHash(data, id);
```

**Example**

```sql
SET allow_experimental_hash_functions=1;
CREATE TABLE tab (col String) ENGINE=Memory;
INSERT INTO tab VALUES ('aa'), ('ba'), ('ca');

SELECT prepareTrainEntropyLearnedHash(col, 'id1') AS prepared FROM tab;
SELECT trainEntropyLearnedHash('id1') AS trained FROM tab;
SELECT entropyLearnedHash(col, 'id1') AS hashes FROM tab;
```

Result:

``` response
┌─prepared─┐
│        0 │
│        0 │
│        0 │
└──────────┘

┌─trained─┐
│       0 │
│       0 │
│       0 │
└─────────┘

┌───────────────hashes─┐
│  2603192927274642682 │
│  4947675599669400333 │
│ 10783339242466472992 │
└──────────────────────┘
```
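
In this example the three training strings already differ in their first byte, so the training step only needs byte position 0, and the final values are simply CityHash64 hashes of the one-byte partial keys 'a', 'b' and 'c' (an illustration based on the byte-selection procedure described above, not an additional guarantee of the function).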

## metroHash64

Produces a 64-bit [MetroHash](http://www.jandrewrogers.com/2015/05/27/metrohash/) hash value.
src/Functions/EntropyLearnedHash.cpp (new file, 395 lines)
@@ -0,0 +1,395 @@
#include <base/defines.h>
#include <base/types.h>
#include <Columns/ColumnsNumber.h>
#include <Columns/ColumnString.h>
#include <Common/Exception.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <Interpreters/Context.h>

/// Implementation of entropy-learned hashing: https://doi.org/10.1145/3514221.3517894
/// If you change something in this file, please don't deviate too much from the pseudocode in the paper!

/// TODOs for future work:
/// - allow to specify an arbitrary hash function (currently always CityHash is used)
/// - allow function chaining a la entropyLearnedHash(trainEntropyLearnedHash())
/// - support more datatypes for data (besides String)


namespace DB
{

namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;
    extern const int ILLEGAL_COLUMN;
    extern const int SUPPORT_IS_DISABLED;
}

namespace
{

using PartialKeyPositions = std::vector<size_t>;
using Entropies = std::vector<size_t>;

void getPartialKey(std::string_view key, const PartialKeyPositions & partial_key_positions, String & result)
{
    result.clear();
    result.reserve(partial_key_positions.size());

    for (auto partial_key_position : partial_key_positions)
        if (partial_key_position < key.size())
            result.push_back(key[partial_key_position]);
}

bool allPartialKeysAreUnique(const std::vector<std::string_view> & keys, const PartialKeyPositions & partial_key_positions)
{
    std::unordered_set<String> unique_partial_keys;
    unique_partial_keys.reserve(keys.size());
    String partial_key;

    for (const auto & key : keys)
    {
        getPartialKey(key, partial_key_positions, partial_key);
        if (!unique_partial_keys.insert(partial_key).second)
            return false;
    }

    return true;
}

// NextByte returns position of byte which adds the most entropy and the new entropy
std::pair<size_t, size_t> nextByte(const std::vector<std::string_view> & keys, size_t max_len, PartialKeyPositions & partial_key_positions)
{
    size_t min_collisions = std::numeric_limits<size_t>::max();
    size_t best_position = 0;

    std::unordered_map<String, size_t> count_table;
    count_table.reserve(keys.size());

    String partial_key;

    for (size_t i = 0; i < max_len; ++i)
    {
        count_table.clear();

        partial_key_positions.push_back(i);
        size_t collisions = 0;
        for (const auto & key : keys)
        {
            getPartialKey(key, partial_key_positions, partial_key);
            collisions += count_table[partial_key]++;
        }

        if (collisions < min_collisions)
        {
            min_collisions = collisions;
            best_position = i;
        }
        partial_key_positions.pop_back();
    }

    return {best_position, min_collisions};
}

std::pair<PartialKeyPositions, Entropies> chooseBytes(const std::vector<std::string_view> & train_data)
{
    if (train_data.size() <= 1)
        return {};

    PartialKeyPositions partial_key_positions;
    Entropies entropies;

    size_t max_len = 0; /// length of the longest key in training data
    for (const auto & key : train_data)
        max_len = std::max(max_len, key.size());

    while (!allPartialKeysAreUnique(train_data, partial_key_positions))
    {
        auto [new_position, new_entropy] = nextByte(train_data, max_len, partial_key_positions);
        if (!entropies.empty() && new_entropy == entropies.back())
            break;
        partial_key_positions.push_back(new_position);
        entropies.push_back(new_entropy);
    }
    return {partial_key_positions, entropies};
}

/// Contains global state to convey information between SQL functions
/// - prepareTrainEntropyLearnedHash(),
/// - trainEntropyLearnedHash() and
/// - entropyLearnedHash().
///
/// The reason this machinery is necessary is that ClickHouse processes data in chunks of unpredictable size, yet the training step of
/// entropy-learned hashing needs to process *all* training data in one go. The downside is that the training step becomes quite expensive :-(
class EntropyLearnedHashGlobalState
{
public:
    static EntropyLearnedHashGlobalState & instance()
    {
        static EntropyLearnedHashGlobalState instance;
        return instance;
    }

    /// Called by prepareTrainEntropyLearnedHash()
    void cacheTrainingSample(const String & user_name, const String & id, IColumn::MutablePtr column)
    {
        std::lock_guard lock(mutex);
        auto & ids_for_user = global_state[user_name];
        auto & training_samples_for_id = ids_for_user[id].training_samples;
        training_samples_for_id.push_back(std::move(column));
    }

    void train(const String & user_name, const String & id)
    {
        std::lock_guard lock(mutex);
        auto & ids_for_user = global_state[user_name];
        auto & training_samples = ids_for_user[id].training_samples;

        if (training_samples.empty())
            return;

        auto & concatenated_training_sample = training_samples[0];
        for (size_t i = 1; i < training_samples.size(); ++i)
        {
            auto & other_training_sample = training_samples[i];
            concatenated_training_sample->insertRangeFrom(*other_training_sample, 0, other_training_sample->size());
        }

        const ColumnString * concatenated_training_sample_string = checkAndGetColumn<ColumnString>(*concatenated_training_sample);
        if (!concatenated_training_sample_string)
            throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column");

        const size_t num_rows = concatenated_training_sample_string->size();
        std::vector<std::string_view> training_data;
        for (size_t i = 0; i < num_rows; ++i)
        {
            std::string_view string_view = concatenated_training_sample_string->getDataAt(i).toView();
            training_data.emplace_back(string_view);
        }

        PartialKeyPositions partial_key_positions = chooseBytes(training_data).first;

        ids_for_user[id].partial_key_positions = partial_key_positions;
        training_samples.clear();
    }

    const PartialKeyPositions & getPartialKeyPositions(const String & user_name, const String & id) const
    {
        std::lock_guard lock(mutex);
        auto it_user = global_state.find(user_name);
        if (it_user == global_state.end())
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Id {} not registered for user in entropy learned hashing", id);
        auto it_id = it_user->second.find(id);
        if (it_id == it_user->second.end())
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Id {} not registered for user in entropy learned hashing", id);
        return it_id->second.partial_key_positions;
    }

private:
    mutable std::mutex mutex;

    /// The state.
    struct ColumnsAndPartialKeyPositions
    {
        /// Caches training data chunks. Filled by prepareTrainEntropyLearnedHash(), cleared by trainEntropyLearnedHash().
        MutableColumns training_samples;
        /// The result of the training phase. Filled by trainEntropyLearnedHash().
        PartialKeyPositions partial_key_positions;
    };

    /// Maps a state id to the state.
    using IdToColumnsAndPartialKeyPositions = std::map<String, ColumnsAndPartialKeyPositions>;

    /// Maps the user name to a state id. As a result, the state id is unique at user scope.
    using UserNameToId = std::map<String, IdToColumnsAndPartialKeyPositions>;

    UserNameToId global_state TSA_GUARDED_BY(mutex);
};

}

/// Copies all chunks of the training sample column into the global state under a given id.
class FunctionPrepareTrainEntropyLearnedHash : public IFunction
{
public:
    static constexpr auto name = "prepareTrainEntropyLearnedHash";
    static FunctionPtr create(ContextPtr context)
    {
        if (!context->getSettings().allow_experimental_hash_functions)
            throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
                "Entropy-learned hashing is experimental. Set `allow_experimental_hash_functions` setting to enable it");

        return std::make_shared<FunctionPrepareTrainEntropyLearnedHash>(context->getUserName());
    }
    explicit FunctionPrepareTrainEntropyLearnedHash(const String & user_name_) : IFunction(), user_name(user_name_) {}

    String getName() const override { return name; }
    size_t getNumberOfArguments() const override { return 2; }
    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
    bool useDefaultImplementationForConstants() const override { return true; }
    ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }

    DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
    {
        FunctionArgumentDescriptors args{
            {"data", &isString<IDataType>, nullptr, "String"},
            {"id", &isString<IDataType>, nullptr, "String"}
        };

        validateFunctionArgumentTypes(*this, arguments, args);

        return std::make_shared<DataTypeUInt8>();
    }

    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t) const override
    {
        const IColumn * id_col = arguments[1].column.get();
        const ColumnConst * id_col_const = checkAndGetColumn<ColumnConst>(id_col);
        const String id = id_col_const->getValue<String>();

        IColumn::Ptr data_col = arguments[0].column;
        IColumn::MutablePtr data_col_mutable = IColumn::mutate(data_col);

        auto & global_state = EntropyLearnedHashGlobalState::instance();
        global_state.cacheTrainingSample(user_name, id, std::move(data_col_mutable));

        const size_t num_rows = data_col->size();
        return result_type->createColumnConst(num_rows, 0u); /// dummy output
    }
private:
    const String user_name;
};


/// 1. Concatenates the training samples of a given id in the global state.
/// 2. Computes the partial key positions from the concatenated training samples and stores that in the global state.
/// 3. clear()-s the training samples in the global state.
class FunctionTrainEntropyLearnedHash : public IFunction
{
public:
    static constexpr auto name = "trainEntropyLearnedHash";
    static FunctionPtr create(ContextPtr context)
    {
        if (!context->getSettings().allow_experimental_hash_functions)
            throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
                "Entropy-learned hashing is experimental. Set `allow_experimental_hash_functions` setting to enable it");
        return std::make_shared<FunctionTrainEntropyLearnedHash>(context->getUserName());
    }
    explicit FunctionTrainEntropyLearnedHash(const String & user_name_) : IFunction(), user_name(user_name_) {}

    String getName() const override { return name; }
    size_t getNumberOfArguments() const override { return 1; }
    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
    bool useDefaultImplementationForConstants() const override { return false; }

    DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
    {
        FunctionArgumentDescriptors args{
            {"id", &isString<IDataType>, nullptr, "String"}
        };

        validateFunctionArgumentTypes(*this, arguments, args);

        return std::make_shared<DataTypeUInt8>();
    }

    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t) const override
    {
        const IColumn * id_col = arguments[0].column.get();
        const ColumnConst * id_col_const = checkAndGetColumn<ColumnConst>(id_col);
        if (!id_col_const)
            throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}",
                arguments.begin()->column->getName(), getName());

        auto & global_state = EntropyLearnedHashGlobalState::instance();

        const String id = id_col_const->getValue<String>();
        global_state.train(user_name, id);

        const size_t num_rows = id_col->size();
        return result_type->createColumnConst(num_rows, 0u); /// dummy output
    }
private:
    const String user_name;
};


/// Hashes input strings using partial key positions stored in the global state.
class FunctionEntropyLearnedHash : public IFunction
{
public:
    static constexpr auto name = "entropyLearnedHash";
    static FunctionPtr create(ContextPtr context)
    {
        if (!context->getSettings().allow_experimental_hash_functions)
            throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
                "Entropy-learned hashing is experimental. Set `allow_experimental_hash_functions` setting to enable it");
        return std::make_shared<FunctionEntropyLearnedHash>(context->getUserName());
    }
    explicit FunctionEntropyLearnedHash(const String & user_name_) : IFunction(), user_name(user_name_) {}

    String getName() const override { return name; }
    size_t getNumberOfArguments() const override { return 2; }
    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
    bool useDefaultImplementationForConstants() const override { return true; }
    ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }


    DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
    {
        FunctionArgumentDescriptors args{
            {"data", &isString<IDataType>, nullptr, "String"},
            {"id", &isString<IDataType>, nullptr, "String"}
        };

        validateFunctionArgumentTypes(*this, arguments, args);

        return std::make_shared<DataTypeUInt64>();
    }

    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override
    {
        const IColumn * id_col = arguments.back().column.get();
        const ColumnConst * id_col_const = checkAndGetColumn<ColumnConst>(id_col);
        const String id = id_col_const->getValue<String>();

        const auto & global_state = EntropyLearnedHashGlobalState::instance();
        const auto & partial_key_positions = global_state.getPartialKeyPositions(user_name, id);

        const auto * data_col = arguments[0].column.get();
        if (const auto * col_data_string = checkAndGetColumn<ColumnString>(data_col))
        {
            const size_t num_rows = col_data_string->size();
            auto col_res = ColumnUInt64::create(num_rows);

            auto & col_res_vec = col_res->getData();
            String partial_key;
            for (size_t i = 0; i < num_rows; ++i)
            {
                std::string_view string_ref = col_data_string->getDataAt(i).toView();
                getPartialKey(string_ref, partial_key_positions, partial_key);
                col_res_vec[i] = CityHash_v1_0_2::CityHash64(partial_key.data(), partial_key.size());
            }

            return col_res;
        }
        else
            throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}",
                arguments.begin()->column->getName(), getName());
    }
private:
    const String user_name;
};

REGISTER_FUNCTION(EntropyLearnedHash)
{
    factory.registerFunction<FunctionPrepareTrainEntropyLearnedHash>();
    factory.registerFunction<FunctionTrainEntropyLearnedHash>();
    factory.registerFunction<FunctionEntropyLearnedHash>();
}

}
@@ -195,18 +195,14 @@ void WriteBufferFromS3::finalizeImpl()

    if (request_settings.check_objects_after_upload)
    {
        LOG_TRACE(log, "Checking object {} exists after upload", key);
        S3::checkObjectExists(*client_ptr, bucket, key, {}, request_settings, /* for_disk_s3= */ write_settings.for_object_storage, "Immediately after upload");

        LOG_TRACE(log, "Checking object {} has size as expected {}", key, total_size);
        size_t actual_size = S3::getObjectSize(*client_ptr, bucket, key, {}, request_settings, /* for_disk_s3= */ write_settings.for_object_storage);
        if (actual_size != total_size)
            throw Exception(
                ErrorCodes::S3_ERROR,
                "Object {} from bucket {} has unexpected size {} after upload, expected size {}, it's a bug in S3 or S3 API.",
                key, bucket, actual_size, total_size);

        LOG_TRACE(log, "Object {} exists after upload", key);
    }
}

@@ -286,8 +282,6 @@ void WriteBufferFromS3::reallocateFirstBuffer()
    WriteBuffer::set(memory.data() + hidden_size, memory.size() - hidden_size);

    chassert(offset() == 0);

    LOG_TRACE(log, "Reallocated first buffer with size {}. {}", memory.size(), getLogDetails());
}

void WriteBufferFromS3::detachBuffer()
@@ -310,8 +304,6 @@ void WriteBufferFromS3::allocateFirstBuffer()
    const auto size = std::min(size_t(DBMS_DEFAULT_BUFFER_SIZE), max_first_buffer);
    memory = Memory(size);
    WriteBuffer::set(memory.data(), memory.size());

    LOG_TRACE(log, "Allocated first buffer with size {}. {}", memory.size(), getLogDetails());
}

void WriteBufferFromS3::allocateBuffer()
@@ -36,8 +36,6 @@ ThreadPoolCallbackRunner<void> WriteBufferFromS3::TaskTracker::syncRunner()

void WriteBufferFromS3::TaskTracker::waitAll()
{
    LOG_TEST(log, "waitAll, in queue {}", futures.size());

    /// Exceptions are propagated
    for (auto & future : futures)
    {
@@ -51,8 +49,6 @@ void WriteBufferFromS3::TaskTracker::waitAll()

void WriteBufferFromS3::TaskTracker::safeWaitAll()
{
    LOG_TEST(log, "safeWaitAll, wait in queue {}", futures.size());

    for (auto & future : futures)
    {
        if (future.valid())
@@ -76,7 +72,6 @@ void WriteBufferFromS3::TaskTracker::safeWaitAll()

void WriteBufferFromS3::TaskTracker::waitIfAny()
{
    LOG_TEST(log, "waitIfAny, in queue {}", futures.size());
    if (futures.empty())
        return;

@@ -101,8 +96,6 @@ void WriteBufferFromS3::TaskTracker::waitIfAny()

    watch.stop();
    ProfileEvents::increment(ProfileEvents::WriteBufferFromS3WaitInflightLimitMicroseconds, watch.elapsedMicroseconds());

    LOG_TEST(log, "waitIfAny ended, in queue {}", futures.size());
}

void WriteBufferFromS3::TaskTracker::add(Callback && func)
@@ -147,8 +140,6 @@ void WriteBufferFromS3::TaskTracker::waitTilInflightShrink()
    if (!max_tasks_inflight)
        return;

    LOG_TEST(log, "waitTilInflightShrink, in queue {}", futures.size());

    Stopwatch watch;

    /// Alternative approach is to wait until at least futures.size() - max_tasks_inflight element are finished
@@ -178,8 +169,6 @@ void WriteBufferFromS3::TaskTracker::waitTilInflightShrink()

    watch.stop();
    ProfileEvents::increment(ProfileEvents::WriteBufferFromS3WaitInflightLimitMicroseconds, watch.elapsedMicroseconds());

    LOG_TEST(log, "waitTilInflightShrink ended, in queue {}", futures.size());
}

bool WriteBufferFromS3::TaskTracker::isAsync() const
@@ -266,6 +266,7 @@ encodeURLComponent
encodeURLFormComponent
encodeXMLComponent
endsWith
entropyLearnedHash
equals
erf
erfc
@@ -558,6 +559,7 @@ positionCaseInsensitive
positionCaseInsensitiveUTF8
positionUTF8
pow
prepareTrainEntropyLearnedHash
proportionsZTest
protocol
queryID
@@ -862,6 +864,7 @@ toYear
toYearWeek
today
tokens
trainEntropyLearnedHash
transactionID
transactionLatestSnapshot
transactionOldestSnapshot
@@ -0,0 +1,18 @@
0
0
0
0
0
0
2603192927274642682
4947675599669400333
10783339242466472992
0
0
0
0
0
0
2603192927274642682
4947675599669400333
10783339242466472992
tests/queries/0_stateless/02734_entropy_learned_hashing.sql (new file, 30 lines)
@@ -0,0 +1,30 @@
-- Tags: no-parallel
-- no-parallel because entropy-learned hash uses global state

SET allow_experimental_hash_functions = 1;

-- no commonalities between keys
DROP TABLE IF EXISTS tbl1;
CREATE TABLE tbl1 (x String) ENGINE=Memory;
INSERT INTO tbl1 VALUES ('a'), ('b'), ('c');
SELECT prepareTrainEntropyLearnedHash(x, 'id1') FROM tbl1;
SELECT trainEntropyLearnedHash('id1') FROM tbl1;
SELECT entropyLearnedHash(x, 'id1') FROM tbl1;

-- with commonalities between keys
DROP TABLE IF EXISTS tbl2;
CREATE TABLE tbl2 (x String) ENGINE=Memory;
INSERT INTO tbl2 VALUES ('aa'), ('ba'), ('ca');
SELECT prepareTrainEntropyLearnedHash(x, 'id2') FROM tbl2;
SELECT trainEntropyLearnedHash('id2') FROM tbl2;
SELECT entropyLearnedHash(x, 'id2') FROM tbl2;

-- negative tests
SELECT prepareTrainEntropyLearnedHash(x, 1) FROM tbl1; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
SELECT prepareTrainEntropyLearnedHash(1, 'id1') FROM tbl1; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
SELECT trainEntropyLearnedHash(1) FROM tbl1; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
SELECT entropyLearnedHash(1, 'id1') FROM tbl1; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
SELECT entropyLearnedHash(x, 'non-existing id') FROM tbl1; -- { serverError BAD_ARGUMENTS }

DROP TABLE tbl1;
DROP TABLE tbl2;
@@ -2,6 +2,7 @@ personal_ws-1.1 en 543
AArch
ACLs
AMQP
ARMv
ASLR
ASan
Actian
@@ -12,6 +13,8 @@ AvroConfluent
BSON
BSONEachRow
Bool
BuilderBinAarch
BuilderBinAmd
CCTOOLS
CLion
CMake
@@ -27,6 +30,7 @@ ClickHouse's
ClickableSquare
CodeBlock
CodeLLDB
Compat
Config
ConnectionDetails
Contrib
@@ -213,6 +217,7 @@ anonymized
ansi
async
atomicity
autogenerated
autogeneration
autostart
avro
@@ -539,6 +544,7 @@ uint
unary
unencrypted
unixodbc
unoptimized
url
userspace
userver