Merge pull request #68606 from yariks5s/remove_initial_underscores_hive

Prioritizing of virtual columns in hive partitioning
Yarik Briukhovetskyi, 2024-08-23 16:32:13 +00:00, committed by GitHub
commit 55116da80d
20 changed files with 155 additions and 232 deletions
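
In short: with use_hive_partitioning enabled, partition keys parsed from the path now surface under their plain names and take priority over same-named columns read from the file, replacing the old underscore-prefixed virtuals (_key, with extra underscores prepended on name clashes). A rough sketch of the user-visible change, written against the test bucket queried later in this diff (illustrative only, not part of the commit):

SET use_hive_partitioning = 1;

-- Before this commit: the partition key was exposed as an underscore-prefixed virtual column.
SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet');

-- After this commit: the key is queried by its plain name; if the file also contains a
-- physical column0, that column is dropped from the read schema and the partition value wins.
SELECT *, column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet');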


@@ -444,8 +444,8 @@ StorageHive::StorageHive(
storage_metadata.setComment(comment_);
storage_metadata.partition_key = KeyDescription::getKeyFromAST(partition_by_ast, storage_metadata.columns, getContext());
+ setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns, getContext()));
setInMemoryMetadata(storage_metadata);
- setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), getContext()));
}
void StorageHive::lazyInitialize()


@@ -94,7 +94,7 @@ StorageObjectStorage::StorageObjectStorage(
if (sample_path.empty() && context->getSettingsRef().use_hive_partitioning)
sample_path = getPathSample(metadata, context);
- setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns(), context, sample_path, format_settings));
+ setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.columns, context, sample_path, format_settings));
setInMemoryMetadata(metadata);
}


@@ -68,7 +68,7 @@ StorageObjectStorageCluster::StorageObjectStorageCluster(
if (sample_path.empty() && context_->getSettingsRef().use_hive_partitioning)
sample_path = getPathSample(metadata, context_);
- setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns(), context_, sample_path));
+ setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.columns, context_, sample_path));
setInMemoryMetadata(metadata);
}


@@ -208,7 +208,7 @@ Chunk StorageObjectStorageSource::generate()
.filename = &filename,
.last_modified = object_info->metadata->last_modified,
.etag = &(object_info->metadata->etag)
- }, getContext(), read_from_format_info.columns_description);
+ }, getContext());
const auto & partition_columns = configuration->getPartitionColumns();
if (!partition_columns.empty() && chunk_size && chunk.hasColumns())
@@ -280,7 +280,7 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade
const std::shared_ptr<IIterator> & file_iterator,
const ConfigurationPtr & configuration,
const ObjectStoragePtr & object_storage,
- const ReadFromFormatInfo & read_from_format_info,
+ ReadFromFormatInfo & read_from_format_info,
const std::optional<FormatSettings> & format_settings,
const std::shared_ptr<const KeyCondition> & key_condition_,
const ContextPtr & context_,


@@ -74,7 +74,7 @@ protected:
const UInt64 max_block_size;
const bool need_only_count;
const size_t max_parsing_threads;
- const ReadFromFormatInfo read_from_format_info;
+ ReadFromFormatInfo read_from_format_info;
const std::shared_ptr<ThreadPool> create_reader_pool;
std::shared_ptr<IIterator> file_iterator;
@@ -122,7 +122,7 @@ protected:
const std::shared_ptr<IIterator> & file_iterator,
const ConfigurationPtr & configuration,
const ObjectStoragePtr & object_storage,
- const ReadFromFormatInfo & read_from_format_info,
+ ReadFromFormatInfo & read_from_format_info,
const std::optional<FormatSettings> & format_settings,
const std::shared_ptr<const KeyCondition> & key_condition_,
const ContextPtr & context_,


@@ -524,7 +524,7 @@ Chunk ObjectStorageQueueSource::generateImpl()
{
.path = path,
.size = reader.getObjectInfo()->metadata->size_bytes
- }, getContext(), read_from_format_info.columns_description);
+ }, getContext());
return chunk;
}


@@ -128,7 +128,7 @@ private:
const std::shared_ptr<FileIterator> file_iterator;
const ConfigurationPtr configuration;
const ObjectStoragePtr object_storage;
- const ReadFromFormatInfo read_from_format_info;
+ ReadFromFormatInfo read_from_format_info;
const std::optional<FormatSettings> format_settings;
const ObjectStorageQueueSettings queue_settings;
const std::shared_ptr<ObjectStorageQueueMetadata> files_metadata;


@@ -169,7 +169,7 @@ StorageObjectStorageQueue::StorageObjectStorageQueue(
storage_metadata.setColumns(columns);
storage_metadata.setConstraints(constraints_);
storage_metadata.setComment(comment);
- setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), context_));
+ setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns, context_));
setInMemoryMetadata(storage_metadata);
LOG_INFO(log, "Using zookeeper path: {}", zk_path.string());


@@ -1112,9 +1112,9 @@ void StorageFile::setStorageMetadata(CommonArguments args)
storage_metadata.setConstraints(args.constraints);
storage_metadata.setComment(args.comment);
- setInMemoryMetadata(storage_metadata);
- setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), args.getContext(), paths.empty() ? "" : paths[0], format_settings));
+ setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns, args.getContext(), paths.empty() ? "" : paths[0], format_settings));
+ setInMemoryMetadata(storage_metadata);
}
@@ -1468,7 +1468,7 @@ Chunk StorageFileSource::generate()
.size = current_file_size,
.filename = (filename_override.has_value() ? &filename_override.value() : nullptr),
.last_modified = current_file_last_modified
- }, getContext(), columns_description);
+ }, getContext());
return chunk;
}


@@ -60,8 +60,8 @@ StorageFileCluster::StorageFileCluster(
}
storage_metadata.setConstraints(constraints_);
+ setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns, context, paths.empty() ? "" : paths[0]));
setInMemoryMetadata(storage_metadata);
- setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), context, paths.empty() ? "" : paths[0]));
}
void StorageFileCluster::updateQueryToSendIfNeeded(DB::ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const DB::ContextPtr & context)


@@ -165,9 +165,9 @@ IStorageURLBase::IStorageURLBase(
storage_metadata.setConstraints(constraints_);
storage_metadata.setComment(comment);
- setInMemoryMetadata(storage_metadata);
- setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), context_, getSampleURI(uri, context_), format_settings));
+ setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns, context_, getSampleURI(uri, context_), format_settings));
+ setInMemoryMetadata(storage_metadata);
}
@@ -435,7 +435,7 @@ Chunk StorageURLSource::generate()
{
.path = curr_uri.getPath(),
.size = current_file_size,
- }, getContext(), columns_description);
+ }, getContext());
return chunk;
}


@@ -75,8 +75,8 @@ StorageURLCluster::StorageURLCluster(
}
storage_metadata.setConstraints(constraints_);
+ setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns, context, getSampleURI(uri, context)));
setInMemoryMetadata(storage_metadata);
- setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), context, getSampleURI(uri, context)));
}
void StorageURLCluster::updateQueryToSendIfNeeded(ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context)


@@ -129,36 +129,45 @@ NameSet getVirtualNamesForFileLikeStorage()
return {"_path", "_file", "_size", "_time", "_etag"};
}
- std::unordered_map<std::string, std::string> parseHivePartitioningKeysAndValues(const String & path, const ColumnsDescription & storage_columns)
+ std::unordered_map<std::string, std::string> parseHivePartitioningKeysAndValues(const String & path)
{
std::string pattern = "([^/]+)=([^/]+)/";
re2::StringPiece input_piece(path);
std::unordered_map<std::string, std::string> key_values;
std::string key, value;
- std::unordered_set<String> used_keys;
+ std::unordered_map<std::string, std::string> used_keys;
while (RE2::FindAndConsume(&input_piece, pattern, &key, &value))
{
- if (used_keys.contains(key))
- throw Exception(ErrorCodes::INCORRECT_DATA, "Path '{}' to file with enabled hive-style partitioning contains duplicated partition key {}, only unique keys are allowed", path, key);
- used_keys.insert(key);
+ auto it = used_keys.find(key);
+ if (it != used_keys.end() && it->second != value)
+ throw Exception(ErrorCodes::INCORRECT_DATA, "Path '{}' to file with enabled hive-style partitioning contains duplicated partition key {} with different values, only unique keys are allowed", path, key);
+ used_keys.insert({key, value});
- auto col_name = "_" + key;
- while (storage_columns.has(col_name))
- col_name = "_" + col_name;
+ auto col_name = key;
key_values[col_name] = value;
}
return key_values;
}
- VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription & storage_columns, const ContextPtr & context, const std::string & path, std::optional<FormatSettings> format_settings_)
+ VirtualColumnsDescription getVirtualsForFileLikeStorage(ColumnsDescription & storage_columns, const ContextPtr & context, const std::string & path, std::optional<FormatSettings> format_settings_)
{
VirtualColumnsDescription desc;
auto add_virtual = [&](const auto & name, const auto & type)
{
if (storage_columns.has(name))
+ {
+ if (!context->getSettingsRef().use_hive_partitioning)
return;
+ if (storage_columns.size() == 1)
+ throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot use hive partitioning for file {}: it contains only partition columns. Disable use_hive_partitioning setting to read this file", path);
+ auto local_type = storage_columns.get(name).type;
+ storage_columns.remove(name);
+ desc.addEphemeral(name, local_type, "");
+ return;
+ }
desc.addEphemeral(name, type, "");
};
@@ -171,7 +180,7 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription
if (context->getSettingsRef().use_hive_partitioning)
{
- auto map = parseHivePartitioningKeysAndValues(path, storage_columns);
+ auto map = parseHivePartitioningKeysAndValues(path);
auto format_settings = format_settings_ ? *format_settings_ : getFormatSettings(context);
for (auto & item : map)
{
@@ -244,11 +253,11 @@ ColumnPtr getFilterByPathAndFileIndexes(const std::vector<String> & paths, const
void addRequestedFileLikeStorageVirtualsToChunk(
Chunk & chunk, const NamesAndTypesList & requested_virtual_columns,
- VirtualsForFileLikeStorage virtual_values, ContextPtr context, const ColumnsDescription & columns)
+ VirtualsForFileLikeStorage virtual_values, ContextPtr context)
{
std::unordered_map<std::string, std::string> hive_map;
if (context->getSettingsRef().use_hive_partitioning)
- hive_map = parseHivePartitioningKeysAndValues(virtual_values.path, columns);
+ hive_map = parseHivePartitioningKeysAndValues(virtual_values.path);
for (const auto & virtual_column : requested_virtual_columns)
{

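The rewritten parser and add_virtual lambda above change the semantics in three ways: a key may now repeat along the path as long as every occurrence carries the same value, conflicting values for one key still raise INCORRECT_DATA, and a file whose columns are all shadowed by partition keys is rejected outright. A minimal sketch in terms of the stateless-test data paths that appear later in this diff (illustrative only):

SET use_hive_partitioning = 1;

-- Duplicate key, same value: accepted, the pair is recorded once.
SELECT a FROM file('data_hive/partitioning/a=b/a=b/sample.parquet') LIMIT 1;

-- Duplicate key, different values: throws INCORRECT_DATA.
SELECT *, column0 FROM file('data_hive/partitioning/column0=Elizabeth/column0=Elizabeth1/sample.parquet');

-- Schema made up entirely of partition columns: throws "Cannot use hive partitioning
-- for file ..."; the file stays readable with use_hive_partitioning = 0.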

@@ -70,7 +70,7 @@ auto extractSingleValueFromBlock(const Block & block, const String & name)
NameSet getVirtualNamesForFileLikeStorage();
VirtualColumnsDescription getVirtualsForFileLikeStorage(
- const ColumnsDescription & storage_columns,
+ ColumnsDescription & storage_columns,
const ContextPtr & context,
const std::string & sample_path = "",
std::optional<FormatSettings> format_settings_ = std::nullopt);
@@ -105,7 +105,7 @@ struct VirtualsForFileLikeStorage
void addRequestedFileLikeStorageVirtualsToChunk(
Chunk & chunk, const NamesAndTypesList & requested_virtual_columns,
- VirtualsForFileLikeStorage virtual_values, ContextPtr context, const ColumnsDescription & columns);
+ VirtualsForFileLikeStorage virtual_values, ContextPtr context);
}
}


@@ -1513,19 +1513,19 @@ def test_hive_partitioning_with_one_parameter(cluster):
azure_query(
node,
f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}',"
f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values}",
f" container='cont', blob_path='{path}', format='CSVWithNames', compression='auto', structure='{table_format}') VALUES {values}",
settings={"azure_truncate_on_insert": 1},
)
query = (
f"SELECT column1, column2, _file, _path, _column1 FROM azureBlobStorage(azure_conf2, "
f"SELECT column2, _file, _path, column1 FROM azureBlobStorage(azure_conf2, "
f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', "
f"blob_path='{path}', format='CSV', structure='{table_format}')"
f"blob_path='{path}', format='CSVWithNames', structure='{table_format}')"
)
assert azure_query(
node, query, settings={"use_hive_partitioning": 1}
).splitlines() == [
"Elizabeth\tGordon\tsample.csv\t{bucket}/{max_path}\tElizabeth".format(
"Gordon\tsample.csv\t{bucket}/{max_path}\tElizabeth".format(
bucket="cont", max_path=path
)
]
@@ -1533,14 +1533,14 @@ def test_hive_partitioning_with_one_parameter(cluster):
query = (
f"SELECT column2 FROM azureBlobStorage(azure_conf2, "
f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', "
f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column1=_column1;"
f"blob_path='{path}', format='CSVWithNames', structure='{table_format}');"
)
assert azure_query(
node, query, settings={"use_hive_partitioning": 1}
).splitlines() == ["Gordon"]
- def test_hive_partitioning_with_two_parameters(cluster):
+ def test_hive_partitioning_with_all_parameters(cluster):
# type: (ClickHouseCluster) -> None
node = cluster.instances["node"] # type: ClickHouseInstance
table_format = "column1 String, column2 String"
@@ -1551,40 +1551,19 @@ def test_hive_partitioning_with_two_parameters(cluster):
azure_query(
node,
f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}',"
f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values_1}, {values_2}",
f" container='cont', blob_path='{path}', format='CSVWithNames', compression='auto', structure='{table_format}') VALUES {values_1}, {values_2}",
settings={"azure_truncate_on_insert": 1},
)
query = (
f"SELECT column1, column2, _file, _path, _column1, _column2 FROM azureBlobStorage(azure_conf2, "
f"SELECT column1, column2, _file, _path FROM azureBlobStorage(azure_conf2, "
f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', "
f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column1=_column1;"
f"blob_path='{path}', format='CSVWithNames', structure='{table_format}');"
)
- assert azure_query(
- node, query, settings={"use_hive_partitioning": 1}
- ).splitlines() == [
- "Elizabeth\tGordon\tsample.csv\t{bucket}/{max_path}\tElizabeth\tGordon".format(
- bucket="cont", max_path=path
- )
- ]
+ pattern = r"DB::Exception: Cannot use hive partitioning for file"
- query = (
- f"SELECT column1 FROM azureBlobStorage(azure_conf2, "
- f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', "
- f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column2=_column2;"
- )
- assert azure_query(
- node, query, settings={"use_hive_partitioning": 1}
- ).splitlines() == ["Elizabeth"]
- query = (
- f"SELECT column1 FROM azureBlobStorage(azure_conf2, "
- f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', "
- f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column2=_column2 AND column1=_column1;"
- )
- assert azure_query(
- node, query, settings={"use_hive_partitioning": 1}
- ).splitlines() == ["Elizabeth"]
+ with pytest.raises(Exception, match=pattern):
+ azure_query(node, query, settings={"use_hive_partitioning": 1})
def test_hive_partitioning_without_setting(cluster):
@@ -1593,19 +1572,19 @@ def test_hive_partitioning_without_setting(cluster):
table_format = "column1 String, column2 String"
values_1 = f"('Elizabeth', 'Gordon')"
values_2 = f"('Emilia', 'Gregor')"
path = "a/column1=Elizabeth/column2=Gordon/sample.csv"
path = "a/column1=Elizabeth/column2=Gordon/column3=Gordon/sample.csv"
azure_query(
node,
f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}',"
f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values_1}, {values_2}",
f" container='cont', blob_path='{path}', format='CSVWithNames', compression='auto', structure='{table_format}') VALUES {values_1}, {values_2}",
settings={"azure_truncate_on_insert": 1},
)
query = (
f"SELECT column1, column2, _file, _path, _column1, _column2 FROM azureBlobStorage(azure_conf2, "
f"SELECT column1, column2, _file, _path, column3 FROM azureBlobStorage(azure_conf2, "
f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', "
f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column1=_column1;"
f"blob_path='{path}', format='CSVWithNames', structure='{table_format}');"
)
pattern = re.compile(
r"DB::Exception: Unknown expression identifier '.*' in scope.*", re.DOTALL


@@ -1259,33 +1259,21 @@ def test_respect_object_existence_on_partitioned_write(started_cluster):
def test_hive_partitioning_with_one_parameter(started_cluster):
hdfs_api = started_cluster.hdfs_api
hdfs_api.write_data(f"/column0=Elizabeth/parquet_1", f"Elizabeth\tGordon\n")
assert hdfs_api.read_data(f"/column0=Elizabeth/parquet_1") == f"Elizabeth\tGordon\n"
hdfs_api.write_data(
f"/column0=Elizabeth/file_1", f"column0,column1\nElizabeth,Gordon\n"
)
assert (
hdfs_api.read_data(f"/column0=Elizabeth/file_1")
== f"column0,column1\nElizabeth,Gordon\n"
)
r = node1.query(
"SELECT _column0 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/parquet_1', 'TSV')",
"SELECT column0 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/file_1', 'CSVWithNames')",
settings={"use_hive_partitioning": 1},
)
assert r == f"Elizabeth\n"
- def test_hive_partitioning_with_two_parameters(started_cluster):
- hdfs_api = started_cluster.hdfs_api
- hdfs_api.write_data(
- f"/column0=Elizabeth/column1=Gordon/parquet_2", f"Elizabeth\tGordon\n"
- )
- assert (
- hdfs_api.read_data(f"/column0=Elizabeth/column1=Gordon/parquet_2")
- == f"Elizabeth\tGordon\n"
- )
- r = node1.query(
- "SELECT _column1 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/column1=Gordon/parquet_2', 'TSV');",
- settings={"use_hive_partitioning": 1},
- )
- assert r == f"Gordon\n"
def test_hive_partitioning_without_setting(started_cluster):
hdfs_api = started_cluster.hdfs_api
hdfs_api.write_data(
@@ -1301,7 +1289,7 @@ def test_hive_partitioning_without_setting(started_cluster):
with pytest.raises(QueryRuntimeException, match=pattern):
node1.query(
f"SELECT _column1 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/column1=Gordon/parquet_2', 'TSV');",
f"SELECT column1 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/column1=Gordon/parquet_2', 'TSV');",
settings={"use_hive_partitioning": 0},
)


@@ -1,4 +1,14 @@
TESTING THE FILE HIVE PARTITIONING
+ last Elizabeth
+ Frank Elizabeth
+ Moreno Elizabeth
+ Guzman Elizabeth
+ Stephens Elizabeth
+ Franklin Elizabeth
+ Gibson Elizabeth
+ Greer Elizabeth
+ Delgado Elizabeth
+ Cross Elizabeth
first last Elizabeth
Jorge Frank Elizabeth
Hunter Moreno Elizabeth
@@ -9,56 +19,36 @@ Stanley Gibson Elizabeth
Eugenia Greer Elizabeth
Jeffery Delgado Elizabeth
Clara Cross Elizabeth
- Elizabeth Gordon Elizabeth
- Eva Schmidt Elizabeth Schmidt
- Samuel Schmidt Elizabeth Schmidt
- Eva Schmidt Elizabeth
- Samuel Schmidt Elizabeth
- Elizabeth Gordon Elizabeth Gordon
- Elizabeth Gordon Elizabeth
- Elizabeth Gordon Elizabeth Gordon
- Elizabeth Gordon Elizabeth
- first last Elizabeth
- Jorge Frank Elizabeth
- Hunter Moreno Elizabeth
- Esther Guzman Elizabeth
- Dennis Stephens Elizabeth
- Nettie Franklin Elizabeth
- Stanley Gibson Elizabeth
- Eugenia Greer Elizabeth
- Jeffery Delgado Elizabeth
- Clara Cross Elizabeth
- Elizabeth Gordon Elizabeth
+ last Elizabeth
+ Frank Elizabeth
+ Moreno Elizabeth
+ Guzman Elizabeth
+ Stephens Elizabeth
+ Franklin Elizabeth
+ Gibson Elizabeth
+ Greer Elizabeth
+ Delgado Elizabeth
+ Cross Elizabeth
42 2020-01-01
[1,2,3] 42.42
Array(Int64) LowCardinality(Float64)
101
2070
4081
- 2070
- 2070
+ b
1
1
TESTING THE URL PARTITIONING
- first last Elizabeth
- Jorge Frank Elizabeth
- Hunter Moreno Elizabeth
- Esther Guzman Elizabeth
- Dennis Stephens Elizabeth
- Nettie Franklin Elizabeth
- Stanley Gibson Elizabeth
- Eugenia Greer Elizabeth
- Jeffery Delgado Elizabeth
- Clara Cross Elizabeth
- Elizabeth Gordon Elizabeth
- Eva Schmidt Elizabeth Schmidt
- Samuel Schmidt Elizabeth Schmidt
- Eva Schmidt Elizabeth
- Samuel Schmidt Elizabeth
- Elizabeth Gordon Elizabeth Gordon
- Elizabeth Gordon Elizabeth
- Elizabeth Gordon Elizabeth Gordon
- Elizabeth Gordon Elizabeth
+ last Elizabeth
+ Frank Elizabeth
+ Moreno Elizabeth
+ Guzman Elizabeth
+ Stephens Elizabeth
+ Franklin Elizabeth
+ Gibson Elizabeth
+ Greer Elizabeth
+ Delgado Elizabeth
+ Cross Elizabeth
first last Elizabeth
Jorge Frank Elizabeth
Hunter Moreno Elizabeth
@@ -71,6 +61,16 @@ Jeffery Delgado Elizabeth
Clara Cross Elizabeth
1
TESTING THE S3 PARTITIONING
+ last Elizabeth
+ Frank Elizabeth
+ Moreno Elizabeth
+ Guzman Elizabeth
+ Stephens Elizabeth
+ Franklin Elizabeth
+ Gibson Elizabeth
+ Greer Elizabeth
+ Delgado Elizabeth
+ Cross Elizabeth
first last Elizabeth
Jorge Frank Elizabeth
Hunter Moreno Elizabeth
@@ -81,40 +81,35 @@ Stanley Gibson Elizabeth
Eugenia Greer Elizabeth
Jeffery Delgado Elizabeth
Clara Cross Elizabeth
- Elizabeth Gordon Elizabeth
- Eva Schmidt Elizabeth Schmidt
- Samuel Schmidt Elizabeth Schmidt
- Eva Schmidt Elizabeth
- Samuel Schmidt Elizabeth
- Elizabeth Gordon Elizabeth Gordon
- Elizabeth Gordon Elizabeth
- Elizabeth Gordon Elizabeth Gordon
- Elizabeth Gordon Elizabeth
- first last Elizabeth
- Jorge Frank Elizabeth
- Hunter Moreno Elizabeth
- Esther Guzman Elizabeth
- Dennis Stephens Elizabeth
- Nettie Franklin Elizabeth
- Stanley Gibson Elizabeth
- Eugenia Greer Elizabeth
- Jeffery Delgado Elizabeth
- Clara Cross Elizabeth
- Elizabeth Gordon Elizabeth
+ last Elizabeth
+ Frank Elizabeth
+ Moreno Elizabeth
+ Guzman Elizabeth
+ Stephens Elizabeth
+ Franklin Elizabeth
+ Gibson Elizabeth
+ Greer Elizabeth
+ Delgado Elizabeth
+ Cross Elizabeth
OK
TESTING THE S3CLUSTER PARTITIONING
- first last Elizabeth
- Jorge Frank Elizabeth
- Hunter Moreno Elizabeth
- Esther Guzman Elizabeth
- Dennis Stephens Elizabeth
- Nettie Franklin Elizabeth
- Stanley Gibson Elizabeth
- Eugenia Greer Elizabeth
- Jeffery Delgado Elizabeth
- Clara Cross Elizabeth
- Elizabeth Gordon Elizabeth
- Eva Schmidt Elizabeth Schmidt
- Samuel Schmidt Elizabeth Schmidt
- Eva Schmidt Elizabeth
- Samuel Schmidt Elizabeth
+ last Elizabeth
+ Frank Elizabeth
+ Moreno Elizabeth
+ Guzman Elizabeth
+ Stephens Elizabeth
+ Franklin Elizabeth
+ Gibson Elizabeth
+ Greer Elizabeth
+ Delgado Elizabeth
+ Cross Elizabeth
+ last Elizabeth
+ Frank Elizabeth
+ Moreno Elizabeth
+ Guzman Elizabeth
+ Stephens Elizabeth
+ Franklin Elizabeth
+ Gibson Elizabeth
+ Greer Elizabeth
+ Delgado Elizabeth
+ Cross Elizabeth


@@ -11,48 +11,34 @@ $CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE FILE HIVE PARTITIONING'"
$CLICKHOUSE_LOCAL -n -q """
set use_hive_partitioning = 1;
- SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10;
+ SELECT *, column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10;
- SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/sample.parquet') WHERE column0 = _column0;
+ SELECT *, non_existing_column FROM file('$CURDIR/data_hive/partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10;
+ SELECT *, column0 FROM file('$CURDIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' LIMIT 10;
- SELECT *, _column0, _column1 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1;
- SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1;
- SELECT *, _column0, _column1 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
- SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
- SELECT *, _column0, _column1 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1;
- SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1;
- SELECT *, _column0, _column1 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
- SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
- SELECT *, _non_existing_column FROM file('$CURDIR/data_hive/partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10;
- SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = _column0;
- SELECT _number, _date FROM file('$CURDIR/data_hive/partitioning/number=42/date=2020-01-01/sample.parquet') LIMIT 1;
- SELECT _array, _float FROM file('$CURDIR/data_hive/partitioning/array=[1,2,3]/float=42.42/sample.parquet') LIMIT 1;
- SELECT toTypeName(_array), toTypeName(_float) FROM file('$CURDIR/data_hive/partitioning/array=[1,2,3]/float=42.42/sample.parquet') LIMIT 1;
- SELECT count(*) FROM file('$CURDIR/data_hive/partitioning/number=42/date=2020-01-01/sample.parquet') WHERE _number = 42;
+ SELECT number, date FROM file('$CURDIR/data_hive/partitioning/number=42/date=2020-01-01/sample.parquet') LIMIT 1;
+ SELECT array, float FROM file('$CURDIR/data_hive/partitioning/array=[1,2,3]/float=42.42/sample.parquet') LIMIT 1;
+ SELECT toTypeName(array), toTypeName(float) FROM file('$CURDIR/data_hive/partitioning/array=[1,2,3]/float=42.42/sample.parquet') LIMIT 1;
+ SELECT count(*) FROM file('$CURDIR/data_hive/partitioning/number=42/date=2020-01-01/sample.parquet') WHERE number = 42;
"""
$CLICKHOUSE_LOCAL -n -q """
set use_hive_partitioning = 1;
- SELECT _identifier FROM file('$CURDIR/data_hive/partitioning/identifier=*/email.csv') LIMIT 2;
- SELECT __identifier FROM file('$CURDIR/data_hive/partitioning/identifier=*/email.csv') LIMIT 2;
+ SELECT identifier FROM file('$CURDIR/data_hive/partitioning/identifier=*/email.csv') LIMIT 2;
+ SELECT a FROM file('$CURDIR/data_hive/partitioning/a=b/a=b/sample.parquet') LIMIT 1;
"""
$CLICKHOUSE_LOCAL -n -q """
set use_hive_partitioning = 1;
- SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column0=Elizabeth/sample.parquet') LIMIT 10;
+ SELECT *, column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column0=Elizabeth1/sample.parquet') LIMIT 10;
""" 2>&1 | grep -c "INCORRECT_DATA"
$CLICKHOUSE_LOCAL -n -q """
set use_hive_partitioning = 0;
- SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10;
+ SELECT *, non_existing_column FROM file('$CURDIR/data_hive/partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10;
""" 2>&1 | grep -c "UNKNOWN_IDENTIFIER"
@@ -62,23 +48,9 @@ $CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE URL PARTITIONING'"
$CLICKHOUSE_LOCAL -n -q """
set use_hive_partitioning = 1;
- SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10;
+ SELECT *, column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10;
- SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') WHERE column0 = _column0;
- SELECT *, _column0, _column1 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1;
- SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1;
- SELECT *, _column0, _column1 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
- SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
- SELECT *, _column0, _column1 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1;
- SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1;
- SELECT *, _column0, _column1 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
- SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
- SELECT *, _non_existing_column FROM url('http://localhost:11111/test/hive_partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10;"""
+ SELECT *, non_existing_column FROM url('http://localhost:11111/test/hive_partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10;"""
$CLICKHOUSE_LOCAL -n -q """
set use_hive_partitioning = 0;
@@ -93,24 +65,10 @@ $CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE S3 PARTITIONING'"
$CLICKHOUSE_CLIENT -n -q """
set use_hive_partitioning = 1;
- SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10;
+ SELECT *, column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10;
- SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') WHERE column0 = _column0;
- SELECT *, _column0, _column1 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1;
- SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1;
- SELECT *, _column0, _column1 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
- SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
- SELECT *, _column0, _column1 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1;
- SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1;
- SELECT *, _column0, _column1 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
- SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
- SELECT *, _non_existing_column FROM s3('http://localhost:11111/test/hive_partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10;
- SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=*/sample.parquet') WHERE column0 = _column0;
+ SELECT *, non_existing_column FROM s3('http://localhost:11111/test/hive_partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10;
+ SELECT *, column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' LIMIT 10;
"""
$CLICKHOUSE_CLIENT -n -q """
@@ -124,13 +82,7 @@ $CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE S3CLUSTER PARTITIONING'"
$CLICKHOUSE_CLIENT -n -q """
set use_hive_partitioning = 1;
- SELECT *, _column0 FROM s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10;
+ SELECT *, column0 FROM s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10;
- SELECT *, _column0 FROM s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') WHERE column0 = _column0;
- SELECT *, _column0, _column1 FROM s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1;
- SELECT *, _column0 FROM s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1;
- SELECT *, _column0, _column1 FROM s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
- SELECT *, _column0 FROM s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
+ SELECT *, column0 FROM s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') WHERE column0 = 'Elizabeth' LIMIT 10;
"""