mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-19 16:20:50 +00:00
Merge pull request #68606 from yariks5s/remove_initial_underscores_hive
Prioritizing of virtual columns in hive partitioning
This commit is contained in:
commit
55116da80d
@ -444,8 +444,8 @@ StorageHive::StorageHive(
|
||||
storage_metadata.setComment(comment_);
|
||||
storage_metadata.partition_key = KeyDescription::getKeyFromAST(partition_by_ast, storage_metadata.columns, getContext());
|
||||
|
||||
setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns, getContext()));
|
||||
setInMemoryMetadata(storage_metadata);
|
||||
setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), getContext()));
|
||||
}
|
||||
|
||||
void StorageHive::lazyInitialize()
|
||||
|
@ -94,7 +94,7 @@ StorageObjectStorage::StorageObjectStorage(
|
||||
if (sample_path.empty() && context->getSettingsRef().use_hive_partitioning)
|
||||
sample_path = getPathSample(metadata, context);
|
||||
|
||||
setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns(), context, sample_path, format_settings));
|
||||
setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.columns, context, sample_path, format_settings));
|
||||
setInMemoryMetadata(metadata);
|
||||
}
|
||||
|
||||
|
@ -68,7 +68,7 @@ StorageObjectStorageCluster::StorageObjectStorageCluster(
|
||||
if (sample_path.empty() && context_->getSettingsRef().use_hive_partitioning)
|
||||
sample_path = getPathSample(metadata, context_);
|
||||
|
||||
setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns(), context_, sample_path));
|
||||
setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.columns, context_, sample_path));
|
||||
setInMemoryMetadata(metadata);
|
||||
}
|
||||
|
||||
|
@ -208,7 +208,7 @@ Chunk StorageObjectStorageSource::generate()
|
||||
.filename = &filename,
|
||||
.last_modified = object_info->metadata->last_modified,
|
||||
.etag = &(object_info->metadata->etag)
|
||||
}, getContext(), read_from_format_info.columns_description);
|
||||
}, getContext());
|
||||
|
||||
const auto & partition_columns = configuration->getPartitionColumns();
|
||||
if (!partition_columns.empty() && chunk_size && chunk.hasColumns())
|
||||
@ -280,7 +280,7 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade
|
||||
const std::shared_ptr<IIterator> & file_iterator,
|
||||
const ConfigurationPtr & configuration,
|
||||
const ObjectStoragePtr & object_storage,
|
||||
const ReadFromFormatInfo & read_from_format_info,
|
||||
ReadFromFormatInfo & read_from_format_info,
|
||||
const std::optional<FormatSettings> & format_settings,
|
||||
const std::shared_ptr<const KeyCondition> & key_condition_,
|
||||
const ContextPtr & context_,
|
||||
|
@ -74,7 +74,7 @@ protected:
|
||||
const UInt64 max_block_size;
|
||||
const bool need_only_count;
|
||||
const size_t max_parsing_threads;
|
||||
const ReadFromFormatInfo read_from_format_info;
|
||||
ReadFromFormatInfo read_from_format_info;
|
||||
const std::shared_ptr<ThreadPool> create_reader_pool;
|
||||
|
||||
std::shared_ptr<IIterator> file_iterator;
|
||||
@ -122,7 +122,7 @@ protected:
|
||||
const std::shared_ptr<IIterator> & file_iterator,
|
||||
const ConfigurationPtr & configuration,
|
||||
const ObjectStoragePtr & object_storage,
|
||||
const ReadFromFormatInfo & read_from_format_info,
|
||||
ReadFromFormatInfo & read_from_format_info,
|
||||
const std::optional<FormatSettings> & format_settings,
|
||||
const std::shared_ptr<const KeyCondition> & key_condition_,
|
||||
const ContextPtr & context_,
|
||||
|
@ -524,7 +524,7 @@ Chunk ObjectStorageQueueSource::generateImpl()
|
||||
{
|
||||
.path = path,
|
||||
.size = reader.getObjectInfo()->metadata->size_bytes
|
||||
}, getContext(), read_from_format_info.columns_description);
|
||||
}, getContext());
|
||||
|
||||
return chunk;
|
||||
}
|
||||
|
@ -128,7 +128,7 @@ private:
|
||||
const std::shared_ptr<FileIterator> file_iterator;
|
||||
const ConfigurationPtr configuration;
|
||||
const ObjectStoragePtr object_storage;
|
||||
const ReadFromFormatInfo read_from_format_info;
|
||||
ReadFromFormatInfo read_from_format_info;
|
||||
const std::optional<FormatSettings> format_settings;
|
||||
const ObjectStorageQueueSettings queue_settings;
|
||||
const std::shared_ptr<ObjectStorageQueueMetadata> files_metadata;
|
||||
|
@ -169,7 +169,7 @@ StorageObjectStorageQueue::StorageObjectStorageQueue(
|
||||
storage_metadata.setColumns(columns);
|
||||
storage_metadata.setConstraints(constraints_);
|
||||
storage_metadata.setComment(comment);
|
||||
setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), context_));
|
||||
setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns, context_));
|
||||
setInMemoryMetadata(storage_metadata);
|
||||
|
||||
LOG_INFO(log, "Using zookeeper path: {}", zk_path.string());
|
||||
|
@ -1112,9 +1112,9 @@ void StorageFile::setStorageMetadata(CommonArguments args)
|
||||
|
||||
storage_metadata.setConstraints(args.constraints);
|
||||
storage_metadata.setComment(args.comment);
|
||||
setInMemoryMetadata(storage_metadata);
|
||||
|
||||
setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), args.getContext(), paths.empty() ? "" : paths[0], format_settings));
|
||||
setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns, args.getContext(), paths.empty() ? "" : paths[0], format_settings));
|
||||
setInMemoryMetadata(storage_metadata);
|
||||
}
|
||||
|
||||
|
||||
@ -1468,7 +1468,7 @@ Chunk StorageFileSource::generate()
|
||||
.size = current_file_size,
|
||||
.filename = (filename_override.has_value() ? &filename_override.value() : nullptr),
|
||||
.last_modified = current_file_last_modified
|
||||
}, getContext(), columns_description);
|
||||
}, getContext());
|
||||
|
||||
return chunk;
|
||||
}
|
||||
|
@ -60,8 +60,8 @@ StorageFileCluster::StorageFileCluster(
|
||||
}
|
||||
|
||||
storage_metadata.setConstraints(constraints_);
|
||||
setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns, context, paths.empty() ? "" : paths[0]));
|
||||
setInMemoryMetadata(storage_metadata);
|
||||
setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), context, paths.empty() ? "" : paths[0]));
|
||||
}
|
||||
|
||||
void StorageFileCluster::updateQueryToSendIfNeeded(DB::ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const DB::ContextPtr & context)
|
||||
|
@ -165,9 +165,9 @@ IStorageURLBase::IStorageURLBase(
|
||||
|
||||
storage_metadata.setConstraints(constraints_);
|
||||
storage_metadata.setComment(comment);
|
||||
setInMemoryMetadata(storage_metadata);
|
||||
|
||||
setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), context_, getSampleURI(uri, context_), format_settings));
|
||||
setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns, context_, getSampleURI(uri, context_), format_settings));
|
||||
setInMemoryMetadata(storage_metadata);
|
||||
}
|
||||
|
||||
|
||||
@ -435,7 +435,7 @@ Chunk StorageURLSource::generate()
|
||||
{
|
||||
.path = curr_uri.getPath(),
|
||||
.size = current_file_size,
|
||||
}, getContext(), columns_description);
|
||||
}, getContext());
|
||||
return chunk;
|
||||
}
|
||||
|
||||
|
@ -75,8 +75,8 @@ StorageURLCluster::StorageURLCluster(
|
||||
}
|
||||
|
||||
storage_metadata.setConstraints(constraints_);
|
||||
setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns, context, getSampleURI(uri, context)));
|
||||
setInMemoryMetadata(storage_metadata);
|
||||
setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), context, getSampleURI(uri, context)));
|
||||
}
|
||||
|
||||
void StorageURLCluster::updateQueryToSendIfNeeded(ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context)
|
||||
|
@ -129,36 +129,45 @@ NameSet getVirtualNamesForFileLikeStorage()
|
||||
return {"_path", "_file", "_size", "_time", "_etag"};
|
||||
}
|
||||
|
||||
std::unordered_map<std::string, std::string> parseHivePartitioningKeysAndValues(const String & path, const ColumnsDescription & storage_columns)
|
||||
std::unordered_map<std::string, std::string> parseHivePartitioningKeysAndValues(const String & path)
|
||||
{
|
||||
std::string pattern = "([^/]+)=([^/]+)/";
|
||||
re2::StringPiece input_piece(path);
|
||||
|
||||
std::unordered_map<std::string, std::string> key_values;
|
||||
std::string key, value;
|
||||
std::unordered_set<String> used_keys;
|
||||
std::unordered_map<std::string, std::string> used_keys;
|
||||
while (RE2::FindAndConsume(&input_piece, pattern, &key, &value))
|
||||
{
|
||||
if (used_keys.contains(key))
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "Path '{}' to file with enabled hive-style partitioning contains duplicated partition key {}, only unique keys are allowed", path, key);
|
||||
used_keys.insert(key);
|
||||
auto it = used_keys.find(key);
|
||||
if (it != used_keys.end() && it->second != value)
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "Path '{}' to file with enabled hive-style partitioning contains duplicated partition key {} with different values, only unique keys are allowed", path, key);
|
||||
used_keys.insert({key, value});
|
||||
|
||||
auto col_name = "_" + key;
|
||||
while (storage_columns.has(col_name))
|
||||
col_name = "_" + col_name;
|
||||
auto col_name = key;
|
||||
key_values[col_name] = value;
|
||||
}
|
||||
return key_values;
|
||||
}
|
||||
|
||||
VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription & storage_columns, const ContextPtr & context, const std::string & path, std::optional<FormatSettings> format_settings_)
|
||||
VirtualColumnsDescription getVirtualsForFileLikeStorage(ColumnsDescription & storage_columns, const ContextPtr & context, const std::string & path, std::optional<FormatSettings> format_settings_)
|
||||
{
|
||||
VirtualColumnsDescription desc;
|
||||
|
||||
auto add_virtual = [&](const auto & name, const auto & type)
|
||||
{
|
||||
if (storage_columns.has(name))
|
||||
{
|
||||
if (!context->getSettingsRef().use_hive_partitioning)
|
||||
return;
|
||||
|
||||
if (storage_columns.size() == 1)
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot use hive partitioning for file {}: it contains only partition columns. Disable use_hive_partitioning setting to read this file", path);
|
||||
auto local_type = storage_columns.get(name).type;
|
||||
storage_columns.remove(name);
|
||||
desc.addEphemeral(name, local_type, "");
|
||||
return;
|
||||
}
|
||||
|
||||
desc.addEphemeral(name, type, "");
|
||||
};
|
||||
@ -171,7 +180,7 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription
|
||||
|
||||
if (context->getSettingsRef().use_hive_partitioning)
|
||||
{
|
||||
auto map = parseHivePartitioningKeysAndValues(path, storage_columns);
|
||||
auto map = parseHivePartitioningKeysAndValues(path);
|
||||
auto format_settings = format_settings_ ? *format_settings_ : getFormatSettings(context);
|
||||
for (auto & item : map)
|
||||
{
|
||||
@ -244,11 +253,11 @@ ColumnPtr getFilterByPathAndFileIndexes(const std::vector<String> & paths, const
|
||||
|
||||
void addRequestedFileLikeStorageVirtualsToChunk(
|
||||
Chunk & chunk, const NamesAndTypesList & requested_virtual_columns,
|
||||
VirtualsForFileLikeStorage virtual_values, ContextPtr context, const ColumnsDescription & columns)
|
||||
VirtualsForFileLikeStorage virtual_values, ContextPtr context)
|
||||
{
|
||||
std::unordered_map<std::string, std::string> hive_map;
|
||||
if (context->getSettingsRef().use_hive_partitioning)
|
||||
hive_map = parseHivePartitioningKeysAndValues(virtual_values.path, columns);
|
||||
hive_map = parseHivePartitioningKeysAndValues(virtual_values.path);
|
||||
|
||||
for (const auto & virtual_column : requested_virtual_columns)
|
||||
{
|
||||
|
@ -70,7 +70,7 @@ auto extractSingleValueFromBlock(const Block & block, const String & name)
|
||||
|
||||
NameSet getVirtualNamesForFileLikeStorage();
|
||||
VirtualColumnsDescription getVirtualsForFileLikeStorage(
|
||||
const ColumnsDescription & storage_columns,
|
||||
ColumnsDescription & storage_columns,
|
||||
const ContextPtr & context,
|
||||
const std::string & sample_path = "",
|
||||
std::optional<FormatSettings> format_settings_ = std::nullopt);
|
||||
@ -105,7 +105,7 @@ struct VirtualsForFileLikeStorage
|
||||
|
||||
void addRequestedFileLikeStorageVirtualsToChunk(
|
||||
Chunk & chunk, const NamesAndTypesList & requested_virtual_columns,
|
||||
VirtualsForFileLikeStorage virtual_values, ContextPtr context, const ColumnsDescription & columns);
|
||||
VirtualsForFileLikeStorage virtual_values, ContextPtr context);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1513,19 +1513,19 @@ def test_hive_partitioning_with_one_parameter(cluster):
|
||||
azure_query(
|
||||
node,
|
||||
f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}',"
|
||||
f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values}",
|
||||
f" container='cont', blob_path='{path}', format='CSVWithNames', compression='auto', structure='{table_format}') VALUES {values}",
|
||||
settings={"azure_truncate_on_insert": 1},
|
||||
)
|
||||
|
||||
query = (
|
||||
f"SELECT column1, column2, _file, _path, _column1 FROM azureBlobStorage(azure_conf2, "
|
||||
f"SELECT column2, _file, _path, column1 FROM azureBlobStorage(azure_conf2, "
|
||||
f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', "
|
||||
f"blob_path='{path}', format='CSV', structure='{table_format}')"
|
||||
f"blob_path='{path}', format='CSVWithNames', structure='{table_format}')"
|
||||
)
|
||||
assert azure_query(
|
||||
node, query, settings={"use_hive_partitioning": 1}
|
||||
).splitlines() == [
|
||||
"Elizabeth\tGordon\tsample.csv\t{bucket}/{max_path}\tElizabeth".format(
|
||||
"Gordon\tsample.csv\t{bucket}/{max_path}\tElizabeth".format(
|
||||
bucket="cont", max_path=path
|
||||
)
|
||||
]
|
||||
@ -1533,14 +1533,14 @@ def test_hive_partitioning_with_one_parameter(cluster):
|
||||
query = (
|
||||
f"SELECT column2 FROM azureBlobStorage(azure_conf2, "
|
||||
f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', "
|
||||
f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column1=_column1;"
|
||||
f"blob_path='{path}', format='CSVWithNames', structure='{table_format}');"
|
||||
)
|
||||
assert azure_query(
|
||||
node, query, settings={"use_hive_partitioning": 1}
|
||||
).splitlines() == ["Gordon"]
|
||||
|
||||
|
||||
def test_hive_partitioning_with_two_parameters(cluster):
|
||||
def test_hive_partitioning_with_all_parameters(cluster):
|
||||
# type: (ClickHouseCluster) -> None
|
||||
node = cluster.instances["node"] # type: ClickHouseInstance
|
||||
table_format = "column1 String, column2 String"
|
||||
@ -1551,40 +1551,19 @@ def test_hive_partitioning_with_two_parameters(cluster):
|
||||
azure_query(
|
||||
node,
|
||||
f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}',"
|
||||
f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values_1}, {values_2}",
|
||||
f" container='cont', blob_path='{path}', format='CSVWithNames', compression='auto', structure='{table_format}') VALUES {values_1}, {values_2}",
|
||||
settings={"azure_truncate_on_insert": 1},
|
||||
)
|
||||
|
||||
query = (
|
||||
f"SELECT column1, column2, _file, _path, _column1, _column2 FROM azureBlobStorage(azure_conf2, "
|
||||
f"SELECT column1, column2, _file, _path FROM azureBlobStorage(azure_conf2, "
|
||||
f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', "
|
||||
f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column1=_column1;"
|
||||
f"blob_path='{path}', format='CSVWithNames', structure='{table_format}');"
|
||||
)
|
||||
assert azure_query(
|
||||
node, query, settings={"use_hive_partitioning": 1}
|
||||
).splitlines() == [
|
||||
"Elizabeth\tGordon\tsample.csv\t{bucket}/{max_path}\tElizabeth\tGordon".format(
|
||||
bucket="cont", max_path=path
|
||||
)
|
||||
]
|
||||
pattern = r"DB::Exception: Cannot use hive partitioning for file"
|
||||
|
||||
query = (
|
||||
f"SELECT column1 FROM azureBlobStorage(azure_conf2, "
|
||||
f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', "
|
||||
f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column2=_column2;"
|
||||
)
|
||||
assert azure_query(
|
||||
node, query, settings={"use_hive_partitioning": 1}
|
||||
).splitlines() == ["Elizabeth"]
|
||||
|
||||
query = (
|
||||
f"SELECT column1 FROM azureBlobStorage(azure_conf2, "
|
||||
f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', "
|
||||
f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column2=_column2 AND column1=_column1;"
|
||||
)
|
||||
assert azure_query(
|
||||
node, query, settings={"use_hive_partitioning": 1}
|
||||
).splitlines() == ["Elizabeth"]
|
||||
with pytest.raises(Exception, match=pattern):
|
||||
azure_query(node, query, settings={"use_hive_partitioning": 1})
|
||||
|
||||
|
||||
def test_hive_partitioning_without_setting(cluster):
|
||||
@ -1593,19 +1572,19 @@ def test_hive_partitioning_without_setting(cluster):
|
||||
table_format = "column1 String, column2 String"
|
||||
values_1 = f"('Elizabeth', 'Gordon')"
|
||||
values_2 = f"('Emilia', 'Gregor')"
|
||||
path = "a/column1=Elizabeth/column2=Gordon/sample.csv"
|
||||
path = "a/column1=Elizabeth/column2=Gordon/column3=Gordon/sample.csv"
|
||||
|
||||
azure_query(
|
||||
node,
|
||||
f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}',"
|
||||
f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values_1}, {values_2}",
|
||||
f" container='cont', blob_path='{path}', format='CSVWithNames', compression='auto', structure='{table_format}') VALUES {values_1}, {values_2}",
|
||||
settings={"azure_truncate_on_insert": 1},
|
||||
)
|
||||
|
||||
query = (
|
||||
f"SELECT column1, column2, _file, _path, _column1, _column2 FROM azureBlobStorage(azure_conf2, "
|
||||
f"SELECT column1, column2, _file, _path, column3 FROM azureBlobStorage(azure_conf2, "
|
||||
f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', "
|
||||
f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column1=_column1;"
|
||||
f"blob_path='{path}', format='CSVWithNames', structure='{table_format}');"
|
||||
)
|
||||
pattern = re.compile(
|
||||
r"DB::Exception: Unknown expression identifier '.*' in scope.*", re.DOTALL
|
||||
|
@ -1259,33 +1259,21 @@ def test_respect_object_existence_on_partitioned_write(started_cluster):
|
||||
|
||||
def test_hive_partitioning_with_one_parameter(started_cluster):
|
||||
hdfs_api = started_cluster.hdfs_api
|
||||
hdfs_api.write_data(f"/column0=Elizabeth/parquet_1", f"Elizabeth\tGordon\n")
|
||||
assert hdfs_api.read_data(f"/column0=Elizabeth/parquet_1") == f"Elizabeth\tGordon\n"
|
||||
hdfs_api.write_data(
|
||||
f"/column0=Elizabeth/file_1", f"column0,column1\nElizabeth,Gordon\n"
|
||||
)
|
||||
assert (
|
||||
hdfs_api.read_data(f"/column0=Elizabeth/file_1")
|
||||
== f"column0,column1\nElizabeth,Gordon\n"
|
||||
)
|
||||
|
||||
r = node1.query(
|
||||
"SELECT _column0 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/parquet_1', 'TSV')",
|
||||
"SELECT column0 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/file_1', 'CSVWithNames')",
|
||||
settings={"use_hive_partitioning": 1},
|
||||
)
|
||||
assert r == f"Elizabeth\n"
|
||||
|
||||
|
||||
def test_hive_partitioning_with_two_parameters(started_cluster):
|
||||
hdfs_api = started_cluster.hdfs_api
|
||||
hdfs_api.write_data(
|
||||
f"/column0=Elizabeth/column1=Gordon/parquet_2", f"Elizabeth\tGordon\n"
|
||||
)
|
||||
assert (
|
||||
hdfs_api.read_data(f"/column0=Elizabeth/column1=Gordon/parquet_2")
|
||||
== f"Elizabeth\tGordon\n"
|
||||
)
|
||||
|
||||
r = node1.query(
|
||||
"SELECT _column1 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/column1=Gordon/parquet_2', 'TSV');",
|
||||
settings={"use_hive_partitioning": 1},
|
||||
)
|
||||
assert r == f"Gordon\n"
|
||||
|
||||
|
||||
def test_hive_partitioning_without_setting(started_cluster):
|
||||
hdfs_api = started_cluster.hdfs_api
|
||||
hdfs_api.write_data(
|
||||
@ -1301,7 +1289,7 @@ def test_hive_partitioning_without_setting(started_cluster):
|
||||
|
||||
with pytest.raises(QueryRuntimeException, match=pattern):
|
||||
node1.query(
|
||||
f"SELECT _column1 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/column1=Gordon/parquet_2', 'TSV');",
|
||||
f"SELECT column1 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/column1=Gordon/parquet_2', 'TSV');",
|
||||
settings={"use_hive_partitioning": 0},
|
||||
)
|
||||
|
||||
|
@ -1,4 +1,14 @@
|
||||
TESTING THE FILE HIVE PARTITIONING
|
||||
last Elizabeth
|
||||
Frank Elizabeth
|
||||
Moreno Elizabeth
|
||||
Guzman Elizabeth
|
||||
Stephens Elizabeth
|
||||
Franklin Elizabeth
|
||||
Gibson Elizabeth
|
||||
Greer Elizabeth
|
||||
Delgado Elizabeth
|
||||
Cross Elizabeth
|
||||
first last Elizabeth
|
||||
Jorge Frank Elizabeth
|
||||
Hunter Moreno Elizabeth
|
||||
@ -9,56 +19,36 @@ Stanley Gibson Elizabeth
|
||||
Eugenia Greer Elizabeth
|
||||
Jeffery Delgado Elizabeth
|
||||
Clara Cross Elizabeth
|
||||
Elizabeth Gordon Elizabeth
|
||||
Eva Schmidt Elizabeth Schmidt
|
||||
Samuel Schmidt Elizabeth Schmidt
|
||||
Eva Schmidt Elizabeth
|
||||
Samuel Schmidt Elizabeth
|
||||
Elizabeth Gordon Elizabeth Gordon
|
||||
Elizabeth Gordon Elizabeth
|
||||
Elizabeth Gordon Elizabeth Gordon
|
||||
Elizabeth Gordon Elizabeth
|
||||
first last Elizabeth
|
||||
Jorge Frank Elizabeth
|
||||
Hunter Moreno Elizabeth
|
||||
Esther Guzman Elizabeth
|
||||
Dennis Stephens Elizabeth
|
||||
Nettie Franklin Elizabeth
|
||||
Stanley Gibson Elizabeth
|
||||
Eugenia Greer Elizabeth
|
||||
Jeffery Delgado Elizabeth
|
||||
Clara Cross Elizabeth
|
||||
Elizabeth Gordon Elizabeth
|
||||
last Elizabeth
|
||||
Frank Elizabeth
|
||||
Moreno Elizabeth
|
||||
Guzman Elizabeth
|
||||
Stephens Elizabeth
|
||||
Franklin Elizabeth
|
||||
Gibson Elizabeth
|
||||
Greer Elizabeth
|
||||
Delgado Elizabeth
|
||||
Cross Elizabeth
|
||||
42 2020-01-01
|
||||
[1,2,3] 42.42
|
||||
Array(Int64) LowCardinality(Float64)
|
||||
101
|
||||
2070
|
||||
4081
|
||||
2070
|
||||
2070
|
||||
b
|
||||
1
|
||||
1
|
||||
TESTING THE URL PARTITIONING
|
||||
first last Elizabeth
|
||||
Jorge Frank Elizabeth
|
||||
Hunter Moreno Elizabeth
|
||||
Esther Guzman Elizabeth
|
||||
Dennis Stephens Elizabeth
|
||||
Nettie Franklin Elizabeth
|
||||
Stanley Gibson Elizabeth
|
||||
Eugenia Greer Elizabeth
|
||||
Jeffery Delgado Elizabeth
|
||||
Clara Cross Elizabeth
|
||||
Elizabeth Gordon Elizabeth
|
||||
Eva Schmidt Elizabeth Schmidt
|
||||
Samuel Schmidt Elizabeth Schmidt
|
||||
Eva Schmidt Elizabeth
|
||||
Samuel Schmidt Elizabeth
|
||||
Elizabeth Gordon Elizabeth Gordon
|
||||
Elizabeth Gordon Elizabeth
|
||||
Elizabeth Gordon Elizabeth Gordon
|
||||
Elizabeth Gordon Elizabeth
|
||||
last Elizabeth
|
||||
Frank Elizabeth
|
||||
Moreno Elizabeth
|
||||
Guzman Elizabeth
|
||||
Stephens Elizabeth
|
||||
Franklin Elizabeth
|
||||
Gibson Elizabeth
|
||||
Greer Elizabeth
|
||||
Delgado Elizabeth
|
||||
Cross Elizabeth
|
||||
first last Elizabeth
|
||||
Jorge Frank Elizabeth
|
||||
Hunter Moreno Elizabeth
|
||||
@ -71,6 +61,16 @@ Jeffery Delgado Elizabeth
|
||||
Clara Cross Elizabeth
|
||||
1
|
||||
TESTING THE S3 PARTITIONING
|
||||
last Elizabeth
|
||||
Frank Elizabeth
|
||||
Moreno Elizabeth
|
||||
Guzman Elizabeth
|
||||
Stephens Elizabeth
|
||||
Franklin Elizabeth
|
||||
Gibson Elizabeth
|
||||
Greer Elizabeth
|
||||
Delgado Elizabeth
|
||||
Cross Elizabeth
|
||||
first last Elizabeth
|
||||
Jorge Frank Elizabeth
|
||||
Hunter Moreno Elizabeth
|
||||
@ -81,40 +81,35 @@ Stanley Gibson Elizabeth
|
||||
Eugenia Greer Elizabeth
|
||||
Jeffery Delgado Elizabeth
|
||||
Clara Cross Elizabeth
|
||||
Elizabeth Gordon Elizabeth
|
||||
Eva Schmidt Elizabeth Schmidt
|
||||
Samuel Schmidt Elizabeth Schmidt
|
||||
Eva Schmidt Elizabeth
|
||||
Samuel Schmidt Elizabeth
|
||||
Elizabeth Gordon Elizabeth Gordon
|
||||
Elizabeth Gordon Elizabeth
|
||||
Elizabeth Gordon Elizabeth Gordon
|
||||
Elizabeth Gordon Elizabeth
|
||||
first last Elizabeth
|
||||
Jorge Frank Elizabeth
|
||||
Hunter Moreno Elizabeth
|
||||
Esther Guzman Elizabeth
|
||||
Dennis Stephens Elizabeth
|
||||
Nettie Franklin Elizabeth
|
||||
Stanley Gibson Elizabeth
|
||||
Eugenia Greer Elizabeth
|
||||
Jeffery Delgado Elizabeth
|
||||
Clara Cross Elizabeth
|
||||
Elizabeth Gordon Elizabeth
|
||||
last Elizabeth
|
||||
Frank Elizabeth
|
||||
Moreno Elizabeth
|
||||
Guzman Elizabeth
|
||||
Stephens Elizabeth
|
||||
Franklin Elizabeth
|
||||
Gibson Elizabeth
|
||||
Greer Elizabeth
|
||||
Delgado Elizabeth
|
||||
Cross Elizabeth
|
||||
OK
|
||||
TESTING THE S3CLUSTER PARTITIONING
|
||||
first last Elizabeth
|
||||
Jorge Frank Elizabeth
|
||||
Hunter Moreno Elizabeth
|
||||
Esther Guzman Elizabeth
|
||||
Dennis Stephens Elizabeth
|
||||
Nettie Franklin Elizabeth
|
||||
Stanley Gibson Elizabeth
|
||||
Eugenia Greer Elizabeth
|
||||
Jeffery Delgado Elizabeth
|
||||
Clara Cross Elizabeth
|
||||
Elizabeth Gordon Elizabeth
|
||||
Eva Schmidt Elizabeth Schmidt
|
||||
Samuel Schmidt Elizabeth Schmidt
|
||||
Eva Schmidt Elizabeth
|
||||
Samuel Schmidt Elizabeth
|
||||
last Elizabeth
|
||||
Frank Elizabeth
|
||||
Moreno Elizabeth
|
||||
Guzman Elizabeth
|
||||
Stephens Elizabeth
|
||||
Franklin Elizabeth
|
||||
Gibson Elizabeth
|
||||
Greer Elizabeth
|
||||
Delgado Elizabeth
|
||||
Cross Elizabeth
|
||||
last Elizabeth
|
||||
Frank Elizabeth
|
||||
Moreno Elizabeth
|
||||
Guzman Elizabeth
|
||||
Stephens Elizabeth
|
||||
Franklin Elizabeth
|
||||
Gibson Elizabeth
|
||||
Greer Elizabeth
|
||||
Delgado Elizabeth
|
||||
Cross Elizabeth
|
||||
|
@ -11,48 +11,34 @@ $CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE FILE HIVE PARTITIONING'"
|
||||
$CLICKHOUSE_LOCAL -n -q """
|
||||
set use_hive_partitioning = 1;
|
||||
|
||||
SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10;
|
||||
SELECT *, column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10;
|
||||
|
||||
SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/sample.parquet') WHERE column0 = _column0;
|
||||
SELECT *, non_existing_column FROM file('$CURDIR/data_hive/partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10;
|
||||
SELECT *, column0 FROM file('$CURDIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' LIMIT 10;
|
||||
|
||||
SELECT *, _column0, _column1 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1;
|
||||
SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1;
|
||||
|
||||
SELECT *, _column0, _column1 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
|
||||
SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
|
||||
|
||||
SELECT *, _column0, _column1 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1;
|
||||
SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1;
|
||||
|
||||
SELECT *, _column0, _column1 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
|
||||
SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
|
||||
|
||||
SELECT *, _non_existing_column FROM file('$CURDIR/data_hive/partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10;
|
||||
SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = _column0;
|
||||
|
||||
SELECT _number, _date FROM file('$CURDIR/data_hive/partitioning/number=42/date=2020-01-01/sample.parquet') LIMIT 1;
|
||||
SELECT _array, _float FROM file('$CURDIR/data_hive/partitioning/array=[1,2,3]/float=42.42/sample.parquet') LIMIT 1;
|
||||
SELECT toTypeName(_array), toTypeName(_float) FROM file('$CURDIR/data_hive/partitioning/array=[1,2,3]/float=42.42/sample.parquet') LIMIT 1;
|
||||
SELECT count(*) FROM file('$CURDIR/data_hive/partitioning/number=42/date=2020-01-01/sample.parquet') WHERE _number = 42;
|
||||
SELECT number, date FROM file('$CURDIR/data_hive/partitioning/number=42/date=2020-01-01/sample.parquet') LIMIT 1;
|
||||
SELECT array, float FROM file('$CURDIR/data_hive/partitioning/array=[1,2,3]/float=42.42/sample.parquet') LIMIT 1;
|
||||
SELECT toTypeName(array), toTypeName(float) FROM file('$CURDIR/data_hive/partitioning/array=[1,2,3]/float=42.42/sample.parquet') LIMIT 1;
|
||||
SELECT count(*) FROM file('$CURDIR/data_hive/partitioning/number=42/date=2020-01-01/sample.parquet') WHERE number = 42;
|
||||
"""
|
||||
|
||||
$CLICKHOUSE_LOCAL -n -q """
|
||||
set use_hive_partitioning = 1;
|
||||
|
||||
SELECT _identifier FROM file('$CURDIR/data_hive/partitioning/identifier=*/email.csv') LIMIT 2;
|
||||
SELECT __identifier FROM file('$CURDIR/data_hive/partitioning/identifier=*/email.csv') LIMIT 2;
|
||||
SELECT identifier FROM file('$CURDIR/data_hive/partitioning/identifier=*/email.csv') LIMIT 2;
|
||||
SELECT a FROM file('$CURDIR/data_hive/partitioning/a=b/a=b/sample.parquet') LIMIT 1;
|
||||
"""
|
||||
|
||||
$CLICKHOUSE_LOCAL -n -q """
|
||||
set use_hive_partitioning = 1;
|
||||
|
||||
SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column0=Elizabeth/sample.parquet') LIMIT 10;
|
||||
SELECT *, column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column0=Elizabeth1/sample.parquet') LIMIT 10;
|
||||
""" 2>&1 | grep -c "INCORRECT_DATA"
|
||||
|
||||
$CLICKHOUSE_LOCAL -n -q """
|
||||
set use_hive_partitioning = 0;
|
||||
|
||||
SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10;
|
||||
SELECT *, non_existing_column FROM file('$CURDIR/data_hive/partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10;
|
||||
""" 2>&1 | grep -c "UNKNOWN_IDENTIFIER"
|
||||
|
||||
|
||||
@ -62,23 +48,9 @@ $CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE URL PARTITIONING'"
|
||||
$CLICKHOUSE_LOCAL -n -q """
|
||||
set use_hive_partitioning = 1;
|
||||
|
||||
SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10;
|
||||
SELECT *, column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10;
|
||||
|
||||
SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') WHERE column0 = _column0;
|
||||
|
||||
SELECT *, _column0, _column1 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1;
|
||||
SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1;
|
||||
|
||||
SELECT *, _column0, _column1 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
|
||||
SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
|
||||
|
||||
SELECT *, _column0, _column1 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1;
|
||||
SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1;
|
||||
|
||||
SELECT *, _column0, _column1 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
|
||||
SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
|
||||
|
||||
SELECT *, _non_existing_column FROM url('http://localhost:11111/test/hive_partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10;"""
|
||||
SELECT *, non_existing_column FROM url('http://localhost:11111/test/hive_partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10;"""
|
||||
|
||||
$CLICKHOUSE_LOCAL -n -q """
|
||||
set use_hive_partitioning = 0;
|
||||
@ -93,24 +65,10 @@ $CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE S3 PARTITIONING'"
|
||||
$CLICKHOUSE_CLIENT -n -q """
|
||||
set use_hive_partitioning = 1;
|
||||
|
||||
SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10;
|
||||
SELECT *, column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10;
|
||||
|
||||
SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') WHERE column0 = _column0;
|
||||
|
||||
SELECT *, _column0, _column1 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1;
|
||||
SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1;
|
||||
|
||||
SELECT *, _column0, _column1 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
|
||||
SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
|
||||
|
||||
SELECT *, _column0, _column1 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1;
|
||||
SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1;
|
||||
|
||||
SELECT *, _column0, _column1 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
|
||||
SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
|
||||
|
||||
SELECT *, _non_existing_column FROM s3('http://localhost:11111/test/hive_partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10;
|
||||
SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=*/sample.parquet') WHERE column0 = _column0;
|
||||
SELECT *, non_existing_column FROM s3('http://localhost:11111/test/hive_partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10;
|
||||
SELECT *, column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' LIMIT 10;
|
||||
"""
|
||||
|
||||
$CLICKHOUSE_CLIENT -n -q """
|
||||
@ -124,13 +82,7 @@ $CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE S3CLUSTER PARTITIONING'"
|
||||
$CLICKHOUSE_CLIENT -n -q """
|
||||
set use_hive_partitioning = 1;
|
||||
|
||||
SELECT *, _column0 FROM s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10;
|
||||
SELECT *, column0 FROM s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10;
|
||||
|
||||
SELECT *, _column0 FROM s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') WHERE column0 = _column0;
|
||||
|
||||
SELECT *, _column0, _column1 FROM s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1;
|
||||
SELECT *, _column0 FROM s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1;
|
||||
|
||||
SELECT *, _column0, _column1 FROM s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
|
||||
SELECT *, _column0 FROM s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
|
||||
SELECT *, column0 FROM s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') WHERE column0 = 'Elizabeth' LIMIT 10;
|
||||
"""
|
||||
|
Binary file not shown.
Loading…
Reference in New Issue
Block a user