From 29b7bf64d43828c51a6c947f08e66697921ddc86 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Wed, 21 Feb 2024 20:04:12 +0100 Subject: [PATCH 01/14] Fix issues with endpoint and prefix --- .../AzureBlobStorage/AzureBlobStorageAuth.cpp | 92 +++++++++++++++---- .../test.py | 26 ++++++ 2 files changed, 101 insertions(+), 17 deletions(-) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp index 72c4abee5c9..ef24bdcf951 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp @@ -23,8 +23,32 @@ namespace ErrorCodes struct AzureBlobStorageEndpoint { const String storage_account_url; + const String account_name; const String container_name; + const String prefix; const std::optional container_already_exists; + + String getEndpoint() + { + String url = storage_account_url; + + if (!account_name.empty()) + url += "/" + account_name; + + url += "/" + container_name + "/" + prefix; + + return url; + } + + String getEndpointWithoutContainer() + { + String url = storage_account_url; + + if (!account_name.empty()) + url += "/" + account_name; + + return url; + } }; @@ -58,28 +82,64 @@ void validateContainerName(const String & container_name) AzureBlobStorageEndpoint processAzureBlobStorageEndpoint(const Poco::Util::AbstractConfiguration & config, const String & config_prefix) { - std::string storage_url; - if (config.has(config_prefix + ".storage_account_url")) + String storage_url; + String account_name; + String container_name; + String prefix; + if (config.has(config_prefix + ".endpoint")) + { + String endpoint = config.getString(config_prefix + ".endpoint"); + + /// For authenitication methods other than managed identity (Eg: SAS, Workload Identity), + /// account name is not present in the endpoint + /// 'endpoint_contains_account_name' bool is used to understand how to split the endpoint (default : true) + + bool endpoint_contains_account_name = config.getBool(config_prefix + ".endpoint_contains_account_name", true); + + size_t pos = endpoint.find("//"); + assert (pos != std::string::npos); + + if (endpoint_contains_account_name) + { + size_t pos_begin = endpoint.find('/', pos+2); + size_t pos_end = endpoint.find('/',pos_begin+1); + account_name = endpoint.substr(pos_begin+1,(pos_end-pos_begin)-1); + + size_t cont_pos_end = endpoint.find('/', pos_end+1); + container_name = endpoint.substr(pos_end+1,(cont_pos_end-pos_end)-1); + prefix = endpoint.substr(cont_pos_end+1); + storage_url = endpoint.substr(0,pos_begin); + } + else + { + size_t pos_begin = endpoint.find('/', pos+2); + size_t pos_end = endpoint.find('/',pos_begin+1); + container_name = endpoint.substr(pos_begin+1,(pos_end-pos_begin)-1); + + container_name = endpoint.substr(pos_end+1); + storage_url = endpoint.substr(0,pos_begin); + } + } + else if (config.has(config_prefix + ".connection_string")) + { + storage_url = config.getString(config_prefix + ".connection_string"); + container_name = config.getString(config_prefix + ".container_name"); + } + else if (config.has(config_prefix + ".storage_account_url")) { storage_url = config.getString(config_prefix + ".storage_account_url"); validateStorageAccountUrl(storage_url); + container_name = config.getString(config_prefix + ".container_name"); } else - { - if (config.has(config_prefix + ".connection_string")) - storage_url = config.getString(config_prefix + ".connection_string"); - else if (config.has(config_prefix + ".endpoint")) - storage_url = config.getString(config_prefix + ".endpoint"); - else - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected either `connection_string` or `endpoint` in config"); - } + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected either `storage_account_url` or `connection_string` or `endpoint` in config"); - String container_name = config.getString(config_prefix + ".container_name", "default-container"); - validateContainerName(container_name); + if (!container_name.empty()) + validateContainerName(container_name); std::optional container_already_exists {}; if (config.has(config_prefix + ".container_already_exists")) container_already_exists = {config.getBool(config_prefix + ".container_already_exists")}; - return {storage_url, container_name, container_already_exists}; + return {storage_url, account_name, container_name, prefix, container_already_exists}; } @@ -133,15 +193,13 @@ std::unique_ptr getAzureBlobContainerClient( { auto endpoint = processAzureBlobStorageEndpoint(config, config_prefix); auto container_name = endpoint.container_name; - auto final_url = container_name.empty() - ? endpoint.storage_account_url - : (std::filesystem::path(endpoint.storage_account_url) / container_name).string(); + auto final_url = endpoint.getEndpoint(); if (endpoint.container_already_exists.value_or(false)) return getAzureBlobStorageClientWithAuth(final_url, container_name, config, config_prefix); auto blob_service_client = getAzureBlobStorageClientWithAuth( - endpoint.storage_account_url, container_name, config, config_prefix); + endpoint.getEndpointWithoutContainer(), container_name, config, config_prefix); try { diff --git a/tests/integration/test_merge_tree_azure_blob_storage/test.py b/tests/integration/test_merge_tree_azure_blob_storage/test.py index f3e113c95d3..de8de347cdf 100644 --- a/tests/integration/test_merge_tree_azure_blob_storage/test.py +++ b/tests/integration/test_merge_tree_azure_blob_storage/test.py @@ -632,3 +632,29 @@ def test_endpoint(cluster): ) assert 10 == int(node.query("SELECT count() FROM test")) + +def test_endpoint_new_container(cluster): + node = cluster.instances[NODE_NAME] + account_name = "devstoreaccount1" + container_name = "cont3" + data_prefix = "data_prefix" + port = cluster.azurite_port + + node.query( + f""" + DROP TABLE IF EXISTS test SYNC; + + CREATE TABLE test (a Int32) + ENGINE = MergeTree() ORDER BY tuple() + SETTINGS disk = disk( + type = azure_blob_storage, + endpoint = 'http://azurite1:{port}/{account_name}/{container_name}/{data_prefix}', + account_name = 'devstoreaccount1', + account_key = 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', + skip_access_check = 0); + + INSERT INTO test SELECT number FROM numbers(10); + """ + ) + + assert 10 == int(node.query("SELECT count() FROM test")) From 3b11218e74a61dca93fa38f9f635a5cc8ddf3c1d Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Wed, 21 Feb 2024 19:21:14 +0000 Subject: [PATCH 02/14] Automatic style fix --- tests/integration/test_merge_tree_azure_blob_storage/test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_merge_tree_azure_blob_storage/test.py b/tests/integration/test_merge_tree_azure_blob_storage/test.py index de8de347cdf..2e72bc78524 100644 --- a/tests/integration/test_merge_tree_azure_blob_storage/test.py +++ b/tests/integration/test_merge_tree_azure_blob_storage/test.py @@ -633,6 +633,7 @@ def test_endpoint(cluster): assert 10 == int(node.query("SELECT count() FROM test")) + def test_endpoint_new_container(cluster): node = cluster.instances[NODE_NAME] account_name = "devstoreaccount1" From f1d5892d50c4d25a0aa3f14fcc4907655b27d030 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Thu, 22 Feb 2024 10:06:33 +0100 Subject: [PATCH 03/14] Fix style check --- .../ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp index ef24bdcf951..659adb8e505 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp @@ -90,8 +90,7 @@ AzureBlobStorageEndpoint processAzureBlobStorageEndpoint(const Poco::Util::Abstr { String endpoint = config.getString(config_prefix + ".endpoint"); - /// For authenitication methods other than managed identity (Eg: SAS, Workload Identity), - /// account name is not present in the endpoint + /// For some authentication methods account name is not present in the endpoint /// 'endpoint_contains_account_name' bool is used to understand how to split the endpoint (default : true) bool endpoint_contains_account_name = config.getBool(config_prefix + ".endpoint_contains_account_name", true); From 0eefab131dc120571eb5cd8b451d654b425fdc0f Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Thu, 22 Feb 2024 11:05:33 +0100 Subject: [PATCH 04/14] Updated pos check --- .../AzureBlobStorage/AzureBlobStorageAuth.cpp | 67 ++++++++++++++----- .../test.py | 25 +++++++ 2 files changed, 77 insertions(+), 15 deletions(-) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp index 659adb8e505..9d11bb23578 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp @@ -35,7 +35,11 @@ struct AzureBlobStorageEndpoint if (!account_name.empty()) url += "/" + account_name; - url += "/" + container_name + "/" + prefix; + if (!container_name.empty()) + url += "/" + container_name; + + if (!prefix.empty()) + url += "/" + prefix; return url; } @@ -92,31 +96,64 @@ AzureBlobStorageEndpoint processAzureBlobStorageEndpoint(const Poco::Util::Abstr /// For some authentication methods account name is not present in the endpoint /// 'endpoint_contains_account_name' bool is used to understand how to split the endpoint (default : true) - bool endpoint_contains_account_name = config.getBool(config_prefix + ".endpoint_contains_account_name", true); size_t pos = endpoint.find("//"); - assert (pos != std::string::npos); + if (pos == std::string::npos) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected '//' in endpoint"); if (endpoint_contains_account_name) { - size_t pos_begin = endpoint.find('/', pos+2); - size_t pos_end = endpoint.find('/',pos_begin+1); - account_name = endpoint.substr(pos_begin+1,(pos_end-pos_begin)-1); + size_t acc_pos_begin = endpoint.find('/', pos+2); + if (acc_pos_begin == std::string::npos) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected account_name in endpoint"); - size_t cont_pos_end = endpoint.find('/', pos_end+1); - container_name = endpoint.substr(pos_end+1,(cont_pos_end-pos_end)-1); - prefix = endpoint.substr(cont_pos_end+1); - storage_url = endpoint.substr(0,pos_begin); + storage_url = endpoint.substr(0,acc_pos_begin); + size_t acc_pos_end = endpoint.find('/',acc_pos_begin+1); + + if (acc_pos_end != std::string::npos) + { + account_name = endpoint.substr(acc_pos_begin+1,(acc_pos_end-acc_pos_begin)-1); + + size_t cont_pos_end = endpoint.find('/', acc_pos_end+1); + + if (cont_pos_end != std::string::npos) + { + container_name = endpoint.substr(acc_pos_end+1,(cont_pos_end-acc_pos_end)-1); + prefix = endpoint.substr(cont_pos_end+1); + } + else + { + container_name = endpoint.substr(acc_pos_end+1); + } + } + else + { + account_name = endpoint.substr(acc_pos_begin+1); + } } else { - size_t pos_begin = endpoint.find('/', pos+2); - size_t pos_end = endpoint.find('/',pos_begin+1); - container_name = endpoint.substr(pos_begin+1,(pos_end-pos_begin)-1); + size_t cont_pos_begin = endpoint.find('/', pos+2); + if (cont_pos_begin != std::string::npos) + { + storage_url = endpoint.substr(0,cont_pos_begin); + size_t cont_pos_end = endpoint.find('/',cont_pos_begin+1); - container_name = endpoint.substr(pos_end+1); - storage_url = endpoint.substr(0,pos_begin); + if (cont_pos_end != std::string::npos) + { + container_name = endpoint.substr(cont_pos_begin+1,(cont_pos_end-cont_pos_begin)-1); + prefix = endpoint.substr(cont_pos_end+1); + } + else + { + container_name = endpoint.substr(cont_pos_begin+1); + } + } + else + { + storage_url = endpoint; + } } } else if (config.has(config_prefix + ".connection_string")) diff --git a/tests/integration/test_merge_tree_azure_blob_storage/test.py b/tests/integration/test_merge_tree_azure_blob_storage/test.py index 2e72bc78524..eef5a230b40 100644 --- a/tests/integration/test_merge_tree_azure_blob_storage/test.py +++ b/tests/integration/test_merge_tree_azure_blob_storage/test.py @@ -659,3 +659,28 @@ def test_endpoint_new_container(cluster): ) assert 10 == int(node.query("SELECT count() FROM test")) + +def test_endpoint_without_prefix(cluster): + node = cluster.instances[NODE_NAME] + account_name = "devstoreaccount1" + container_name = "cont4" + port = cluster.azurite_port + + node.query( + f""" + DROP TABLE IF EXISTS test SYNC; + + CREATE TABLE test (a Int32) + ENGINE = MergeTree() ORDER BY tuple() + SETTINGS disk = disk( + type = azure_blob_storage, + endpoint = 'http://azurite1:{port}/{account_name}/{container_name}', + account_name = 'devstoreaccount1', + account_key = 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', + skip_access_check = 0); + + INSERT INTO test SELECT number FROM numbers(10); + """ + ) + + assert 10 == int(node.query("SELECT count() FROM test")) From 595ac95a5b25221a6a5ec8d72ea06a6b01777da6 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Thu, 22 Feb 2024 10:15:46 +0000 Subject: [PATCH 05/14] Automatic style fix --- tests/integration/test_merge_tree_azure_blob_storage/test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_merge_tree_azure_blob_storage/test.py b/tests/integration/test_merge_tree_azure_blob_storage/test.py index eef5a230b40..0f66399ae0a 100644 --- a/tests/integration/test_merge_tree_azure_blob_storage/test.py +++ b/tests/integration/test_merge_tree_azure_blob_storage/test.py @@ -660,6 +660,7 @@ def test_endpoint_new_container(cluster): assert 10 == int(node.query("SELECT count() FROM test")) + def test_endpoint_without_prefix(cluster): node = cluster.instances[NODE_NAME] account_name = "devstoreaccount1" From 86694596b3f570520deeab591f768c33f700a67d Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Fri, 23 Feb 2024 11:12:42 +0100 Subject: [PATCH 06/14] Fix flaky test --- .../test_merge_tree_azure_blob_storage/test.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_merge_tree_azure_blob_storage/test.py b/tests/integration/test_merge_tree_azure_blob_storage/test.py index 0f66399ae0a..2b533c7104e 100644 --- a/tests/integration/test_merge_tree_azure_blob_storage/test.py +++ b/tests/integration/test_merge_tree_azure_blob_storage/test.py @@ -613,7 +613,8 @@ def test_endpoint(cluster): container_client = cluster.blob_service_client.get_container_client(container_name) container_client.create_container() - node.query( + azure_query( + node, f""" DROP TABLE IF EXISTS test SYNC; @@ -641,7 +642,8 @@ def test_endpoint_new_container(cluster): data_prefix = "data_prefix" port = cluster.azurite_port - node.query( + azure_query( + node, f""" DROP TABLE IF EXISTS test SYNC; @@ -667,7 +669,8 @@ def test_endpoint_without_prefix(cluster): container_name = "cont4" port = cluster.azurite_port - node.query( + azure_query( + node, f""" DROP TABLE IF EXISTS test SYNC; From 5f39784093787fb5e0aaf957a403b104beb07d72 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Fri, 23 Feb 2024 10:24:00 +0000 Subject: [PATCH 07/14] Automatic style fix --- .../integration/test_merge_tree_azure_blob_storage/test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_merge_tree_azure_blob_storage/test.py b/tests/integration/test_merge_tree_azure_blob_storage/test.py index 2b533c7104e..b0373ffb811 100644 --- a/tests/integration/test_merge_tree_azure_blob_storage/test.py +++ b/tests/integration/test_merge_tree_azure_blob_storage/test.py @@ -629,7 +629,7 @@ def test_endpoint(cluster): skip_access_check = 0); INSERT INTO test SELECT number FROM numbers(10); - """ + """, ) assert 10 == int(node.query("SELECT count() FROM test")) @@ -657,7 +657,7 @@ def test_endpoint_new_container(cluster): skip_access_check = 0); INSERT INTO test SELECT number FROM numbers(10); - """ + """, ) assert 10 == int(node.query("SELECT count() FROM test")) @@ -684,7 +684,7 @@ def test_endpoint_without_prefix(cluster): skip_access_check = 0); INSERT INTO test SELECT number FROM numbers(10); - """ + """, ) assert 10 == int(node.query("SELECT count() FROM test")) From 6ec23a08791c02c6b381cdd4cdd02981ddb7205a Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 26 Feb 2024 14:36:06 +0100 Subject: [PATCH 08/14] Updated docs for endpoint --- docs/en/engines/table-engines/integrations/azureBlobStorage.md | 2 ++ docs/en/engines/table-engines/mergetree-family/mergetree.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/docs/en/engines/table-engines/integrations/azureBlobStorage.md b/docs/en/engines/table-engines/integrations/azureBlobStorage.md index c6525121667..70d696f9684 100644 --- a/docs/en/engines/table-engines/integrations/azureBlobStorage.md +++ b/docs/en/engines/table-engines/integrations/azureBlobStorage.md @@ -19,6 +19,8 @@ CREATE TABLE azure_blob_storage_table (name String, value UInt32) ### Engine parameters +- `endpoint` — AzureBlobStorage endpoint URL with container & prefix. Optionally can contain account_name if the authentication method used needs it. (http://azurite1:{port}/[account_name]{container_name}/{data_prefix}) or these parameters can be provided separately using storage_account_url, account_name & container. For specifying prefix, endpoint should be used. +- `endpoint_contains_account_name` - This flag is used to specify if endpoint contains account_name as it is only needed for certain authentication methods. - `connection_string|storage_account_url` — connection_string includes account name & key ([Create connection string](https://learn.microsoft.com/en-us/azure/storage/common/storage-configure-connection-string?toc=%2Fazure%2Fstorage%2Fblobs%2Ftoc.json&bc=%2Fazure%2Fstorage%2Fblobs%2Fbreadcrumb%2Ftoc.json#configure-a-connection-string-for-an-azure-storage-account)) or you could also provide the storage account url here and account name & account key as separate parameters (see parameters account_name & account_key) - `container_name` - Container name - `blobpath` - file path. Supports following wildcards in readonly mode: `*`, `**`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, `'abc'`, `'def'` — strings. diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index f185c11bab3..cbceeb05c2a 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -1236,6 +1236,8 @@ Configuration markup: ``` Connection parameters: +* `endpoint` — AzureBlobStorage endpoint URL with container & prefix. Optionally can contain account_name if the authentication method used needs it. (http://azurite1:{port}/[account_name]{container_name}/{data_prefix}) or these parameters can be provided separately using storage_account_url, account_name & container. For specifying prefix, endpoint should be used. +* `endpoint_contains_account_name` - This flag is used to specify if endpoint contains account_name as it is only needed for certain authentication methods. * `storage_account_url` - **Required**, Azure Blob Storage account URL, like `http://account.blob.core.windows.net` or `http://azurite1:10000/devstoreaccount1`. * `container_name` - Target container name, defaults to `default-container`. * `container_already_exists` - If set to `false`, a new container `container_name` is created in the storage account, if set to `true`, disk connects to the container directly, and if left unset, disk connects to the account, checks if the container `container_name` exists, and creates it if it doesn't exist yet. From ae309e6ea9972f801e8b80613ff2b2c6f3c67231 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 27 Feb 2024 11:12:19 +0100 Subject: [PATCH 09/14] Updated to set proper prefix for azure blob storage disk --- .../AzureBlobStorage/AzureBlobStorageAuth.cpp | 35 ------------------ .../AzureBlobStorage/AzureBlobStorageAuth.h | 37 +++++++++++++++++++ .../ObjectStorages/ObjectStorageFactory.cpp | 4 +- 3 files changed, 39 insertions(+), 37 deletions(-) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp index 9d11bb23578..5657eab1872 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp @@ -20,41 +20,6 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } -struct AzureBlobStorageEndpoint -{ - const String storage_account_url; - const String account_name; - const String container_name; - const String prefix; - const std::optional container_already_exists; - - String getEndpoint() - { - String url = storage_account_url; - - if (!account_name.empty()) - url += "/" + account_name; - - if (!container_name.empty()) - url += "/" + container_name; - - if (!prefix.empty()) - url += "/" + prefix; - - return url; - } - - String getEndpointWithoutContainer() - { - String url = storage_account_url; - - if (!account_name.empty()) - url += "/" + account_name; - - return url; - } -}; - void validateStorageAccountUrl(const String & storage_account_url) { diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.h index 18e8bf159d5..20bf05d5ba6 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.h @@ -10,9 +10,46 @@ namespace DB { +struct AzureBlobStorageEndpoint +{ + const String storage_account_url; + const String account_name; + const String container_name; + const String prefix; + const std::optional container_already_exists; + + String getEndpoint() + { + String url = storage_account_url; + + if (!account_name.empty()) + url += "/" + account_name; + + if (!container_name.empty()) + url += "/" + container_name; + + if (!prefix.empty()) + url += "/" + prefix; + + return url; + } + + String getEndpointWithoutContainer() + { + String url = storage_account_url; + + if (!account_name.empty()) + url += "/" + account_name; + + return url; + } +}; + std::unique_ptr getAzureBlobContainerClient( const Poco::Util::AbstractConfiguration & config, const String & config_prefix); +AzureBlobStorageEndpoint processAzureBlobStorageEndpoint(const Poco::Util::AbstractConfiguration & config, const String & config_prefix); + std::unique_ptr getAzureBlobStorageSettings(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr /*context*/); } diff --git a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp index b3626135177..d13cf1156f8 100644 --- a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp @@ -213,12 +213,12 @@ void registerAzureObjectStorage(ObjectStorageFactory & factory) const ContextPtr & context, bool /* skip_access_check */) -> ObjectStoragePtr { - String container_name = config.getString(config_prefix + ".container_name", "default-container"); + AzureBlobStorageEndpoint endpoint = processAzureBlobStorageEndpoint(config, config_prefix); return std::make_unique( name, getAzureBlobContainerClient(config, config_prefix), getAzureBlobStorageSettings(config, config_prefix, context), - container_name); + endpoint.prefix ? endpoint.container_name + "/" + endpoint.prefix : endpoint.container); }); } From 414c8da128d2f3cf6611f9642401a809193c22a7 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 27 Feb 2024 18:09:21 +0100 Subject: [PATCH 10/14] Fix build --- src/Disks/ObjectStorages/ObjectStorageFactory.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp index d13cf1156f8..9a3ae432eca 100644 --- a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp @@ -218,7 +218,7 @@ void registerAzureObjectStorage(ObjectStorageFactory & factory) name, getAzureBlobContainerClient(config, config_prefix), getAzureBlobStorageSettings(config, config_prefix, context), - endpoint.prefix ? endpoint.container_name + "/" + endpoint.prefix : endpoint.container); + endpoint.prefix.empty() ? endpoint.container : endpoint.container_name + "/" + endpoint.prefix); }); } From ca05557659de3f7fcd14d47acf1d69a5da18d924 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 27 Feb 2024 19:16:36 +0100 Subject: [PATCH 11/14] Fix typo --- src/Disks/ObjectStorages/ObjectStorageFactory.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp index 9a3ae432eca..f4df579de73 100644 --- a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp @@ -218,7 +218,7 @@ void registerAzureObjectStorage(ObjectStorageFactory & factory) name, getAzureBlobContainerClient(config, config_prefix), getAzureBlobStorageSettings(config, config_prefix, context), - endpoint.prefix.empty() ? endpoint.container : endpoint.container_name + "/" + endpoint.prefix); + endpoint.prefix.empty() ? endpoint.container_name : endpoint.container_name + "/" + endpoint.prefix); }); } From a46d7c91910d5077024f113377f42e469b48849e Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Wed, 28 Feb 2024 11:25:35 +0100 Subject: [PATCH 12/14] Updated docs and addressed comments --- .../mergetree-family/mergetree.md | 6 +- .../AzureBlobStorage/AzureBlobStorageAuth.cpp | 55 ++++++++---------- .../AzureBlobStorage/AzureObjectStorage.cpp | 6 +- .../AzureBlobStorage/AzureObjectStorage.h | 4 +- .../test.py | 58 +++++++++++++++++++ 5 files changed, 90 insertions(+), 39 deletions(-) diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index cbceeb05c2a..5f66c2efecd 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -1236,9 +1236,9 @@ Configuration markup: ``` Connection parameters: -* `endpoint` — AzureBlobStorage endpoint URL with container & prefix. Optionally can contain account_name if the authentication method used needs it. (http://azurite1:{port}/[account_name]{container_name}/{data_prefix}) or these parameters can be provided separately using storage_account_url, account_name & container. For specifying prefix, endpoint should be used. -* `endpoint_contains_account_name` - This flag is used to specify if endpoint contains account_name as it is only needed for certain authentication methods. -* `storage_account_url` - **Required**, Azure Blob Storage account URL, like `http://account.blob.core.windows.net` or `http://azurite1:10000/devstoreaccount1`. +* `endpoint` — AzureBlobStorage endpoint URL with container & prefix. Optionally can contain account_name if the authentication method used needs it. (`http://account.blob.core.windows.net:{port}/[account_name]{container_name}/{data_prefix}`) or these parameters can be provided separately using storage_account_url, account_name & container. For specifying prefix, endpoint should be used. +* `endpoint_contains_account_name` - This flag is used to specify if endpoint contains account_name as it is only needed for certain authentication methods. (Default : false) +* `storage_account_url` - Required if endpoint is not specified, Azure Blob Storage account URL, like `http://account.blob.core.windows.net` or `http://azurite1:10000/devstoreaccount1`. * `container_name` - Target container name, defaults to `default-container`. * `container_already_exists` - If set to `false`, a new container `container_name` is created in the storage account, if set to `true`, disk connects to the container directly, and if left unset, disk connects to the account, checks if the container `container_name` exists, and creates it if it doesn't exist yet. diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp index 5657eab1872..7dfee3bc760 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp @@ -60,8 +60,8 @@ AzureBlobStorageEndpoint processAzureBlobStorageEndpoint(const Poco::Util::Abstr String endpoint = config.getString(config_prefix + ".endpoint"); /// For some authentication methods account name is not present in the endpoint - /// 'endpoint_contains_account_name' bool is used to understand how to split the endpoint (default : true) - bool endpoint_contains_account_name = config.getBool(config_prefix + ".endpoint_contains_account_name", true); + /// 'endpoint_contains_account_name' bool is used to understand how to split the endpoint (default : false) + bool endpoint_contains_account_name = config.getBool(config_prefix + ".endpoint_contains_account_name", false); size_t pos = endpoint.find("//"); if (pos == std::string::npos) @@ -76,48 +76,41 @@ AzureBlobStorageEndpoint processAzureBlobStorageEndpoint(const Poco::Util::Abstr storage_url = endpoint.substr(0,acc_pos_begin); size_t acc_pos_end = endpoint.find('/',acc_pos_begin+1); - if (acc_pos_end != std::string::npos) + if (acc_pos_end == std::string::npos) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected container_name in endpoint"); + + account_name = endpoint.substr(acc_pos_begin+1,(acc_pos_end-acc_pos_begin)-1); + + size_t cont_pos_end = endpoint.find('/', acc_pos_end+1); + + if (cont_pos_end != std::string::npos) { - account_name = endpoint.substr(acc_pos_begin+1,(acc_pos_end-acc_pos_begin)-1); - - size_t cont_pos_end = endpoint.find('/', acc_pos_end+1); - - if (cont_pos_end != std::string::npos) - { - container_name = endpoint.substr(acc_pos_end+1,(cont_pos_end-acc_pos_end)-1); - prefix = endpoint.substr(cont_pos_end+1); - } - else - { - container_name = endpoint.substr(acc_pos_end+1); - } + container_name = endpoint.substr(acc_pos_end+1,(cont_pos_end-acc_pos_end)-1); + prefix = endpoint.substr(cont_pos_end+1); } else { - account_name = endpoint.substr(acc_pos_begin+1); + container_name = endpoint.substr(acc_pos_end+1); } } else { size_t cont_pos_begin = endpoint.find('/', pos+2); - if (cont_pos_begin != std::string::npos) - { - storage_url = endpoint.substr(0,cont_pos_begin); - size_t cont_pos_end = endpoint.find('/',cont_pos_begin+1); - if (cont_pos_end != std::string::npos) - { - container_name = endpoint.substr(cont_pos_begin+1,(cont_pos_end-cont_pos_begin)-1); - prefix = endpoint.substr(cont_pos_end+1); - } - else - { - container_name = endpoint.substr(cont_pos_begin+1); - } + if (cont_pos_begin == std::string::npos) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected container_name in endpoint"); + + storage_url = endpoint.substr(0,cont_pos_begin); + size_t cont_pos_end = endpoint.find('/',cont_pos_begin+1); + + if (cont_pos_end != std::string::npos) + { + container_name = endpoint.substr(cont_pos_begin+1,(cont_pos_end-cont_pos_begin)-1); + prefix = endpoint.substr(cont_pos_end+1); } else { - storage_url = endpoint; + container_name = endpoint.substr(cont_pos_begin+1); } } } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index 74389aedb64..fba61c7c392 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -93,11 +93,11 @@ AzureObjectStorage::AzureObjectStorage( const String & name_, AzureClientPtr && client_, SettingsPtr && settings_, - const String & container_) + const String & object_namespace_) : name(name_) , client(std::move(client_)) , settings(std::move(settings_)) - , container(container_) + , object_namespace(object_namespace_) , log(getLogger("AzureObjectStorage")) { } @@ -379,7 +379,7 @@ std::unique_ptr AzureObjectStorage::cloneObjectStorage(const std name, getAzureBlobContainerClient(config, config_prefix), getAzureBlobStorageSettings(config, config_prefix, context), - container + object_namespace ); } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index f16c35fb52c..8109b8eaf54 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -130,7 +130,7 @@ public: const std::string & config_prefix, ContextPtr context) override; - String getObjectsNamespace() const override { return container ; } + String getObjectsNamespace() const override { return object_namespace ; } std::unique_ptr cloneObjectStorage( const std::string & new_namespace, @@ -154,7 +154,7 @@ private: /// client used to access the files in the Blob Storage cloud MultiVersion client; MultiVersion settings; - const String container; + const String object_namespace; /// container + prefix LoggerPtr log; }; diff --git a/tests/integration/test_merge_tree_azure_blob_storage/test.py b/tests/integration/test_merge_tree_azure_blob_storage/test.py index b0373ffb811..39bdf51eae3 100644 --- a/tests/integration/test_merge_tree_azure_blob_storage/test.py +++ b/tests/integration/test_merge_tree_azure_blob_storage/test.py @@ -623,6 +623,7 @@ def test_endpoint(cluster): SETTINGS disk = disk( type = azure_blob_storage, endpoint = 'http://azurite1:{port}/{account_name}/{container_name}/{data_prefix}', + endpoint_contains_account_name = 'true', account_name = 'devstoreaccount1', account_key = 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', container_already_exists = 1, @@ -652,6 +653,7 @@ def test_endpoint_new_container(cluster): SETTINGS disk = disk( type = azure_blob_storage, endpoint = 'http://azurite1:{port}/{account_name}/{container_name}/{data_prefix}', + endpoint_contains_account_name = 'true', account_name = 'devstoreaccount1', account_key = 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', skip_access_check = 0); @@ -679,6 +681,7 @@ def test_endpoint_without_prefix(cluster): SETTINGS disk = disk( type = azure_blob_storage, endpoint = 'http://azurite1:{port}/{account_name}/{container_name}', + endpoint_contains_account_name = 'true', account_name = 'devstoreaccount1', account_key = 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', skip_access_check = 0); @@ -688,3 +691,58 @@ def test_endpoint_without_prefix(cluster): ) assert 10 == int(node.query("SELECT count() FROM test")) + +def test_endpoint_error_check(cluster): + node = cluster.instances[NODE_NAME] + account_name = "devstoreaccount1" + port = cluster.azurite_port + + query = f""" + DROP TABLE IF EXISTS test SYNC; + + CREATE TABLE test (a Int32) + ENGINE = MergeTree() ORDER BY tuple() + SETTINGS disk = disk( + type = azure_blob_storage, + endpoint = 'http://azurite1:{port}/{account_name}', + endpoint_contains_account_name = 'true', + account_name = 'devstoreaccount1', + account_key = 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', + skip_access_check = 0); + """ + + expected_err_msg = "Expected container_name in endpoint" + assert expected_err_msg in azure_query(node, query, expect_error="true") + + query = f""" + DROP TABLE IF EXISTS test SYNC; + + CREATE TABLE test (a Int32) + ENGINE = MergeTree() ORDER BY tuple() + SETTINGS disk = disk( + type = azure_blob_storage, + endpoint = 'http://azurite1:{port}', + endpoint_contains_account_name = 'true', + account_name = 'devstoreaccount1', + account_key = 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', + skip_access_check = 0); + """ + + expected_err_msg = "Expected account_name in endpoint" + assert expected_err_msg in azure_query(node, query, expect_error="true") + + query = f""" + DROP TABLE IF EXISTS test SYNC; + + CREATE TABLE test (a Int32) + ENGINE = MergeTree() ORDER BY tuple() + SETTINGS disk = disk( + type = azure_blob_storage, + endpoint = 'http://azurite1:{port}', + account_name = 'devstoreaccount1', + account_key = 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', + skip_access_check = 0); + """ + + expected_err_msg = "Expected container_name in endpoint" + assert expected_err_msg in azure_query(node, query, expect_error="true") From 8145cd15b11e2873b9ede75d0e29df822b70f871 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Wed, 28 Feb 2024 10:34:40 +0000 Subject: [PATCH 13/14] Automatic style fix --- tests/integration/test_merge_tree_azure_blob_storage/test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_merge_tree_azure_blob_storage/test.py b/tests/integration/test_merge_tree_azure_blob_storage/test.py index 39bdf51eae3..27aa6c12610 100644 --- a/tests/integration/test_merge_tree_azure_blob_storage/test.py +++ b/tests/integration/test_merge_tree_azure_blob_storage/test.py @@ -692,6 +692,7 @@ def test_endpoint_without_prefix(cluster): assert 10 == int(node.query("SELECT count() FROM test")) + def test_endpoint_error_check(cluster): node = cluster.instances[NODE_NAME] account_name = "devstoreaccount1" From 5d68c9f0465d3fb801e3f836ecd2d63f097fecf4 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Thu, 29 Feb 2024 09:38:13 +0100 Subject: [PATCH 14/14] Updated default value of endpoint_contains_account_name to true --- .../en/engines/table-engines/integrations/azureBlobStorage.md | 2 +- docs/en/engines/table-engines/mergetree-family/mergetree.md | 2 +- .../ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp | 4 ++-- .../ObjectStorages/AzureBlobStorage/AzureObjectStorage.h | 2 +- tests/integration/test_merge_tree_azure_blob_storage/test.py | 1 + 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/en/engines/table-engines/integrations/azureBlobStorage.md b/docs/en/engines/table-engines/integrations/azureBlobStorage.md index 70d696f9684..0843ff1ac47 100644 --- a/docs/en/engines/table-engines/integrations/azureBlobStorage.md +++ b/docs/en/engines/table-engines/integrations/azureBlobStorage.md @@ -20,7 +20,7 @@ CREATE TABLE azure_blob_storage_table (name String, value UInt32) ### Engine parameters - `endpoint` — AzureBlobStorage endpoint URL with container & prefix. Optionally can contain account_name if the authentication method used needs it. (http://azurite1:{port}/[account_name]{container_name}/{data_prefix}) or these parameters can be provided separately using storage_account_url, account_name & container. For specifying prefix, endpoint should be used. -- `endpoint_contains_account_name` - This flag is used to specify if endpoint contains account_name as it is only needed for certain authentication methods. +- `endpoint_contains_account_name` - This flag is used to specify if endpoint contains account_name as it is only needed for certain authentication methods. (Default : true) - `connection_string|storage_account_url` — connection_string includes account name & key ([Create connection string](https://learn.microsoft.com/en-us/azure/storage/common/storage-configure-connection-string?toc=%2Fazure%2Fstorage%2Fblobs%2Ftoc.json&bc=%2Fazure%2Fstorage%2Fblobs%2Fbreadcrumb%2Ftoc.json#configure-a-connection-string-for-an-azure-storage-account)) or you could also provide the storage account url here and account name & account key as separate parameters (see parameters account_name & account_key) - `container_name` - Container name - `blobpath` - file path. Supports following wildcards in readonly mode: `*`, `**`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, `'abc'`, `'def'` — strings. diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 5f66c2efecd..a2c54cb412b 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -1237,7 +1237,7 @@ Configuration markup: Connection parameters: * `endpoint` — AzureBlobStorage endpoint URL with container & prefix. Optionally can contain account_name if the authentication method used needs it. (`http://account.blob.core.windows.net:{port}/[account_name]{container_name}/{data_prefix}`) or these parameters can be provided separately using storage_account_url, account_name & container. For specifying prefix, endpoint should be used. -* `endpoint_contains_account_name` - This flag is used to specify if endpoint contains account_name as it is only needed for certain authentication methods. (Default : false) +* `endpoint_contains_account_name` - This flag is used to specify if endpoint contains account_name as it is only needed for certain authentication methods. (Default : true) * `storage_account_url` - Required if endpoint is not specified, Azure Blob Storage account URL, like `http://account.blob.core.windows.net` or `http://azurite1:10000/devstoreaccount1`. * `container_name` - Target container name, defaults to `default-container`. * `container_already_exists` - If set to `false`, a new container `container_name` is created in the storage account, if set to `true`, disk connects to the container directly, and if left unset, disk connects to the account, checks if the container `container_name` exists, and creates it if it doesn't exist yet. diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp index 7dfee3bc760..d5815795744 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp @@ -60,8 +60,8 @@ AzureBlobStorageEndpoint processAzureBlobStorageEndpoint(const Poco::Util::Abstr String endpoint = config.getString(config_prefix + ".endpoint"); /// For some authentication methods account name is not present in the endpoint - /// 'endpoint_contains_account_name' bool is used to understand how to split the endpoint (default : false) - bool endpoint_contains_account_name = config.getBool(config_prefix + ".endpoint_contains_account_name", false); + /// 'endpoint_contains_account_name' bool is used to understand how to split the endpoint (default : true) + bool endpoint_contains_account_name = config.getBool(config_prefix + ".endpoint_contains_account_name", true); size_t pos = endpoint.find("//"); if (pos == std::string::npos) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 8109b8eaf54..55d2f228bf0 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -67,7 +67,7 @@ public: const String & name_, AzureClientPtr && client_, SettingsPtr && settings_, - const String & container_); + const String & object_namespace_); void listObjects(const std::string & path, RelativePathsWithMetadata & children, int max_keys) const override; diff --git a/tests/integration/test_merge_tree_azure_blob_storage/test.py b/tests/integration/test_merge_tree_azure_blob_storage/test.py index 27aa6c12610..55deb87a97e 100644 --- a/tests/integration/test_merge_tree_azure_blob_storage/test.py +++ b/tests/integration/test_merge_tree_azure_blob_storage/test.py @@ -740,6 +740,7 @@ def test_endpoint_error_check(cluster): SETTINGS disk = disk( type = azure_blob_storage, endpoint = 'http://azurite1:{port}', + endpoint_contains_account_name = 'false', account_name = 'devstoreaccount1', account_key = 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', skip_access_check = 0);