Skip to content

GH-44308: [C++][FS][Azure] Implement SAS token authentication #45021

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
1abba8e
Working with ConfigureSasCredential
Tom-Newton Dec 11, 2024
ea091b4
Real access test working against azurite
Tom-Newton Dec 12, 2024
6df79f7
Fix creating getting Azure env in test
Tom-Newton Dec 12, 2024
567488b
Use credential_kind_ to detect credential type inside GenerateSASToke…
Tom-Newton Dec 12, 2024
6869067
Use original sas token inside `CopyFile` instead of generating a new …
Tom-Newton Dec 12, 2024
0ffd832
Fix appending SAS token twice - all tests pass
Tom-Newton Dec 12, 2024
a2b826c
Support SAS token in AzureOptions::FromUri
Tom-Newton Dec 12, 2024
9c6ec0e
Update SasCredential test to cover extra query parmaters
Tom-Newton Dec 12, 2024
f4177be
Adjust comment and increase SAS expiry time
Tom-Newton Dec 13, 2024
d19e87f
Update comments
Tom-Newton Dec 13, 2024
15fe2c6
Autoformat
Tom-Newton Dec 13, 2024
b57ebea
PR comments
Tom-Newton Dec 14, 2024
e200b8e
Avoid generating SAS tokens on the fly.
Tom-Newton Dec 14, 2024
b0fd1d3
Add an extra test copying to a different container
Tom-Newton Dec 14, 2024
744e111
Update comment
Tom-Newton Dec 14, 2024
c8fd94d
Remove AzureOptions::GenerateSASToken
Tom-Newton Dec 14, 2024
83a0b40
Remove debug change
Tom-Newton Dec 14, 2024
d60742b
Rename Sas -> SAS
Tom-Newton Dec 15, 2024
63f9773
Add a comment about the polling interval
Tom-Newton Dec 15, 2024
d55510a
Only allow query parameters we know can be part of a sas token
Tom-Newton Dec 15, 2024
7576c4c
More renaming Sas -> SAS
Tom-Newton Dec 15, 2024
eece05e
Give up on `contexpr`. Make it a `static const std::set<std::string>`
Tom-Newton Dec 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 54 additions & 35 deletions cpp/src/arrow/filesystem/azurefs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,18 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) {
std::string tenant_id;
std::string client_id;
std::string client_secret;

// These query parameters are the union of the following docs:
// https://learn.microsoft.com/en-us/rest/api/storageservices/create-account-sas#specify-the-account-sas-parameters
// https://learn.microsoft.com/en-us/rest/api/storageservices/create-service-sas#construct-a-service-sas
// (excluding parameters for table storage only)
// https://learn.microsoft.com/en-us/rest/api/storageservices/create-user-delegation-sas#construct-a-user-delegation-sas
static const std::set<std::string> sas_token_query_parameters = {
"sv", "ss", "sr", "st", "se", "sp", "si", "sip", "spr",
"skoid", "sktid", "srt", "skt", "ske", "skv", "sks", "saoid", "suoid",
"scid", "sdd", "ses", "sig", "rscc", "rscd", "rsce", "rscl", "rsct",
};

ARROW_ASSIGN_OR_RAISE(const auto options_items, uri.query_items());
for (const auto& kv : options_items) {
if (kv.first == "blob_storage_authority") {
Expand Down Expand Up @@ -147,6 +159,9 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) {
} else if (kv.first == "background_writes") {
ARROW_ASSIGN_OR_RAISE(background_writes,
::arrow::internal::ParseBoolean(kv.second));
} else if (sas_token_query_parameters.find(kv.first) !=
sas_token_query_parameters.end()) {
credential_kind = CredentialKind::kSASToken;
} else {
return Status::Invalid(
"Unexpected query parameter in Azure Blob File System URI: '", kv.first, "'");
Expand Down Expand Up @@ -180,6 +195,13 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) {
case CredentialKind::kEnvironment:
RETURN_NOT_OK(ConfigureEnvironmentCredential());
break;
case CredentialKind::kSASToken:
// Reconstructing the SAS token without the other URI query parameters is awkward
// because some parts are URI escaped and some parts are not. Instead we just
// pass through the entire query string and Azure ignores the extra query
// parameters.
RETURN_NOT_OK(ConfigureSASCredential("?" + uri.query_string()));
break;
default:
// Default credential
break;
Expand Down Expand Up @@ -225,7 +247,6 @@ Result<AzureOptions> AzureOptions::FromUri(const std::string& uri_string,
}

bool AzureOptions::Equals(const AzureOptions& other) const {
// TODO(GH-38598): update here when more auth methods are added.
const bool equals = blob_storage_authority == other.blob_storage_authority &&
dfs_storage_authority == other.dfs_storage_authority &&
blob_storage_scheme == other.blob_storage_scheme &&
Expand All @@ -243,6 +264,8 @@ bool AzureOptions::Equals(const AzureOptions& other) const {
case CredentialKind::kStorageSharedKey:
return storage_shared_key_credential_->AccountName ==
other.storage_shared_key_credential_->AccountName;
case CredentialKind::kSASToken:
return sas_token_ == other.sas_token_;
case CredentialKind::kClientSecret:
case CredentialKind::kCLI:
case CredentialKind::kManagedIdentity:
Expand Down Expand Up @@ -311,6 +334,15 @@ Status AzureOptions::ConfigureAccountKeyCredential(const std::string& account_ke
return Status::OK();
}

Status AzureOptions::ConfigureSASCredential(const std::string& sas_token) {
credential_kind_ = CredentialKind::kSASToken;
if (account_name.empty()) {
return Status::Invalid("AzureOptions doesn't contain a valid account name");
}
sas_token_ = sas_token;
return Status::OK();
}

Status AzureOptions::ConfigureClientSecretCredential(const std::string& tenant_id,
const std::string& client_id,
const std::string& client_secret) {
Expand Down Expand Up @@ -372,6 +404,9 @@ Result<std::unique_ptr<Blobs::BlobServiceClient>> AzureOptions::MakeBlobServiceC
case CredentialKind::kStorageSharedKey:
return std::make_unique<Blobs::BlobServiceClient>(AccountBlobUrl(account_name),
storage_shared_key_credential_);
case CredentialKind::kSASToken:
return std::make_unique<Blobs::BlobServiceClient>(AccountBlobUrl(account_name) +
sas_token_);
}
return Status::Invalid("AzureOptions doesn't contain a valid auth configuration");
}
Expand Down Expand Up @@ -404,29 +439,13 @@ AzureOptions::MakeDataLakeServiceClient() const {
case CredentialKind::kStorageSharedKey:
return std::make_unique<DataLake::DataLakeServiceClient>(
AccountDfsUrl(account_name), storage_shared_key_credential_);
case CredentialKind::kSASToken:
return std::make_unique<DataLake::DataLakeServiceClient>(
AccountBlobUrl(account_name) + sas_token_);
}
return Status::Invalid("AzureOptions doesn't contain a valid auth configuration");
}

Result<std::string> AzureOptions::GenerateSASToken(
Storage::Sas::BlobSasBuilder* builder, Blobs::BlobServiceClient* client) const {
using SasProtocol = Storage::Sas::SasProtocol;
builder->Protocol =
blob_storage_scheme == "http" ? SasProtocol::HttpsAndHttp : SasProtocol::HttpsOnly;
if (storage_shared_key_credential_) {
return builder->GenerateSasToken(*storage_shared_key_credential_);
} else {
// GH-39344: This part isn't tested. This may not work.
try {
auto delegation_key_response = client->GetUserDelegationKey(builder->ExpiresOn);
return builder->GenerateSasToken(delegation_key_response.Value, account_name);
} catch (const Storage::StorageException& exception) {
return ExceptionToStatus(exception, "GetUserDelegationKey failed for '",
client->GetUrl(), "'.");
}
}
}

namespace {

// An AzureFileSystem represents an Azure storage account. An AzureLocation describes a
Expand Down Expand Up @@ -3161,19 +3180,7 @@ class AzureFileSystem::Impl {
if (src == dest) {
return Status::OK();
}
std::string sas_token;
{
Storage::Sas::BlobSasBuilder builder;
std::chrono::seconds available_period(60);
builder.ExpiresOn = std::chrono::system_clock::now() + available_period;
builder.BlobContainerName = src.container;
builder.BlobName = src.path;
builder.Resource = Storage::Sas::BlobSasResource::Blob;
builder.SetPermissions(Storage::Sas::BlobSasPermissions::Read);
ARROW_ASSIGN_OR_RAISE(
sas_token, options_.GenerateSASToken(&builder, blob_service_client_.get()));
}
auto src_url = GetBlobClient(src.container, src.path).GetUrl() + sas_token;
auto src_url = GetBlobClient(src.container, src.path).GetUrl();
auto dest_blob_client = GetBlobClient(dest.container, dest.path);
if (!dest.path.empty()) {
auto dest_parent = dest.parent();
Expand All @@ -3186,9 +3193,21 @@ class AzureFileSystem::Impl {
}
}
try {
dest_blob_client.CopyFromUri(src_url);
// We use StartCopyFromUri instead of CopyFromUri because it supports blobs larger
// than 256 MiB and it doesn't require generating a SAS token to authenticate
// reading a source blob in the same storage account.
auto copy_operation = dest_blob_client.StartCopyFromUri(src_url);
// For large blobs, the copy operation may be slow so we need to poll until it
// completes. We use a polling interval of 1 second.
copy_operation.PollUntilDone(std::chrono::milliseconds(1000));
} catch (const Storage::StorageException& exception) {
return ExceptionToStatus(exception, "Failed to copy a blob. (", src_url, " -> ",
// StartCopyFromUri failed or a GetProperties call inside PollUntilDone failed.
return ExceptionToStatus(
exception, "Failed to start blob copy or poll status of ongoing copy. (",
src_url, " -> ", dest_blob_client.GetUrl(), ")");
} catch (const Azure::Core::RequestFailedException& exception) {
// A GetProperties call inside PollUntilDone returned a failed CopyStatus.
return ExceptionToStatus(exception, "Failed to copy blob. (", src_url, " -> ",
dest_blob_client.GetUrl(), ")");
}
return Status::OK();
Expand Down
14 changes: 6 additions & 8 deletions cpp/src/arrow/filesystem/azurefs.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,6 @@ namespace Azure::Storage::Blobs {
class BlobServiceClient;
}

namespace Azure::Storage::Sas {
struct BlobSasBuilder;
}

namespace Azure::Storage::Files::DataLake {
class DataLakeFileSystemClient;
class DataLakeServiceClient;
Expand Down Expand Up @@ -120,6 +116,7 @@ struct ARROW_EXPORT AzureOptions {
kDefault,
kAnonymous,
kStorageSharedKey,
kSASToken,
kClientSecret,
kManagedIdentity,
kCLI,
Expand All @@ -129,6 +126,7 @@ struct ARROW_EXPORT AzureOptions {

std::shared_ptr<Azure::Storage::StorageSharedKeyCredential>
storage_shared_key_credential_;
std::string sas_token_;
mutable std::shared_ptr<Azure::Core::Credentials::TokenCredential> token_credential_;

public:
Expand Down Expand Up @@ -180,6 +178,9 @@ struct ARROW_EXPORT AzureOptions {
/// AzureOptions::ConfigureClientSecretCredential() is called.
/// * client_secret: You must specify "tenant_id" and "client_id"
/// too. AzureOptions::ConfigureClientSecretCredential() is called.
/// * A SAS token is made up of several query parameters. Appending a SAS
/// token to the URI configures SAS token auth by calling
/// AzureOptions::ConfigureSASCredential().
///
/// [1]:
/// https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction-abfs-uri
Expand All @@ -189,6 +190,7 @@ struct ARROW_EXPORT AzureOptions {
Status ConfigureDefaultCredential();
Status ConfigureAnonymousCredential();
Status ConfigureAccountKeyCredential(const std::string& account_key);
Status ConfigureSASCredential(const std::string& sas_token);
Status ConfigureClientSecretCredential(const std::string& tenant_id,
const std::string& client_id,
const std::string& client_secret);
Expand All @@ -207,10 +209,6 @@ struct ARROW_EXPORT AzureOptions {

Result<std::unique_ptr<Azure::Storage::Files::DataLake::DataLakeServiceClient>>
MakeDataLakeServiceClient() const;

Result<std::string> GenerateSASToken(
Azure::Storage::Sas::BlobSasBuilder* builder,
Azure::Storage::Blobs::BlobServiceClient* client) const;
};

/// \brief FileSystem implementation backed by Azure Blob Storage (ABS) [1] and
Expand Down
88 changes: 88 additions & 0 deletions cpp/src/arrow/filesystem/azurefs_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -690,6 +690,36 @@ class TestAzureOptions : public ::testing::Test {
ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kEnvironment);
}

void TestFromUriCredentialSASToken() {
const std::string sas_token =
"?se=2024-12-12T18:57:47Z&sig=pAs7qEBdI6sjUhqX1nrhNAKsTY%2B1SqLxPK%"
"2BbAxLiopw%3D&sp=racwdxylti&spr=https,http&sr=c&sv=2024-08-04";
ASSERT_OK_AND_ASSIGN(
auto options,
AzureOptions::FromUri(
"abfs://[email protected]/" + sas_token, nullptr));
ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kSASToken);
ASSERT_EQ(options.sas_token_, sas_token);
}

void TestFromUriCredentialSASTokenWithOtherParameters() {
const std::string uri_query_string =
"?enable_tls=false&se=2024-12-12T18:57:47Z&sig=pAs7qEBdI6sjUhqX1nrhNAKsTY%"
"2B1SqLxPK%"
"2BbAxLiopw%3D&sp=racwdxylti&spr=https,http&sr=c&sv=2024-08-04";
ASSERT_OK_AND_ASSIGN(
auto options,
AzureOptions::FromUri(
"abfs://[email protected]:10000/container/dir/blob" + uri_query_string,
nullptr));
ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kSASToken);
ASSERT_EQ(options.sas_token_, uri_query_string);
ASSERT_EQ(options.blob_storage_authority, "127.0.0.1:10000");
ASSERT_EQ(options.dfs_storage_authority, "127.0.0.1:10000");
ASSERT_EQ(options.blob_storage_scheme, "http");
ASSERT_EQ(options.dfs_storage_scheme, "http");
}

void TestFromUriCredentialInvalid() {
ASSERT_RAISES(Invalid, AzureOptions::FromUri(
"abfs://[email protected]/dir/file?"
Expand Down Expand Up @@ -777,6 +807,10 @@ TEST_F(TestAzureOptions, FromUriCredentialWorkloadIdentity) {
TEST_F(TestAzureOptions, FromUriCredentialEnvironment) {
TestFromUriCredentialEnvironment();
}
TEST_F(TestAzureOptions, FromUriCredentialSASToken) { TestFromUriCredentialSASToken(); }
TEST_F(TestAzureOptions, FromUriCredentialSASTokenWithOtherParameters) {
TestFromUriCredentialSASTokenWithOtherParameters();
}
TEST_F(TestAzureOptions, FromUriCredentialInvalid) { TestFromUriCredentialInvalid(); }
TEST_F(TestAzureOptions, FromUriBlobStorageAuthority) {
TestFromUriBlobStorageAuthority();
Expand Down Expand Up @@ -912,6 +946,20 @@ class TestAzureFileSystem : public ::testing::Test {
.Value;
}

Result<std::string> GetContainerSASToken(
const std::string& container_name,
Azure::Storage::StorageSharedKeyCredential storage_shared_key_credential) {
std::string sas_token;
Azure::Storage::Sas::BlobSasBuilder builder;
std::chrono::seconds available_period(60);
builder.ExpiresOn = std::chrono::system_clock::now() + available_period;
builder.BlobContainerName = container_name;
builder.Resource = Azure::Storage::Sas::BlobSasResource::BlobContainer;
builder.SetPermissions(Azure::Storage::Sas::BlobContainerSasPermissions::All);
builder.Protocol = Azure::Storage::Sas::SasProtocol::HttpsAndHttp;
return builder.GenerateSasToken(storage_shared_key_credential);
}

void UploadLines(const std::vector<std::string>& lines, const std::string& path,
int total_size) {
ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {}));
Expand Down Expand Up @@ -1617,6 +1665,31 @@ class TestAzureFileSystem : public ::testing::Test {
AssertObjectContents(fs.get(), path, payload);
}

void TestSASCredential() {
auto data = SetUpPreexistingData();

ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv());
ASSERT_OK_AND_ASSIGN(auto options, MakeOptions(env));
ASSERT_OK_AND_ASSIGN(
auto sas_token,
GetContainerSASToken(data.container_name,
Azure::Storage::StorageSharedKeyCredential(
env->account_name(), env->account_key())));
// AzureOptions::FromUri will not cut off extra query parameters that it consumes, so
// make sure these don't cause problems.
ARROW_EXPECT_OK(options.ConfigureSASCredential(
"?blob_storage_authority=dummy_value0&" + sas_token.substr(1) +
"&credential_kind=dummy-value1"));
EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options));

AssertFileInfo(fs.get(), data.ObjectPath(), FileType::File);

// Test CopyFile because the most obvious implementation requires generating a SAS
// token at runtime which doesn't work when the original auth is SAS token.
ASSERT_OK(fs->CopyFile(data.ObjectPath(), data.ObjectPath() + "_copy"));
AssertFileInfo(fs.get(), data.ObjectPath() + "_copy", FileType::File);
}

private:
using StringMatcher =
::testing::PolymorphicMatcher<::testing::internal::HasSubstrMatcher<std::string>>;
Expand Down Expand Up @@ -2328,6 +2401,10 @@ TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateContainerFromPath) {

TYPED_TEST(TestAzureFileSystemOnAllScenarios, MovePath) { this->TestMovePath(); }

TYPED_TEST(TestAzureFileSystemOnAllScenarios, SASCredential) {
this->TestSASCredential();
}

// Tests using Azurite (the local Azure emulator)

TEST_F(TestAzuriteFileSystem, CheckIfHierarchicalNamespaceIsEnabledRuntimeError) {
Expand Down Expand Up @@ -2634,6 +2711,17 @@ TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationNonexistent) {
EXPECT_EQ(PreexistingData::kLoremIpsum, buffer->ToString());
}

TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationDifferentContainer) {
auto data = SetUpPreexistingData();
auto data2 = SetUpPreexistingData();
const auto destination_path = data2.ContainerPath("copy-destionation");
ASSERT_OK(fs()->CopyFile(data.ObjectPath(), destination_path));
ASSERT_OK_AND_ASSIGN(auto info, fs()->GetFileInfo(destination_path));
ASSERT_OK_AND_ASSIGN(auto stream, fs()->OpenInputStream(info));
ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024));
EXPECT_EQ(PreexistingData::kLoremIpsum, buffer->ToString());
}

TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationSame) {
auto data = SetUpPreexistingData();
ASSERT_OK(fs()->CopyFile(data.ObjectPath(), data.ObjectPath()));
Expand Down
Loading