diff --git a/openeo/extra/artifacts/__init__.py b/openeo/extra/artifacts/__init__.py new file mode 100644 index 000000000..585a494a7 --- /dev/null +++ b/openeo/extra/artifacts/__init__.py @@ -0,0 +1 @@ +from openeo.extra.artifacts.artifact_helper import ArtifactHelper diff --git a/openeo/extra/artifacts/_s3sts/__init__.py b/openeo/extra/artifacts/_s3sts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/openeo/extra/artifacts/_s3sts/artifact_helper.py b/openeo/extra/artifacts/_s3sts/artifact_helper.py new file mode 100644 index 000000000..dc8a5ffcc --- /dev/null +++ b/openeo/extra/artifacts/_s3sts/artifact_helper.py @@ -0,0 +1,96 @@ +from __future__ import annotations + +import datetime +from typing import TYPE_CHECKING, Optional + +if TYPE_CHECKING: + from types_boto3_s3.client import S3Client + +from pathlib import Path + +from boto3.s3.transfer import TransferConfig + +from openeo.extra.artifacts._s3sts.config import S3STSConfig +from openeo.extra.artifacts._s3sts.model import S3URI, AWSSTSCredentials +from openeo.extra.artifacts._s3sts.sts import OpenEOSTSClient +from openeo.extra.artifacts.artifact_helper_abc import ArtifactHelperABC +from openeo.rest.connection import Connection + + +class S3STSArtifactHelper(ArtifactHelperABC): + # From what size will we switch to multi-part-upload + MULTIPART_THRESHOLD_IN_MB = 50 + + def __init__(self, conn: Connection, config: S3STSConfig): + super().__init__(config) + self.conn = conn + self.config = config + self._creds = self.get_new_creds() + self._s3: Optional[S3Client] = None + + @classmethod + def _from_openeo_connection(cls, conn: Connection, config: S3STSConfig) -> S3STSArtifactHelper: + return S3STSArtifactHelper(conn, config=config) + + def get_new_creds(self) -> AWSSTSCredentials: + sts = OpenEOSTSClient(config=self.config) + return sts.assume_from_openeo_connection(self.conn) + + def _user_prefix(self) -> str: + """Each user has its own prefix retrieve it""" + return 
self._creds.get_user_hash() + + def _get_upload_prefix(self) -> str: + # TODO: replace utcnow when `datetime.datetime.now(datetime.UTC)` in oldest supported Python version + return f"{self._user_prefix()}/{datetime.datetime.utcnow().strftime('%Y/%m/%d')}/" + + def _get_upload_key(self, object_name: str) -> str: + return f"{self._get_upload_prefix()}{object_name}" + + @staticmethod + def get_object_name_from_path(path: str | Path) -> str: + if isinstance(path, str): + path = Path(path) + return path.name + + def _get_s3_client(self): + # TODO: validate whether credentials are still reasonably long valid + # and if not refresh credentials and rebuild client + if self._s3 is None: + self._s3 = self.config.build_client("s3", session_kwargs=self._creds.as_kwargs()) + return self._s3 + + def upload_file(self, path: str | Path, object_name: str = "") -> S3URI: + """ + Upload a file to a backend understanding the S3 API + + :param path A file path to the file that must be uploaded + :param object_name: Optional the final part of the name to be uploaded. If omitted the filename is used. + + :return: `S3URI` A S3URI that points to the uploaded file in the S3 compatible backend + """ + mb = 1024**2 + config = TransferConfig(multipart_threshold=self.MULTIPART_THRESHOLD_IN_MB * mb) + bucket = self.config.bucket + key = self._get_upload_key(object_name or self.get_object_name_from_path(path)) + self._get_s3_client().upload_file(str(path), bucket, key, Config=config) + return S3URI(bucket, key) + + def get_presigned_url(self, storage_uri: S3URI, expires_in_seconds: int = 7 * 3600 * 24) -> str: + """ + Get a presigned URL to allow retrieval of an object. + + :param storage_uri `S3URI` A S3URI that points to the uploaded file in the S3 compatible backend + :param expires_in_seconds: Optional the number of seconds the link is valid for (defaults to 7 days) + + :return: `str` A HTTP url that can be used to download a file. It also supports Range header in its requests. 
+ """ + url = self._get_s3_client().generate_presigned_url( + "get_object", Params={"Bucket": storage_uri.bucket, "Key": storage_uri.key}, ExpiresIn=expires_in_seconds + ) + assert isinstance(self._config, S3STSConfig) + return self._config.add_trace_id_qp_if_needed(url) + + @classmethod + def _get_default_storage_config(cls) -> S3STSConfig: + return S3STSConfig() diff --git a/openeo/extra/artifacts/_s3sts/config.py b/openeo/extra/artifacts/_s3sts/config.py new file mode 100644 index 000000000..4b7f2d780 --- /dev/null +++ b/openeo/extra/artifacts/_s3sts/config.py @@ -0,0 +1,121 @@ +from dataclasses import dataclass, field +from typing import Optional + +import boto3 +import botocore +from botocore.config import Config + +from openeo.extra.artifacts._s3sts.tracer import ( + add_trace_id, + add_trace_id_as_query_parameter, +) +from openeo.extra.artifacts.backend import ProviderConfig +from openeo.extra.artifacts.config import StorageConfig +from openeo.utils.version import ComparableVersion + +if ComparableVersion(botocore.__version__).below("1.36.0"): + # Before 1.36 checksuming was not done by default anyway and therefore + # there was no opt-out. 
+ no_default_checksum_config = Config() +else: + no_default_checksum_config = Config( + request_checksum_calculation="when_required", + ) + + +DISABLE_TRACING_TRACE_ID = "00000000-0000-0000-0000-000000000000" + + +@dataclass(frozen=True) +class S3STSConfig(StorageConfig): + """The s3 endpoint url protocol:://fqdn[:portnumber]""" + + s3_endpoint: Optional[str] = None + """The sts endpoint url protocol:://fqdn[:portnumber]""" + sts_endpoint: Optional[str] = None + """The trace_id is if you want to send a uuid4 identifier to the backend""" + trace_id: str = DISABLE_TRACING_TRACE_ID + """You can change the botocore_config used but this is an expert option""" + botocore_config: Config = field(default_factory=lambda: no_default_checksum_config) + """The role ARN to be assumed""" + role: Optional[str] = None + """The bucket to store the object into""" + bucket: Optional[str] = None + + def _load_connection_provided_config(self, provider_config: ProviderConfig) -> None: + if self.s3_endpoint is None: + object.__setattr__(self, "s3_endpoint", provider_config["s3_endpoint"]) + + if self.sts_endpoint is None: + object.__setattr__(self, "sts_endpoint", provider_config["sts_endpoint"]) + + if self.role is None: + object.__setattr__(self, "role", provider_config["role"]) + + if self.bucket is None: + object.__setattr__(self, "bucket", provider_config["bucket"]) + + def should_trace(self) -> bool: + return self.trace_id != DISABLE_TRACING_TRACE_ID + + def build_client(self, service_name: str, session_kwargs: Optional[dict] = None): + """ + Build a boto3 client for an OpenEO service provider. 
+ + service_name is the service you want to consume: s3|sts + session_kwargs: a dictionary with keyword arguments that will be passed when creating the boto session + """ + session_kwargs = session_kwargs or {} + session = boto3.Session(region_name=self._get_storage_region(), **session_kwargs) + client = session.client( + service_name, + endpoint_url=self._get_endpoint_url(service_name), + config=self.botocore_config, + ) + if self.should_trace(): + add_trace_id(client, self.trace_id) + return client + + @staticmethod + def _remove_protocol_from_uri(uri: str): + uri_separator = "://" + idx = uri.find(uri_separator) + if idx < 0: + raise ValueError("_remove_protocol_from_uri must be of form protocol://...") + return uri[idx + len(uri_separator) :] + + def _get_storage_region(self) -> str: + """ + S3 URIs follow the convention detailed on https://docs.aws.amazon.com/general/latest/gr/s3.html + """ + s3_names = ["s3", "s3-fips"] + reserved_words = ["dualstack", "prod", "stag", "dev"] + s3_endpoint_parts = self._remove_protocol_from_uri(self.s3_endpoint).split(".") + for s3_name in s3_names: + try: + old_idx = s3_endpoint_parts.index(s3_name) + idx = old_idx + 1 + while idx != old_idx: + old_idx = idx + for reserved_word in reserved_words: + if s3_endpoint_parts[idx] == reserved_word: + idx += 1 + return s3_endpoint_parts[idx] + except ValueError: + continue + raise ValueError(f"Cannot determine region from {self.s3_endpoint}") + + def _get_endpoint_url(self, service_name: str) -> str: + if service_name == "s3": + return self.s3_endpoint + elif service_name == "sts": + return self.sts_endpoint + raise ValueError(f"Unsupported service {service_name}") + + def add_trace_id_qp_if_needed(self, url: str) -> str: + if not self.should_trace(): + return url + return add_trace_id_as_query_parameter(url, self.trace_id) + + def get_sts_role_arn(self) -> str: + return self.role diff --git a/openeo/extra/artifacts/_s3sts/model.py b/openeo/extra/artifacts/_s3sts/model.py new file 
mode 100644 index 000000000..f39f9cbc1 --- /dev/null +++ b/openeo/extra/artifacts/_s3sts/model.py @@ -0,0 +1,66 @@ +from __future__ import annotations + +import datetime +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from types_boto3_sts.type_defs import AssumeRoleWithWebIdentityResponseTypeDef + +import hashlib +from dataclasses import dataclass +from urllib.parse import urlparse + +from openeo.extra.artifacts.uri import StorageURI + + +@dataclass(frozen=True) +class AWSSTSCredentials: + aws_access_key_id: str + aws_secret_access_key: str + aws_session_token: str + subject_from_web_identity_token: str + expiration: datetime.datetime + + @classmethod + def from_assume_role_response(cls, resp: AssumeRoleWithWebIdentityResponseTypeDef) -> AWSSTSCredentials: + d = resp["Credentials"] + return AWSSTSCredentials( + aws_access_key_id=d["AccessKeyId"], + aws_secret_access_key=d["SecretAccessKey"], + aws_session_token=d["SessionToken"], + subject_from_web_identity_token=resp["SubjectFromWebIdentityToken"], + expiration=d["Expiration"], + ) + + def as_kwargs(self) -> dict: + return { + "aws_access_key_id": self.aws_access_key_id, + "aws_secret_access_key": self.aws_secret_access_key, + "aws_session_token": self.aws_session_token, + } + + def get_user_hash(self) -> str: + hash_object = hashlib.sha1(self.subject_from_web_identity_token.encode()) + return hash_object.hexdigest() + + +@dataclass(frozen=True) +class S3URI(StorageURI): + bucket: str + key: str + + @classmethod + def from_str(cls, uri: str) -> S3URI: + _parsed = urlparse(uri, allow_fragments=False) + if _parsed.scheme != "s3": + raise ValueError(f"Input {uri} is not a valid S3 URI should be of form s3:///") + bucket = _parsed.netloc + if _parsed.query: + key = _parsed.path.lstrip("/") + "?" 
+ _parsed.query + else: + key = _parsed.path.lstrip("/") + + return S3URI(bucket, key) + + def to_string(self) -> str: + return f"s3://{self.bucket}/{self.key}" diff --git a/openeo/extra/artifacts/_s3sts/sts.py b/openeo/extra/artifacts/_s3sts/sts.py new file mode 100644 index 000000000..7473ddd84 --- /dev/null +++ b/openeo/extra/artifacts/_s3sts/sts.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from types_boto3_sts.client import STSClient + +from openeo.extra.artifacts._s3sts.config import S3STSConfig +from openeo.extra.artifacts._s3sts.model import AWSSTSCredentials +from openeo.extra.artifacts.exceptions import ProviderSpecificException +from openeo.rest.auth.auth import BearerAuth +from openeo.rest.connection import Connection +from openeo.util import Rfc3339 + + +class OpenEOSTSClient: + def __init__(self, config: S3STSConfig): + self.config = config + + def assume_from_openeo_connection(self, conn: Connection) -> AWSSTSCredentials: + """ + Takes an OpenEO connection object and returns temporary credentials to interact with S3 + """ + auth = conn.auth + assert auth is not None + if not isinstance(auth, BearerAuth): + raise ProviderSpecificException("Only connections that have BearerAuth can be used.") + auth_token = auth.bearer.split("/") + + return AWSSTSCredentials.from_assume_role_response( + self._get_sts_client().assume_role_with_web_identity( + RoleArn=self._get_aws_access_role(), + RoleSessionName=f"artifact-helper-{Rfc3339().now_utc()}", + WebIdentityToken=auth_token[2], + DurationSeconds=43200, + ) + ) + + def _get_sts_client(self) -> STSClient: + return self.config.build_client("sts") + + def _get_aws_access_role(self) -> str: + return self.config.role diff --git a/openeo/extra/artifacts/_s3sts/tracer.py b/openeo/extra/artifacts/_s3sts/tracer.py new file mode 100644 index 000000000..43e856053 --- /dev/null +++ b/openeo/extra/artifacts/_s3sts/tracer.py @@ -0,0 +1,31 @@ +import logging 
+from typing import Callable + +""" +The trace helps to pass on an X-Request-ID header value in all requests made by a +boto3 call. +""" + +TRACE_ID_KEY = "X-Request-ID" + + +def create_header_adder(request_id: str) -> Callable: + def add_request_id_header(request, **kwargs) -> None: + logger = logging.getLogger("openeo.extra.artifacts") + signature_version = kwargs.get("signature_version", "unknown") + if "query" in signature_version: + logger.debug("Do not add trace header for requests using query parameters instead of headers") + return + logger.debug(f"Adding trace id: {request_id}") + request.headers.add_header(TRACE_ID_KEY, request_id) + + return add_request_id_header + + +def add_trace_id(client, trace_id: str = "") -> None: + header_adder = create_header_adder(trace_id) + client.meta.events.register("before-sign.s3", header_adder) + + +def add_trace_id_as_query_parameter(url, trace_id: str) -> str: + return f"{url}&{TRACE_ID_KEY}={trace_id}" diff --git a/openeo/extra/artifacts/artifact_helper.py b/openeo/extra/artifacts/artifact_helper.py new file mode 100644 index 000000000..3538dce2e --- /dev/null +++ b/openeo/extra/artifacts/artifact_helper.py @@ -0,0 +1,50 @@ +from typing import Dict, Optional, Type + +from openeo import Connection +from openeo.extra.artifacts._s3sts.artifact_helper import S3STSArtifactHelper +from openeo.extra.artifacts._s3sts.config import S3STSConfig +from openeo.extra.artifacts.artifact_helper_abc import ( + ArtifactHelperABC, + ArtifactHelperBuilderABC, +) +from openeo.extra.artifacts.backend import ArtifactCapabilities +from openeo.extra.artifacts.config import StorageConfig +from openeo.extra.artifacts.exceptions import UnsupportedArtifactsType + +config_to_helper: Dict[Type[StorageConfig], Type[ArtifactHelperABC]] = {S3STSConfig: S3STSArtifactHelper} +config_type_to_helper: Dict[str, Type[ArtifactHelperABC]] = { + StorageConfig.get_type_from(cfg): helper for cfg, helper in config_to_helper.items() +} + +class 
ArtifactHelper(ArtifactHelperBuilderABC): + @classmethod + def from_openeo_connection(cls, conn: Connection, config: Optional[StorageConfig] = None) -> ArtifactHelperABC: + """ + Create an artifactHelper for an openEO backend. + + :param conn ``openeo.Connection`` connection to an openEOBackend + :param config: Optional object to specify configuration for Artifact storage + + :return: An Artifact helper based on info provided by the backend . + + Example usage: + ``` + from openeo.extra.artifacts import ArtifactHelper + + artifact_helper = ArtifactHelper.from_openeo_connection(connection) + storage_uri = artifact_helper.upload_file(object_name, src_file_path) + presigned_uri = artifact_helper.get_presigned_url(storage_uri) + ``` + """ + if config is None: + config_type = ArtifactCapabilities(conn).get_preferred_artifacts_provider().get_type() + else: + config_type = config.get_type() + + try: + artifact_helper = config_type_to_helper[config_type] + return artifact_helper.from_openeo_connection( + conn, ArtifactCapabilities(conn).get_preferred_artifacts_provider(), config=config + ) + except KeyError as ke: + raise UnsupportedArtifactsType(config_type) from ke diff --git a/openeo/extra/artifacts/artifact_helper_abc.py b/openeo/extra/artifacts/artifact_helper_abc.py new file mode 100644 index 000000000..66b217413 --- /dev/null +++ b/openeo/extra/artifacts/artifact_helper_abc.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Optional + +from openeo.extra.artifacts.backend import ProviderConfig +from openeo.extra.artifacts.config import StorageConfig +from openeo.extra.artifacts.uri import StorageURI +from openeo.rest.connection import Connection + + +class ArtifactHelperBuilderABC(ABC): + @classmethod + @abstractmethod + def from_openeo_connection(cls, conn: Connection, config: Optional[StorageConfig] = None) -> ArtifactHelperABC: + """ + Builder pattern, only used for 
implementing support for other Artifact stores. + """ + raise NotImplementedError("ArtifactHelperBuilders must have their own implementation") + + +class ArtifactHelperABC(ABC): + @classmethod + def from_openeo_connection( + cls, conn: Connection, provider_config: ProviderConfig, *, config: Optional[StorageConfig] = None + ) -> ArtifactHelperABC: + """ + Create a new Artifact helper from the OpenEO connection. This is the starting point to upload artifacts. + Each implementation has its own builder + """ + if config is None: + config = cls._get_default_storage_config() + config.load_connection_provided_config(provider_config) + return cls._from_openeo_connection(conn, config) + + @abstractmethod + def upload_file(self, path: str | Path, object_name: str = "") -> StorageURI: + """ + A method to store an artifact remotely and get a URI understandable by the OpenEO processor + """ + + @abstractmethod + def get_presigned_url(self, storage_uri: StorageURI, expires_in_seconds: int) -> str: + """ + A method to convert a StorageURI to a signed https URL which can be accessed via normal http libraries. + + These URIs should be kept secret as they provide access to the data. + """ + + def __init__(self, config: StorageConfig): + if not config.is_openeo_connection_metadata_loaded(): + raise RuntimeError("config should have openeo connection metadata loaded prior to initialization.") + self._config = config + + @classmethod + @abstractmethod + def _get_default_storage_config(cls) -> StorageConfig: + """ + A method that provides a default storage config for the Artifact Helper + """ + + @classmethod + @abstractmethod + def _from_openeo_connection(cls, conn: Connection, config: StorageConfig) -> ArtifactHelperABC: + """ + The implementation that creates an artifact helper. This method takes a config which has already been + initialized from the metadata of the OpenEO connection. 
+ """ diff --git a/openeo/extra/artifacts/backend.py b/openeo/extra/artifacts/backend.py new file mode 100644 index 000000000..d82d029a8 --- /dev/null +++ b/openeo/extra/artifacts/backend.py @@ -0,0 +1,113 @@ +from __future__ import annotations + +import logging +from typing import Any, Dict, List, Optional, TypedDict + +from openeo import Connection +from openeo.extra.artifacts.exceptions import ( + ArtifactsException, + InvalidProviderConfig, + NoAdvertisedProviders, + NoDefaultConfig, +) + +_capabilities_cache: Dict[str, Dict] = {} +_log = logging.getLogger(__name__) + + +class TProviderConfig(TypedDict): + """The ID of the provider config not used in MVP""" + + id: str + """The type of artifacts storage which defines how to interact with the storage""" + type: str + """The config for the artifacts storage this will depend on the type""" + config: Dict[str, Any] + + +class ProviderConfig: + """ + Configuration as provided by the OpenEO backend. + It holds an exception if no such configuration is retrievable. 
+ """ + + def __init__(self, id: str, type: str, config: dict, *, exc: Optional[Exception] = None): + self.id = id + self.type = type + self.config: dict = config + self.exc: Exception = exc + + @classmethod + def from_typed_dict(cls, d: TProviderConfig) -> ProviderConfig: + try: + return cls( + id=d["id"], + type=d["type"], + config=d["config"], + ) + except KeyError as ke: + raise InvalidProviderConfig("Provider config needs id, type and config fields.") from ke + + @classmethod + def from_exception(cls, exc: Exception) -> ProviderConfig: + return cls(id="undefined", type="undefined", config={}, exc=exc) + + def raise_if_needed(self, operation: str): + """Check if operation can be done if not raise an exception""" + if self.exc is not None: + _log.error(f"Trying to {operation} for backend config which was not available") + raise self.exc + + def get_key(self, key: str) -> Any: + self.raise_if_needed(f"get key {key}") + try: + return self.config[key] + except KeyError as ke: + raise NoDefaultConfig(key) from ke + + def get_type(self) -> str: + self.raise_if_needed("get type") + return self.type + + def __getitem__(self, key): + return self.get_key(key) + + + +class ProvidersConfig(TypedDict): + providers: List[TProviderConfig] + + +class TArtifactsCapabilty(TypedDict): + artifacts: TypedDict + + +class ArtifactCapabilities: + def __init__(self, conn: Connection): + self.conn = conn + + def _get_artifacts_capabilities(self) -> ProvidersConfig: + """ + Get the artifacts capabilities corresponding to the OpenEO connection + """ + url = self.conn.root_url + if url not in _capabilities_cache: + try: + _capabilities_cache[url] = self.conn.get("/").json()["artifacts"] + except KeyError: + raise NoAdvertisedProviders() + return _capabilities_cache[url] + + def _get_artifacts_providers(self) -> List[TProviderConfig]: + try: + return self._get_artifacts_capabilities()["providers"] + except KeyError as e: + raise NoAdvertisedProviders() from e + + def 
get_preferred_artifacts_provider(self) -> ProviderConfig: + try: + return ProviderConfig.from_typed_dict(self._get_artifacts_providers()[0]) + except IndexError: + return ProviderConfig("id", "type", {}, exc=NoAdvertisedProviders()) + except ArtifactsException as e: + return ProviderConfig("id", "type", {}, exc=e) diff --git a/openeo/extra/artifacts/config.py b/openeo/extra/artifacts/config.py new file mode 100644 index 000000000..76e938b14 --- /dev/null +++ b/openeo/extra/artifacts/config.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Any + +from openeo.extra.artifacts.backend import ProviderConfig + +_METADATA_LOADED = "_sc_metadata_loaded" + + +class StorageConfig(ABC): + """ + Storage config allows overriding configuration for the interaction with the backend storage. + It greatly depends on the type of storage so the enforced API is limited to load metadata using the connection. + """ + @abstractmethod + def _load_connection_provided_config(self, provider_config: ProviderConfig) -> None: + """ + Implementations implement their logic of adapting config based on metadata from the current OpenEO connection. + + The config depends on the storage type so here it can deal with settings specific for this type of storage. + """ + + @staticmethod + def get_type_from(cls: Any) -> str: + """The type is the name of the implementing class so that is the first object used for method resolution""" + return cls.__mro__[0].__name__ + + @classmethod + def get_type(cls) -> str: + """Return the storage config type""" + return cls.get_type_from(cls) + + def load_connection_provided_config(self, provider_config: ProviderConfig) -> None: + """ + This is the method that is actually used to load metadata. Metadata is only loaded once. 
+ """ + if not self.is_openeo_connection_metadata_loaded(): + self._load_connection_provided_config(provider_config) + object.__setattr__(self, _METADATA_LOADED, True) + + def is_openeo_connection_metadata_loaded(self) -> bool: + """ + This is a helper to check whether metadata is loaded. + """ + if not hasattr(self, _METADATA_LOADED): + return False + return getattr(self, _METADATA_LOADED) diff --git a/openeo/extra/artifacts/exceptions.py b/openeo/extra/artifacts/exceptions.py new file mode 100644 index 000000000..d6f99900c --- /dev/null +++ b/openeo/extra/artifacts/exceptions.py @@ -0,0 +1,42 @@ +class ArtifactsException(Exception): + """ + Family of exceptions related to artifacts + """ + + +class NoAdvertisedProviders(ArtifactsException): + """ + The providers list is empty + """ + + def __str__(self): + return """ +The OpenEO backend used does not advertise providers for managing artifacts. +""".lstrip() + + +class UnsupportedArtifactsType(ArtifactsException): + """ + The artifacts type is not supported + """ + + def __init__(self, type_id: str): + self.type_id = type_id + + def __str__(self): + return f"The OpenEO backend does not support {self.type_id}" + + +class NoDefaultConfig(ArtifactsException): + def __init__(self, key: str): + self.key = key + + def __str__(self): + return f"There was no default config provided by backend for {self.key}" + + +class InvalidProviderConfig(ArtifactsException): + """The backend has an invalid provider config. This must be fixed by the provider of the backend.""" + + +class ProviderSpecificException(ArtifactsException): ... diff --git a/openeo/extra/artifacts/uri.py b/openeo/extra/artifacts/uri.py new file mode 100644 index 000000000..1bbdcf9b1 --- /dev/null +++ b/openeo/extra/artifacts/uri.py @@ -0,0 +1,18 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod + + +class StorageURI(ABC): + """A URI that is specific to a storage backend. 
The protocol determines what this URL looks like""" + @classmethod + @abstractmethod + def from_str(cls, uri: str) -> StorageURI: + """factory method to create a typed object from its string representation""" + + def __str__(self): + return self.to_string() + + @abstractmethod + def to_string(self) -> str: + raise NotImplementedError("Implementation must implement explicit handling.") \ No newline at end of file diff --git a/setup.py b/setup.py index 9a4770793..0ead90a9b 100644 --- a/setup.py +++ b/setup.py @@ -55,6 +55,13 @@ "ipython", ] +artifacts_require = [ + "boto3", + "botocore" +] + +typing_requires = ["types-boto3-s3", "types-boto3-sts"] + name = "openeo" setup( name=name, @@ -84,14 +91,15 @@ "importlib_resources; python_version<'3.9'", ], extras_require={ - "tests": tests_require, - "dev": tests_require + docs_require, + "tests": tests_require + artifacts_require, + "dev": tests_require + docs_require + typing_requires + artifacts_require, "docs": docs_require, "oschmod": [ # install oschmod even when platform is not Windows, e.g. for testing in CI. "oschmod>=0.3.12" ], "localprocessing": localprocessing_require, "jupyter": jupyter_require, + "artifacts": artifacts_require, }, entry_points={ "console_scripts": ["openeo-auth=openeo.rest.auth.cli:main"], diff --git a/tests/extra/artifacts/__init__.py b/tests/extra/artifacts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/extra/artifacts/conftest.py b/tests/extra/artifacts/conftest.py new file mode 100644 index 000000000..477ded327 --- /dev/null +++ b/tests/extra/artifacts/conftest.py @@ -0,0 +1,29 @@ +from typing import Iterator + +import pytest + +from openeo import Connection +from tests.rest.conftest import API_URL + + +@pytest.fixture +def extra_api_capabilities() -> dict: + """ + Fixture to be overridden for customizing the capabilities doc used by connection fixtures. 
+ To be used as kwargs for `build_capabilities` + """ + return {} + + +@pytest.fixture +def conn_with_extra_capabilities(requests_mock, extra_api_capabilities) -> Iterator[Connection]: + requests_mock.get(API_URL, json={"api_version": "1.0.0", **extra_api_capabilities}) + yield Connection(API_URL) + + +@pytest.fixture +def clean_capabilities_cache() -> Iterator[None]: + from openeo.extra.artifacts.backend import _capabilities_cache + + _capabilities_cache.clear() + yield diff --git a/tests/extra/artifacts/internal_s3/__init__.py b/tests/extra/artifacts/internal_s3/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/extra/artifacts/internal_s3/test_config.py b/tests/extra/artifacts/internal_s3/test_config.py new file mode 100644 index 000000000..a87499aea --- /dev/null +++ b/tests/extra/artifacts/internal_s3/test_config.py @@ -0,0 +1,34 @@ +import pytest + +from openeo.extra.artifacts._s3sts.config import S3STSConfig + + +@pytest.mark.parametrize("s3_endpoint_uri,expected", [ + ("https://s3.us-east-2.amazonaws.com", "us-east-2"), + ("https://s3.dualstack.us-east-2.amazonaws.com", "us-east-2"), + ("https://s3-fips.dualstack.us-east-2.amazonaws.com", "us-east-2"), + ("https://s3-fips.us-east-2.amazonaws.com", "us-east-2"), + ("https://s3.waw3-1.openeo.v1.dataspace.copernicus.eu", "waw3-1"), + ("https://s3.prod.waw3-1.openeo.v1.dataspace.copernicus.eu", "waw3-1"), + ("https://s3.stag.waw3-1.openeo.v1.dataspace.copernicus.eu", "waw3-1"), + ("https://s3.dev.waw3-1.openeo.v1.dataspace.copernicus.eu", "waw3-1") +]) +def test_region_should_be_derived_correctly(s3_endpoint_uri: str, expected: str): + # GIVEN config with a certain endpoint + config = S3STSConfig(s3_endpoint=s3_endpoint_uri) + + # WHEN a region is extracted + region = config._get_storage_region() + + # THEN is is the expected + assert region == expected, f"Got {region}, expected {expected}" + + +def test_extracting_region_from_invalid_url_must_give_value_error(): + # GIVEN a config 
with invalid url
+    config = S3STSConfig(s3_endpoint="https://www.google.com")
+
+    # WHEN a region is extracted
+    # THEN a Value error is raised
+    with pytest.raises(ValueError):
+        config._get_storage_region()
diff --git a/tests/extra/artifacts/internal_s3/test_s3sts.py b/tests/extra/artifacts/internal_s3/test_s3sts.py
new file mode 100644
index 000000000..0360c6399
--- /dev/null
+++ b/tests/extra/artifacts/internal_s3/test_s3sts.py
@@ -0,0 +1,129 @@
+from __future__ import annotations
+
+import dataclasses
+import datetime
+from typing import TYPE_CHECKING, Iterator
+from unittest.mock import Mock
+
+import pytest
+
+if TYPE_CHECKING:
+    from types_boto3_sts.type_defs import AssumeRoleWithWebIdentityResponseTypeDef
+
+from openeo import Connection
+from openeo.extra.artifacts import ArtifactHelper
+from openeo.extra.artifacts._s3sts.artifact_helper import S3STSArtifactHelper
+from openeo.extra.artifacts._s3sts.config import S3STSConfig
+from openeo.extra.artifacts._s3sts.sts import OpenEOSTSClient
+from openeo.rest.auth.auth import BearerAuth
+from tests.rest.conftest import API_URL
+
+fake_creds_response: AssumeRoleWithWebIdentityResponseTypeDef = {
+    "Credentials": {
+        "AccessKeyId": "akid",
+        "SecretAccessKey": "secret",
+        "SessionToken": "token",
+        # TODO: go for datetime.datetime.now(tz=datetime.UTC) once 3.10 support is no longer needed
+        "Expiration": datetime.datetime.utcnow() + datetime.timedelta(hours=1),
+    },
+    "SubjectFromWebIdentityToken": "tokensubject",
+    "AssumedRoleUser": {"AssumedRoleId": "1", "Arn": "not important"},
+    "PackedPolicySize": 10,
+    "Provider": "notImportant",
+    "Audience": "notImportant",
+    "SourceIdentity": "",
+    "ResponseMetadata": {
+        "RequestId": "0000-00",
+        "HTTPStatusCode": 200,
+        "HTTPHeaders": {},
+        "RetryAttempts": 0,
+        "HostId": "1",
+    },
+}
+
+
+@pytest.fixture
+def mocked_sts(monkeypatch):
+    mocked_sts_client = Mock(["assume_role_with_web_identity"])
+    mocked_sts_client.assume_role_with_web_identity.return_value = fake_creds_response
+    monkeypatch.setattr(OpenEOSTSClient, "_get_sts_client", Mock(return_value=mocked_sts_client))
+    yield mocked_sts_client
+
+
+test_p_bucket_name = "openeo-artifacts"
+test_p_role_arn = "arn:aws:iam::000000000000:role/S3Access"
+test_p_s3_endpoint = "https://s3.oeo.test"
+test_p_sts_endpoint = "https://sts.oeo.test"
+test_p_config = {
+    "bucket": test_p_bucket_name,
+    "role": test_p_role_arn,
+    "s3_endpoint": test_p_s3_endpoint,
+    "sts_endpoint": test_p_sts_endpoint,
+}
+
+
+@pytest.fixture
+def conn_with_stss3_capabilities(requests_mock, extra_api_capabilities) -> Iterator[Connection]:
+    extra_api_capabilities = {
+        "artifacts": {"providers": [{"config": test_p_config, "id": "s3", "type": "S3STSConfig"}]}
+    }
+    requests_mock.get(API_URL, json={"api_version": "1.0.0", **extra_api_capabilities})
+    conn = Connection(API_URL)
+    conn.auth = BearerAuth("oidc/fake/token")
+    yield conn
+
+
+def test_backend_provided_settings_s3sts(clean_capabilities_cache, conn_with_stss3_capabilities, mocked_sts):
+    # Given a backend that exposes stss3 capabilities (fixture)
+    # When creating an artifacthelper without specifying config
+    ah = ArtifactHelper.from_openeo_connection(conn_with_stss3_capabilities, None)
+    # Then the artifact helper is of the expected instance
+    assert isinstance(ah, S3STSArtifactHelper)
+    # Then the config is of the expected type
+    assert isinstance(ah.config, S3STSConfig)
+    # And the config contains the backend provided settings
+    assert test_p_bucket_name == ah.config.bucket
+    assert test_p_role_arn == ah.config.role
+    assert test_p_sts_endpoint == ah.config.sts_endpoint
+    assert test_p_s3_endpoint == ah.config.s3_endpoint
+
+
+# Custom overrides
+test_c_bucket_name = "openeo-custom-artifacts"
+test_c_role_arn = "arn:aws:iam::000000000000:role/S3Artifacts"
+test_c_s3_endpoint = "https://s3.oeo2.test"
+test_c_sts_endpoint = "https://sts.oeo2.test"
+
+
+def get_stss3_config_default_field_values() -> dict:
+    c = S3STSConfig()
+    return {field_name.name: getattr(c, field_name.name) for field_name in dataclasses.fields(c)}
+
+
+@pytest.mark.parametrize(
+    "overrides",
+    [
+        {
+            "s3_endpoint": test_c_s3_endpoint,
+            "sts_endpoint": test_c_sts_endpoint,
+            "role": test_c_role_arn,
+            "bucket": test_c_bucket_name,
+        }
+    ],
+)
+def test_config_overrides_take_precedence(
+    clean_capabilities_cache, conn_with_stss3_capabilities, mocked_sts, overrides: dict
+):
+    defaults = get_stss3_config_default_field_values()
+    expected_values = {**defaults, **test_p_config, **overrides}
+
+    # Given a backend that exposes stss3 capabilities (fixture)
+    # When creating an artifacthelper without specifying config
+    ah = ArtifactHelper.from_openeo_connection(conn_with_stss3_capabilities, S3STSConfig(**overrides))
+    # Then the artifact helper is of the expected instance
+    assert isinstance(ah, S3STSArtifactHelper)
+    # Then the config is of the expected type
+    assert isinstance(ah.config, S3STSConfig)
+    # And the config contains the backend provided settings
+    for field_name in dataclasses.fields(ah.config):
+        assert expected_values[field_name.name] == getattr(ah.config, field_name.name)
diff --git a/tests/extra/artifacts/internal_s3/test_s3uri.py b/tests/extra/artifacts/internal_s3/test_s3uri.py
new file mode 100644
index 000000000..b320a96da
--- /dev/null
+++ b/tests/extra/artifacts/internal_s3/test_s3uri.py
@@ -0,0 +1,12 @@
+from openeo.extra.artifacts._s3sts.model import S3URI
+
+
+def test_s3uri_serialization_is_idempotent():
+    # GIVEN an S3 URI
+    my_s3_uri = "s3://mybucket/my-key1"
+
+    # WHEN we convert it to an S3URI object
+    s3_obj = S3URI.from_str(my_s3_uri)
+
+    # THEN getting the string value must result in same
+    assert str(s3_obj) == my_s3_uri
diff --git a/tests/extra/artifacts/test_artifact_helper.py b/tests/extra/artifacts/test_artifact_helper.py
new file mode 100644
index 000000000..020f0fad7
--- /dev/null
+++ b/tests/extra/artifacts/test_artifact_helper.py
@@ -0,0 +1,89 @@
+from typing import Type
+
+import pytest
+
+from openeo.extra.artifacts import ArtifactHelper
+from openeo.extra.artifacts.exceptions import (
+    NoAdvertisedProviders,
+    ProviderSpecificException,
+    UnsupportedArtifactsType,
+)
+
+
+@pytest.mark.parametrize(
+    (
+        "description",
+        "extra_api_capabilities",
+        "expected_ex",
+    ),
+    [
+        [
+            """When using the new PresignedS3AssetUrls with ipd in place but not required config in place we should
+            be on old behavior""",
+            {},  # No extra capabilities
+            NoAdvertisedProviders,
+        ],
+        [
+            "When the backend advertises an unsupported provider it should raise an exception",
+            {
+                "artifacts": {
+                    "providers": [
+                        {
+                            "config": {
+                                "bucket": "openeo-artifacts",
+                                "role": "arn:aws:iam::000000000000:role/S3Access",
+                                "s3_endpoint": "https://s3.oeo.test",
+                                "sts_endpoint": "https://sts.oeo.test",
+                            },
+                            "id": "s3",
+                            "type": "NonExistingStorageProvider",
+                        }
+                    ]
+                }
+            },
+            UnsupportedArtifactsType,
+        ],
+        [
+            """When using the S3STS provider a connection requires to have authentication this is not present in this
+            test mocking""",
+            {
+                "artifacts": {
+                    "providers": [
+                        {
+                            "config": {
+                                "bucket": "openeo-artifacts",
+                                "role": "arn:aws:iam::000000000000:role/S3Access",
+                                "s3_endpoint": "https://s3.oeo.test",
+                                "sts_endpoint": "https://sts.oeo.test",
+                            },
+                            "id": "s3",
+                            "type": "S3STSConfig",
+                        }
+                    ]
+                }
+            },
+            ProviderSpecificException,
+        ],
+        [
+            """An artifacts section without providers must raise no providers exception""",
+            {"artifacts": {}},
+            NoAdvertisedProviders,
+        ],
+        [
+            """An artifacts section with an empty providers list must raise no providers exception""",
+            {"artifacts": {"providers": []}},
+            NoAdvertisedProviders,
+        ],
+    ],
+)
+def test_artifacts_raising_exceptions_when_required(
+    clean_capabilities_cache,
+    conn_with_extra_capabilities,
+    description: str,
+    extra_api_capabilities: dict,
+    expected_ex: Type[BaseException],
+):
+    # Given no provided config then the client should raise exceptions when there is no appropriate way of
+    # configuring an Artifact helper
+    with pytest.raises(expected_ex):
+        ArtifactHelper.from_openeo_connection(conn_with_extra_capabilities, None)