From 9ceaf6eb0007ebf36918dbb0e4055a07486d2444 Mon Sep 17 00:00:00 2001
From: Thomas Roeblitz
Date: Sun, 29 Jun 2025 13:19:45 +0200
Subject: [PATCH 01/26] add CI for pytests and code style checks

---
 .flake8                                       | 14 ++++++
 .github/workflows/test-ingest-python-code.yml | 49 +++++++++++++++++++
 scripts/automated_ingestion/pytest.sh         | 10 ++++
 3 files changed, 73 insertions(+)
 create mode 100644 .flake8
 create mode 100644 .github/workflows/test-ingest-python-code.yml
 create mode 100644 scripts/automated_ingestion/pytest.sh

diff --git a/.flake8 b/.flake8
new file mode 100644
index 00000000..b6b309e3
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,14 @@
+# This file is part of the EESSI filesystem layer,
+# see https://github.com/EESSI/filesystem-layer
+#
+# author: Thomas Roeblitz (@trz42)
+#
+# license: GPLv2
+#
+
+[flake8]
+max-line-length = 120
+
+# ignore "Black would make changes" produced by flake8-black
+# see also https://github.com/houndci/hound/issues/1769
+extend-ignore = BLK100
diff --git a/.github/workflows/test-ingest-python-code.yml b/.github/workflows/test-ingest-python-code.yml
new file mode 100644
index 00000000..3ed0e692
--- /dev/null
+++ b/.github/workflows/test-ingest-python-code.yml
@@ -0,0 +1,49 @@
+# This file is part of the EESSI filesystem layer,
+# see https://github.com/EESSI/filesystem-layer
+#
+# author: Thomas Roeblitz (@trz42)
+#
+# license: GPLv2
+#
+
+name: Run tests
+on: [push, pull_request]
+# Declare default permissions as read only.
+permissions: read-all
+jobs:
+  test:
+    runs-on: ubuntu-24.04
+    strategy:
+      matrix:
+        # for now, only test with Python 3.9+ (since we're testing in Ubuntu 24.04)
+        #python: [3.6, 3.7, 3.8, 3.9, '3.10', '3.11']
+        python: ['3.9', '3.10', '3.11']
+      fail-fast: false
+    steps:
+      - name: checkout
+        uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0
+
+      - name: set up Python
+        uses: actions/setup-python@13ae5bb136fac2878aff31522b9efb785519f984 # v4.3.0
+        with:
+          python-version: ${{matrix.python}}
+
+      - name: Install required Python packages + pytest + flake8
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install -r requirements.txt
+          python -m pip install pytest
+          python -m pip install --upgrade flake8
+
+      - name: Run test suite (without coverage)
+        run: |
+          ./scripts/automated_ingestion/pytest.sh scripts/automated_ingestion -verbose
+
+      - name: Run test suite (with coverage)
+        run: |
+          python -m pip install pytest-cov
+          ./scripts/automated_ingestion/pytest.sh scripts/automated_ingestion -q --cov=$PWD
+
+      - name: Run flake8 to verify PEP8-compliance of Python code
+        run: |
+          flake8 scripts/automated_ingestion
\ No newline at end of file
diff --git a/scripts/automated_ingestion/pytest.sh b/scripts/automated_ingestion/pytest.sh
new file mode 100644
index 00000000..f8b4e170
--- /dev/null
+++ b/scripts/automated_ingestion/pytest.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+#
+# This file is part of the EESSI filesystem layer,
+# see https://github.com/EESSI/filesystem-layer
+#
+# author: Thomas Roeblitz (@trz42)
+#
+# license: GPLv2
+#
+PYTHONPATH=$PWD:$PYTHONPATH pytest --capture=no "$@"
\ No newline at end of file
From e974676f6d3fbbf5c13c0467b9ab4cb98ba87e6a Mon Sep 17 00:00:00 2001
From: Thomas Roeblitz
Date: Sun, 29 Jun 2025 13:23:36 +0200
Subject: [PATCH 02/26] exclude existing *.py files from flake8 tests

---
 .github/workflows/test-ingest-python-code.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test-ingest-python-code.yml b/.github/workflows/test-ingest-python-code.yml
index 3ed0e692..da6d28d6 100644
--- a/.github/workflows/test-ingest-python-code.yml
+++ b/.github/workflows/test-ingest-python-code.yml
@@ -46,4 +46,4 @@ jobs:
 
       - name: Run flake8 to verify PEP8-compliance of Python code
         run: |
-          flake8 scripts/automated_ingestion
\ No newline at end of file
+          flake8 scripts/automated_ingestion --exclude=scripts/automated_ingestion/automated_ingestion.py,scripts/automated_ingestion/eessitarball.py
\ No newline at end of file
From c213d704c67bb117e6b5f9bd4c533ca0bb5a4e55 Mon Sep 17 00:00:00 2001
From: Thomas Roeblitz
Date: Sun, 29 Jun 2025 13:24:23 +0200
Subject: [PATCH 03/26] change permission for pytest.sh script

---
 scripts/automated_ingestion/pytest.sh | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 scripts/automated_ingestion/pytest.sh

diff --git a/scripts/automated_ingestion/pytest.sh b/scripts/automated_ingestion/pytest.sh
old mode 100644
new mode 100755
From cfce28d39b8d0f8ef64dfcddd7239283de35a5b4 Mon Sep 17 00:00:00 2001
From: Thomas Roeblitz
Date: Sun, 29 Jun 2025 13:26:31 +0200
Subject: [PATCH 04/26] logging functions in separate module

---
 scripts/automated_ingestion/eessi_logging.py | 246 +++++++++++++++++++
 1 file changed, 246 insertions(+)
 create mode 100644 scripts/automated_ingestion/eessi_logging.py

diff --git a/scripts/automated_ingestion/eessi_logging.py b/scripts/automated_ingestion/eessi_logging.py
new file mode 100644
index 00000000..ab5947c8
--- /dev/null
+++ b/scripts/automated_ingestion/eessi_logging.py
@@ -0,0 +1,246 @@
+import functools
+import inspect
+import logging
+import os
+import sys
+import time
+
+from enum import IntFlag, auto
+from typing import Callable, Union
+
+
+class LoggingScope(IntFlag):
+    """Enumeration of different logging scopes."""
+    NONE = 0
+    FUNC_ENTRY_EXIT = auto()  # Function entry/exit logging
+    DOWNLOAD = auto()  # Logging related to file downloads
+    VERIFICATION = auto()  # Logging related to signature and checksum verification
+    STATE_OPS = auto()  # Logging related to tarball state operations
+    GITHUB_OPS = auto()  # Logging related to GitHub operations (PRs, issues, etc.)
+    GROUP_OPS = auto()  # Logging related to tarball group operations
+    TASK_OPS = auto()  # Logging related to task operations
+    ERROR = auto()  # Error logging (separate from other scopes for easier filtering)
+    DEBUG = auto()  # Debug-level logging (separate from other scopes for easier filtering)
+    ALL = (FUNC_ENTRY_EXIT | DOWNLOAD | VERIFICATION | STATE_OPS |
+           GITHUB_OPS | GROUP_OPS | TASK_OPS | ERROR | DEBUG)
+
+
+# Global setting for logging scopes
+ENABLED_LOGGING_SCOPES = LoggingScope.NONE
+
+
+# Global variable to track call stack depth
+_call_stack_depth = 0
+
+
+def is_logging_scope_enabled(scope: LoggingScope) -> bool:
+    """Check if a specific logging scope is enabled."""
+    return bool(ENABLED_LOGGING_SCOPES & scope)
+
+
+def log_function_entry_exit(logger: logging.Logger = None) -> Callable:
+    """
+    Decorator that logs function entry and exit with timing information.
+    Only logs if the FUNC_ENTRY_EXIT scope is enabled.
+
+    Args:
+        logger: Optional logger instance. If not provided, uses the module's logger.
+ """ + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + global _call_stack_depth + + if not is_logging_scope_enabled(LoggingScope.FUNC_ENTRY_EXIT): + return func(*args, **kwargs) + + if logger is None: + log = logging.getLogger(func.__module__) + else: + log = logger + + # Get context information if available + context = "" + if len(args) > 0 and hasattr(args[0], 'object'): + # For EessiTarball methods, show the tarball name and state + tarball = args[0] + filename = os.path.basename(tarball.object) + + # Format filename to show important parts + if len(filename) > 30: + parts = filename.split('-') + if len(parts) >= 6: # Ensure we have all required parts + # Get version, component, last part of architecture, and epoch + version = parts[1] + component = parts[2] + arch_last = parts[-2].split('-')[-1] # Last part of architecture + epoch = parts[-1] # includes file extension + filename = f"{version}-{component}-{arch_last}-{epoch}" + else: + # Fallback to simple truncation if format doesn't match + filename = f"{filename[:15]}...{filename[-12:]}" + + context = f" [{filename}" + if hasattr(tarball, 'state'): + context += f" in {tarball.state}" + context += "]" + + # Create indentation based on call stack depth + indent = " " * _call_stack_depth + + # Get file name and line number where the function is defined + file_name = os.path.basename(inspect.getsourcefile(func)) + source_lines, start_line = inspect.getsourcelines(func) + # Find the line with the actual function definition + def_line = next(i for i, line in enumerate(source_lines) if line.strip().startswith('def ')) + def_line_no = start_line + def_line + # Find the last non-empty line of the function + last_line = next(i for i, line in enumerate(reversed(source_lines)) if line.strip()) + last_line_no = start_line + len(source_lines) - 1 - last_line + + start_time = time.time() + log.info(f"{indent}[FUNC_ENTRY_EXIT] Entering {func.__name__} at {file_name}:{def_line_no}{context}") + _call_stack_depth += 1 + try: + result = func(*args, **kwargs) + _call_stack_depth -= 1 + end_time = time.time() + # For normal returns, show the last line of the function + log.info(f"{indent}[FUNC_ENTRY_EXIT] Leaving {func.__name__} at {file_name}:{last_line_no}" + f"{context} (took {end_time - start_time:.2f}s)") + return result + except Exception as err: + _call_stack_depth -= 1 + end_time = time.time() + # For exceptions, try to get the line number from the exception + try: + exc_line_no = err.__traceback__.tb_lineno + except AttributeError: + exc_line_no = last_line_no + log.info(f"{indent}[FUNC_ENTRY_EXIT] Leaving {func.__name__} at {file_name}:{exc_line_no}" + f"{context} with exception (took {end_time - start_time:.2f}s)") + raise err + return wrapper + return decorator + + +def log_message(scope, level, msg, *args, logger=None, **kwargs): + """ + Log a message if either: + 1. The specified scope is enabled, OR + 2. The current log level is equal to or higher than the specified level + + Args: + scope: LoggingScope value indicating which scope this logging belongs to + level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) + msg: Message to log + logger: Optional logger instance. If not provided, uses the root logger. 
+ *args, **kwargs: Additional arguments to pass to the logging function + """ + log = logger or logging.getLogger() + log_level = getattr(logging, level.upper()) + + # Check if either condition is met + if not (is_logging_scope_enabled(scope) or log_level >= log.getEffectiveLevel()): + return + + # Create indentation based on call stack depth + indent = " " * _call_stack_depth + # Add scope to the message + scoped_msg = f"[{scope.name}] {msg}" + indented_msg = f"{indent}{scoped_msg}" + + # If scope is enabled, use the temporary handler + if is_logging_scope_enabled(scope): + # Save original handlers + original_handlers = list(log.handlers) + + # Create a temporary handler that accepts all levels + temp_handler = logging.StreamHandler(sys.stdout) + temp_handler.setLevel(logging.DEBUG) + temp_handler.setFormatter(logging.Formatter('%(levelname)-8s: %(message)s')) + + try: + # Remove existing handlers temporarily + for handler in original_handlers: + log.removeHandler(handler) + + # Add temporary handler + log.addHandler(temp_handler) + + # Log the message + log_func = getattr(log, level.lower()) + log_func(indented_msg, *args, **kwargs) + finally: + log.removeHandler(temp_handler) + # Restore original handlers + for handler in original_handlers: + if handler not in log.handlers: + log.addHandler(handler) + # Only use normal logging if scope is not enabled AND level is high enough + elif not is_logging_scope_enabled(scope) and log_level >= log.getEffectiveLevel(): + # Use normal logging with level check + log_func = getattr(log, level.lower()) + log_func(indented_msg, *args, **kwargs) + + +def set_logging_scopes(scopes: Union[LoggingScope, str, list[str]]) -> None: + """ + Set the enabled logging scopes. + + Args: + scopes: Can be + - A LoggingScope value + - A string with comma-separated values using +/- syntax: + - "+SCOPE" to enable a scope + - "-SCOPE" to disable a scope + - "ALL" or "+ALL" to enable all scopes + - "-ALL" to disable all scopes + Examples: + "+FUNC_ENTRY_EXIT" # Enable only function entry/exit + "+FUNC_ENTRY_EXIT,-EXAMPLE_SCOPE" # Enable function entry/exit but disable example + "+ALL,-FUNC_ENTRY_EXIT" # Enable all scopes except function entry/exit + """ + global ENABLED_LOGGING_SCOPES + + if isinstance(scopes, LoggingScope): + ENABLED_LOGGING_SCOPES = scopes + return + + if isinstance(scopes, str): + # Start with no scopes enabled + ENABLED_LOGGING_SCOPES = LoggingScope.NONE + + # Split into individual scope specifications + scope_specs = [s.strip() for s in scopes.split(",")] + + for spec in scope_specs: + if not spec: + continue + + # Check for ALL special case + if spec.upper() in ["ALL", "+ALL"]: + ENABLED_LOGGING_SCOPES = LoggingScope.ALL + continue + elif spec.upper() == "-ALL": + ENABLED_LOGGING_SCOPES = LoggingScope.NONE + continue + + # Parse scope name and operation + operation = spec[0] + scope_name = spec[1:].strip().upper() + + try: + scope_enum = LoggingScope[scope_name] + if operation == '+': + ENABLED_LOGGING_SCOPES |= scope_enum + elif operation == '-': + ENABLED_LOGGING_SCOPES &= ~scope_enum + else: + logging.warning(f"Invalid operation '{operation}' in scope specification: {spec}") + except KeyError: + logging.warning(f"Unknown logging scope: {scope_name}") + + elif isinstance(scopes, list): + # Convert list to comma-separated string and process + set_logging_scopes(",".join(scopes)) From 1d2c17f3db773c75789d91b1b091635763a9fda4 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 13:53:17 +0200 Subject: [PATCH 05/26] add relative path 
to requirements.txt --- .github/workflows/test-ingest-python-code.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-ingest-python-code.yml b/.github/workflows/test-ingest-python-code.yml index da6d28d6..c0fc979c 100644 --- a/.github/workflows/test-ingest-python-code.yml +++ b/.github/workflows/test-ingest-python-code.yml @@ -31,7 +31,7 @@ jobs: - name: Install required Python packages + pytest + flake8 run: | python -m pip install --upgrade pip - python -m pip install -r requirements.txt + python -m pip install -r scripts/automated_ingestion/requirements.txt python -m pip install pytest python -m pip install --upgrade flake8 From 8d0bb3527908d1703d97a124476a6fc9abcb46c3 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 14:19:20 +0200 Subject: [PATCH 06/26] fix test issues and limit coverage to new python code --- .github/workflows/test-ingest-python-code.yml | 4 ++-- .gitignore | 1 + scripts/automated_ingestion/.coveragerc | 5 +++++ 3 files changed, 8 insertions(+), 2 deletions(-) create mode 100644 scripts/automated_ingestion/.coveragerc diff --git a/.github/workflows/test-ingest-python-code.yml b/.github/workflows/test-ingest-python-code.yml index c0fc979c..9e341783 100644 --- a/.github/workflows/test-ingest-python-code.yml +++ b/.github/workflows/test-ingest-python-code.yml @@ -37,12 +37,12 @@ jobs: - name: Run test suite (without coverage) run: | - ./scripts/automated_ingestion/pytest.sh scripts/automated_ingestion -verbose + ./scripts/automated_ingestion/pytest.sh scripts/automated_ingestion --verbose - name: Run test suite (with coverage) run: | python -m pip install pytest-cov - ./scripts/automated_ingestion/pytest.sh scripts/automated_ingestion -q --cov=$PWD + ./scripts/automated_ingestion/pytest.sh scripts/automated_ingestion -q --cov=scripts/automated_ingestion/eessi_logging.py - name: Run flake8 to verify PEP8-compliance of Python code run: | diff --git a/.gitignore b/.gitignore index 39af2bac..7789e614 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ build hosts +.coverage diff --git a/scripts/automated_ingestion/.coveragerc b/scripts/automated_ingestion/.coveragerc new file mode 100644 index 00000000..ec1f100a --- /dev/null +++ b/scripts/automated_ingestion/.coveragerc @@ -0,0 +1,5 @@ +[run] +omit = + scripts/automated_ingestion/automated_ingestion.py + scripts/automated_ingestion/eessitarball.py + scripts/automated_ingestion/utils.py From 0ede3a8d2a8d5918a9eb69542bf44068847e3ca3 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 15:28:08 +0200 Subject: [PATCH 07/26] add basic tests to avoid coverage errors --- .gitignore | 1 + .../unit_tests/__init__.py | 1 + .../unit_tests/test_basic.py | 27 +++++++++++++++++++ 3 files changed, 29 insertions(+) create mode 100644 scripts/automated_ingestion/unit_tests/__init__.py create mode 100644 scripts/automated_ingestion/unit_tests/test_basic.py diff --git a/.gitignore b/.gitignore index 7789e614..893c00e4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ build hosts .coverage +**/__pycache__ diff --git a/scripts/automated_ingestion/unit_tests/__init__.py b/scripts/automated_ingestion/unit_tests/__init__.py new file mode 100644 index 00000000..467d5dfe --- /dev/null +++ b/scripts/automated_ingestion/unit_tests/__init__.py @@ -0,0 +1 @@ +# This file makes the unit_tests directory a Python package diff --git a/scripts/automated_ingestion/unit_tests/test_basic.py b/scripts/automated_ingestion/unit_tests/test_basic.py new file mode 100644 index 
00000000..7e382cbd --- /dev/null +++ b/scripts/automated_ingestion/unit_tests/test_basic.py @@ -0,0 +1,27 @@ +""" +Basic test file to prevent pytest from failing with exit code 5 when no tests are found. + +This file is part of the EESSI filesystem layer, +see https://github.com/EESSI/filesystem-layer + +author: Thomas Roeblitz (@trz42) + +license: GPLv2 +""" + +import pytest + + +def test_basic_placeholder(): + """Basic placeholder test that always passes.""" + assert True + + +def test_import_modules(): + """Test that we can import the main modules without errors.""" + try: + import eessi_logging + # Verify the modules were imported successfully + assert eessi_logging is not None + except ImportError as err: + pytest.skip(f"Module import failed: {err}") From 712c48c1c3cb043b6026a5ac7d8a11cada0da964 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 15:30:26 +0200 Subject: [PATCH 08/26] skip flake8 for existing files and unit tests --- .flake8 | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.flake8 b/.flake8 index b6b309e3..15beaa74 100644 --- a/.flake8 +++ b/.flake8 @@ -7,8 +7,15 @@ # [flake8] -max-line-length = 120 +exclude = + scripts/check-stratum-servers.py, + scripts/automated_ingestion/automated_ingestion.py, + scripts/automated_ingestion/eessitarball.py, + scripts/automated_ingestion/utils.py, + scripts/automated_ingestion/unit_tests/*.py # ignore "Black would make changes" produced by flake8-black # see also https://github.com/houndci/hound/issues/1769 extend-ignore = BLK100 + +max-line-length = 120 From 7c7d673e571382b162e32a55bb5d44c5009aa4a5 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 15:30:49 +0200 Subject: [PATCH 09/26] skip coverage for unit tests --- scripts/automated_ingestion/.coveragerc | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/automated_ingestion/.coveragerc b/scripts/automated_ingestion/.coveragerc index ec1f100a..2941a1ed 100644 --- a/scripts/automated_ingestion/.coveragerc +++ b/scripts/automated_ingestion/.coveragerc @@ -3,3 +3,4 @@ omit = scripts/automated_ingestion/automated_ingestion.py scripts/automated_ingestion/eessitarball.py scripts/automated_ingestion/utils.py + scripts/automated_ingestion/unit_tests/*.py From eea35ae74e7fbdedd1c9d1c6428ae0f37269679d Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 15:42:35 +0200 Subject: [PATCH 10/26] include unit tests in flake8 run --- .flake8 | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.flake8 b/.flake8 index 15beaa74..852c375b 100644 --- a/.flake8 +++ b/.flake8 @@ -11,8 +11,7 @@ exclude = scripts/check-stratum-servers.py, scripts/automated_ingestion/automated_ingestion.py, scripts/automated_ingestion/eessitarball.py, - scripts/automated_ingestion/utils.py, - scripts/automated_ingestion/unit_tests/*.py + scripts/automated_ingestion/utils.py # ignore "Black would make changes" produced by flake8-black # see also https://github.com/houndci/hound/issues/1769 From 34180df14371e98132e5302d17fc6c2440d7e9eb Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 15:53:49 +0200 Subject: [PATCH 11/26] move LOG_LEVELS and error func to logging module --- scripts/automated_ingestion/eessi_logging.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/scripts/automated_ingestion/eessi_logging.py b/scripts/automated_ingestion/eessi_logging.py index ab5947c8..857d92ca 100644 --- a/scripts/automated_ingestion/eessi_logging.py +++ b/scripts/automated_ingestion/eessi_logging.py 
@@ -9,6 +9,15 @@ from typing import Callable, Union +LOG_LEVELS = { + 'DEBUG': logging.DEBUG, + 'INFO': logging.INFO, + 'WARNING': logging.WARNING, + 'ERROR': logging.ERROR, + 'CRITICAL': logging.CRITICAL +} + + class LoggingScope(IntFlag): """Enumeration of different logging scopes.""" NONE = 0 @@ -33,6 +42,12 @@ class LoggingScope(IntFlag): _call_stack_depth = 0 +def error(msg, code=1): + """Print an error and exit.""" + log_message(LoggingScope.ERROR, 'ERROR', msg) + sys.exit(code) + + def is_logging_scope_enabled(scope: LoggingScope) -> bool: """Check if a specific logging scope is enabled.""" return bool(ENABLED_LOGGING_SCOPES & scope) From 397eca8a1fc9a67526c5b8ebda2fc6215d0ef0f3 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 16:53:49 +0200 Subject: [PATCH 12/26] use shared pid lock file to ensure at most one ingest is active --- scripts/automated_ingestion/automated_ingestion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 92dac552..7a7f9dc8 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -81,7 +81,7 @@ def parse_args(): return args -@pid.decorator.pidfile('automated_ingestion.pid') +@pidfile('shared_lock.pid') # noqa: F401 def main(): """Main function.""" args = parse_args() From a508c9e15987c00767d632a91a20573ee16f1507 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 16:55:17 +0200 Subject: [PATCH 13/26] add main script and modules to list task files --- .../eessi_remote_storage_client.py | 34 +++ .../automated_ingestion/eessi_s3_bucket.py | 191 +++++++++++++ scripts/automated_ingestion/ingest_bundles.py | 264 ++++++++++++++++++ 3 files changed, 489 insertions(+) create mode 100644 scripts/automated_ingestion/eessi_remote_storage_client.py create mode 100644 scripts/automated_ingestion/eessi_s3_bucket.py create mode 100644 scripts/automated_ingestion/ingest_bundles.py diff --git a/scripts/automated_ingestion/eessi_remote_storage_client.py b/scripts/automated_ingestion/eessi_remote_storage_client.py new file mode 100644 index 00000000..9f83d721 --- /dev/null +++ b/scripts/automated_ingestion/eessi_remote_storage_client.py @@ -0,0 +1,34 @@ +from enum import Enum +from typing import Protocol, runtime_checkable + + +class DownloadMode(Enum): + """Enum defining different modes for downloading files.""" + FORCE = 'force' # Always download and overwrite + CHECK_REMOTE = 'check-remote' # Download if remote files have changed + CHECK_LOCAL = 'check-local' # Download if files don't exist locally (default) + + +@runtime_checkable +class EESSIRemoteStorageClient(Protocol): + """Protocol defining the interface for remote storage clients.""" + + def get_metadata(self, remote_path: str) -> dict: + """Get metadata about a remote object. + + Args: + remote_path: Path to the object in remote storage + + Returns: + Dictionary containing object metadata, including 'ETag' key + """ + ... + + def download(self, remote_path: str, local_path: str) -> None: + """Download a remote file to a local location. + + Args: + remote_path: Path to the object in remote storage + local_path: Local path where to save the file + """ + ... 
diff --git a/scripts/automated_ingestion/eessi_s3_bucket.py b/scripts/automated_ingestion/eessi_s3_bucket.py new file mode 100644 index 00000000..bc5a8822 --- /dev/null +++ b/scripts/automated_ingestion/eessi_s3_bucket.py @@ -0,0 +1,191 @@ +import os +from pathlib import Path +from typing import Dict, Optional + +import boto3 +from botocore.exceptions import ClientError +from eessi_logging import log_function_entry_exit, log_message, LoggingScope +from eessi_remote_storage_client import EESSIRemoteStorageClient + + +class EESSIS3Bucket(EESSIRemoteStorageClient): + """EESSI-specific S3 bucket implementation of the EESSIRemoteStorageClient protocol.""" + + @log_function_entry_exit() + def __init__(self, config, bucket_name: str): + """ + Initialize the EESSI S3 bucket. + + Args: + config: Configuration object containing: + - aws.access_key_id: AWS access key ID (optional, can use AWS_ACCESS_KEY_ID env var) + - aws.secret_access_key: AWS secret access key (optional, can use AWS_SECRET_ACCESS_KEY env var) + - aws.endpoint_url: Custom endpoint URL for S3-compatible backends (optional) + - aws.verify: SSL verification setting (optional) + - True: Verify SSL certificates (default) + - False: Skip SSL certificate verification + - str: Path to CA bundle file + bucket_name: Name of the S3 bucket to use + """ + self.bucket = bucket_name + + # get AWS credentials from environment or config + aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID") or config.get("secrets", "aws_access_key_id") + aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY") or config.get("secrets", "aws_secret_access_key") + + # configure boto3 client + client_config = {} + + # add endpoint URL if specified in config + if config.has_option("aws", "endpoint_url"): + client_config["endpoint_url"] = config["aws"]["endpoint_url"] + log_message(LoggingScope.DEBUG, "DEBUG", "Using custom endpoint URL: '%s'", client_config["endpoint_url"]) + + # add SSL verification if specified in config + if config.has_option("aws", "verify"): + verify = config["aws"]["verify"] + if verify.lower() == "false": + client_config["verify"] = False + log_message(LoggingScope.DEBUG, "WARNING", "SSL verification disabled") + elif verify.lower() == "true": + client_config["verify"] = True + else: + client_config["verify"] = verify # assume it's a path to CA bundle + log_message(LoggingScope.DEBUG, "DEBUG", "Using custom CA bundle: '%s'", verify) + + self.client = boto3.client( + "s3", + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + **client_config + ) + log_message(LoggingScope.DEBUG, "INFO", "Initialized S3 client for bucket: '%s'", self.bucket) + + @log_function_entry_exit() + def download(self, remote_path: str, local_path: str) -> None: + """ + Download an S3 object to a local location and store its ETag. 
+ + Args: + remote_path: Path to the object in S3 + local_path: Local path where to save the file + """ + try: + log_message(LoggingScope.DOWNLOAD, "INFO", "Downloading '%s' to '%s'", remote_path, local_path) + self.client.download_file(Bucket=self.bucket, Key=remote_path, Filename=local_path) + log_message(LoggingScope.DOWNLOAD, "INFO", "Successfully downloaded '%s' to '%s'", remote_path, local_path) + except ClientError as err: + log_message(LoggingScope.ERROR, "ERROR", "Failed to download '%s': '%s'", remote_path, str(err)) + raise + + # get metadata first to obtain the ETag + metadata = self.get_metadata(remote_path) + etag = metadata["ETag"] + + # store the ETag + self._write_etag(local_path, etag) + + @log_function_entry_exit() + def download_file(self, key: str, filename: str) -> None: + """ + Download a file from S3 to a local file. + + Args: + key: The S3 key of the file to download + filename: The local path where the file should be saved + """ + self.client.download_file(self.bucket, key, filename) + + @log_function_entry_exit() + def get_bucket_url(self) -> str: + """ + Get the HTTPS URL for a bucket from an initialized boto3 client. + Works with both AWS S3 and MinIO/S3-compatible services. + """ + try: + # check if this is a custom endpoint (MinIO) or AWS S3 + endpoint_url = self.client.meta.endpoint_url + + if endpoint_url: + # custom endpoint (MinIO, DigitalOcean Spaces, etc.) + # most S3-compatible services use path-style URLs + bucket_url = f"{endpoint_url}/{self.bucket}" + else: + # AWS S3 (no custom endpoint specified) + region = self.client.meta.region_name or 'us-east-1' + + # AWS S3 virtual-hosted-style URLs + if region == "us-east-1": + bucket_url = f"https://{self.bucket}.s3.amazonaws.com" + else: + bucket_url = f"https://{self.bucket}.s3.{region}.amazonaws.com" + + return bucket_url + + except Exception as err: + log_message(LoggingScope.ERROR, "ERROR", "Error getting bucket URL: '%s'", str(err)) + return None + + @log_function_entry_exit() + def get_metadata(self, remote_path: str) -> Dict: + """ + Get metadata about an S3 object. + + Args: + remote_path: Path to the object in S3 + + Returns: + Dictionary containing object metadata, including 'ETag' key + """ + try: + log_message(LoggingScope.DEBUG, "DEBUG", "Getting metadata for S3 object: '%s'", remote_path) + response = self.client.head_object(Bucket=self.bucket, Key=remote_path) + log_message(LoggingScope.DEBUG, "DEBUG", "Retrieved metadata for '%s': '%s'", remote_path, response) + return response + except ClientError as err: + log_message(LoggingScope.ERROR, "ERROR", "Failed to get metadata for '%s': '%s'", remote_path, str(err)) + raise + + @log_function_entry_exit() + def _get_etag_file_path(self, local_path: str) -> Path: + """Get the path to the .etag file for a given local file.""" + return Path(local_path).with_suffix(".etag") + + @log_function_entry_exit() + def list_objects_v2(self, **kwargs): + """ + List objects in the bucket using the underlying boto3 client. 
+ + Args: + **kwargs: Additional arguments to pass to boto3.client.list_objects_v2 + + Returns: + Response from boto3.client.list_objects_v2 + """ + return self.client.list_objects_v2(Bucket=self.bucket, **kwargs) + + @log_function_entry_exit() + def _read_etag(self, local_path: str) -> Optional[str]: + """Read the ETag from the .etag file if it exists.""" + etag_path = self._get_etag_file_path(local_path) + if etag_path.exists(): + try: + with open(etag_path, "r") as f: + return f.read().strip() + except Exception as e: + log_message(LoggingScope.DEBUG, "WARNING", "Failed to read ETag file '%s': '%s'", etag_path, str(e)) + return None + return None + + @log_function_entry_exit() + def _write_etag(self, local_path: str, etag: str) -> None: + """Write the ETag to the .etag file.""" + etag_path = self._get_etag_file_path(local_path) + try: + with open(etag_path, "w") as f: + f.write(etag) + log_message(LoggingScope.DEBUG, "DEBUG", "Wrote ETag to '%s'", etag_path) + except Exception as err: + log_message(LoggingScope.ERROR, "ERROR", "Failed to write ETag file '%s': '%s'", etag_path, str(err)) + # if we can't write the etag file, it's not critical + # the file will just be downloaded again next time diff --git a/scripts/automated_ingestion/ingest_bundles.py b/scripts/automated_ingestion/ingest_bundles.py new file mode 100644 index 00000000..f8131213 --- /dev/null +++ b/scripts/automated_ingestion/ingest_bundles.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python3 + +# from eessi_data_object import EESSIDataAndSignatureObject +# from eessi_task import EESSITask, TaskState +# from eessi_task_description import EESSITaskDescription +from eessi_s3_bucket import EESSIS3Bucket +from eessi_logging import error, log_function_entry_exit, log_message, LoggingScope, LOG_LEVELS, set_logging_scopes +from pid.decorator import pidfile # noqa: F401 +from pid import PidFileError + +import argparse +import configparser +# import github +import json +import logging +# import os +import sys +from pathlib import Path +from typing import List + +REQUIRED_CONFIG = { + "secrets": ["aws_secret_access_key", "aws_access_key_id", "github_pat"], + "paths": ["download_dir", "ingestion_script", "metadata_file_extension"], + "aws": ["staging_buckets"], + "github": ["staging_repo", "failed_ingestion_issue_body", "pr_body"], +} + + +@log_function_entry_exit() +def parse_config(path): + """Parse the configuration file.""" + config = configparser.ConfigParser() + try: + config.read(path) + except Exception as err: + error(f"Unable to read configuration file '{path}'!\nException: '{err}'") + + # check if all required configuration parameters/sections can be found + for section in REQUIRED_CONFIG.keys(): + if section not in config: + error(f"Missing section '{section}' in configuration file '{path}'.") + for item in REQUIRED_CONFIG[section]: + if item not in config[section]: + error(f"Missing configuration item '{item}' in section '{section}' of configuration file '{path}'.") + + return config + + +@log_function_entry_exit() +def parse_args(): + """Parse the command-line arguments.""" + parser = argparse.ArgumentParser() + + # logging options + logging_group = parser.add_argument_group("Logging options") + logging_group.add_argument("--log-file", + help="Path to log file (overrides config file setting)") + logging_group.add_argument("--console-level", + choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], + help="Logging level for console output (overrides config file setting)") + logging_group.add_argument("--file-level", + 
choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], + help="Logging level for file output (overrides config file setting)") + logging_group.add_argument("--quiet", + action="store_true", + help="Suppress console output (overrides all other console settings)") + logging_group.add_argument("--log-scopes", + help="Comma-separated list of logging scopes using +/- syntax. " + "Examples: '+FUNC_ENTRY_EXIT' (enable only function entry/exit), " + "'+ALL,-FUNC_ENTRY_EXIT' (enable all except function entry/exit), " + "'+FUNC_ENTRY_EXIT,-EXAMPLE_SCOPE' (enable function entry/exit but disable example)") + + # existing arguments + parser.add_argument("-c", "--config", type=str, help="path to configuration file", + default="ingest_bundles.cfg", dest="config") + parser.add_argument("-d", "--debug", help="enable debug mode", action="store_true", dest="debug") + parser.add_argument("-l", "--list", help="only list available tasks", action="store_true", dest="list_only") + parser.add_argument("--extensions", help="comma-separated list of extensions to process (default: .task)", + nargs="?", const=".task", default=False) + + return parser.parse_args() + + +@log_function_entry_exit() +def setup_logging(config: configparser.ConfigParser, args: argparse.Namespace) -> logging.Logger: + """ + Configure logging based on configuration file and command line arguments. + Command line arguments take precedence over config file settings. + + Args: + config: Configuration parser + args: Parsed command line arguments + + Returns: + Logger instance + """ + # get settings from config file + log_file = config["logging"].get("log_file") + config_console_level = LOG_LEVELS.get(config["logging"].get("console_level", "INFO").upper(), logging.INFO) + config_file_level = LOG_LEVELS.get(config["logging"].get("file_level", "DEBUG").upper(), logging.DEBUG) + + # override with command line arguments if provided + log_file = args.log_file if args.log_file else log_file + console_level = getattr(logging, args.console_level) if args.console_level else config_console_level + file_level = getattr(logging, args.file_level) if args.file_level else config_file_level + + # debug mode overrides console level + if args.debug: + console_level = logging.DEBUG + + # set up logging scopes + if args.log_scopes: + set_logging_scopes(args.log_scopes) + log_message(LoggingScope.DEBUG, "DEBUG", "Enabled logging scopes: '%s'", args.log_scopes) + + # create logger + logger = logging.getLogger() + logger.setLevel(logging.DEBUG) # set root logger to lowest level + + # create formatters + console_formatter = logging.Formatter("%(levelname)-8s: %(message)s") + file_formatter = logging.Formatter("%(asctime)s - %(levelname)-8s: %(message)s") + + # console handler (only if not quiet) + if not args.quiet: + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setLevel(console_level) + console_handler.setFormatter(console_formatter) + logger.addHandler(console_handler) + + # file handler (if log file is specified) + if log_file: + # ensure log directory exists + log_path = Path(log_file) + log_path.parent.mkdir(parents=True, exist_ok=True) + + file_handler = logging.FileHandler(log_file) + file_handler.setLevel(file_level) + file_handler.setFormatter(file_formatter) + logger.addHandler(file_handler) + + return logger + + +@pidfile("shared_lock.pid") # noqa: F401 +@log_function_entry_exit() +def main(): + """Main function.""" + args = parse_args() + config = parse_config(args.config) + _ = setup_logging(config, args) # noqa: F841 + + # TODO: 
check configuration: secrets, paths, permissions on dirs, etc + extensions = args.extensions.split(",") + # gh_pat = config["secrets"]["github_pat"] + # gh_staging_repo = github.Github(gh_pat).get_repo(config["github"]["staging_repo"]) + + buckets = json.loads(config["aws"]["staging_buckets"]) + for bucket, cvmfs_repo in buckets.items(): + # create our custom S3 bucket for this bucket + s3_bucket = EESSIS3Bucket(config, bucket) + + tasks = find_deployment_tasks(s3_bucket, extensions) + if args.list_only: + log_message(LoggingScope.GROUP_OPS, "INFO", "#tasks: %d", len(tasks)) + for num, task in enumerate(tasks): + log_message(LoggingScope.GROUP_OPS, "INFO", "[%s] %d: '%s'", bucket, num, task) + else: + # process each task file + for task_path in tasks: + log_message(LoggingScope.GROUP_OPS, "INFO", "Processing task: '%s'", task_path) + +# try: +# # Create EESSITask for the task file +# try: +# task = EESSITask( +# EESSITaskDescription(EESSIDataAndSignatureObject(config, task_path, s3_bucket)), +# config, cvmfs_repo, gh_staging_repo +# ) +# +# except Exception as err: +# log_message(LoggingScope.ERROR, "ERROR", "Failed to create EESSITask for task %s: %s", +# task_path, str(err)) +# continue +# +# log_message(LoggingScope.GROUP_OPS, "INFO", "Task: %s", task) +# +# previous_state = None +# current_state = task.determine_state() +# log_message(LoggingScope.GROUP_OPS, "INFO", "Task '%s' is in state '%s'", +# task_path, current_state.name) +# while (current_state is not None and +# current_state != TaskState.DONE and +# previous_state != current_state): +# previous_state = current_state +# log_message(LoggingScope.GROUP_OPS, "INFO", +# "Task '%s': BEFORE handle(): previous state = '%s', current state = '%s'", +# task_path, previous_state.name, current_state.name) +# current_state = task.handle() +# log_message(LoggingScope.GROUP_OPS, "INFO", +# "Task '%s': AFTER handle(): previous state = '%s', current state = '%s'", +# task_path, previous_state.name, current_state.name) +# +# except Exception as err: +# log_message(LoggingScope.ERROR, "ERROR", "Failed to process task %s: %s", task_path, str(err)) +# continue + + +@log_function_entry_exit() +def find_deployment_tasks(s3_bucket: EESSIS3Bucket, extensions: List[str] = None) -> List[str]: + """ + Return a list of all task files in an S3 bucket with the given extensions, + but only if a corresponding payload file exists (same name without extension). 
+ + Args: + s3_bucket: EESSIS3Bucket instance + extensions: List of file extensions to look for (default: ['.task']) + + Returns: + List of task filenames found in the bucket that have a corresponding payload + """ + if extensions is None: + extensions = [".task"] + + files = [] + continuation_token = None + + while True: + # list objects with pagination + if continuation_token: + response = s3_bucket.list_objects_v2( + ContinuationToken=continuation_token + ) + else: + response = s3_bucket.list_objects_v2() + + # add files from this page + files.extend([obj["Key"] for obj in response.get("Contents", [])]) + + # check if there are more pages + if response.get("IsTruncated"): + continuation_token = response.get("NextContinuationToken") + else: + break + + # create a set of all files for faster lookup + file_set = set(files) + + # return only task files that have a corresponding payload + result = [] + for file in files: + for ext in extensions: + if file.endswith(ext) and file[:-len(ext)] in file_set: + result.append(file) + break # found a matching extension, no need to check other extensions + + return result + + +if __name__ == "__main__": + try: + main() + except PidFileError as err: + error(f"Another instance of this script is already running! Error: '{err}'") From 9847d866c147f68db316fc24271731bdfb5045e0 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 17:24:21 +0200 Subject: [PATCH 14/26] add class to model a (remote) file and its signature --- .../automated_ingestion/eessi_data_object.py | 344 ++++++++++++++++++ scripts/automated_ingestion/ingest_bundles.py | 13 +- 2 files changed, 351 insertions(+), 6 deletions(-) create mode 100644 scripts/automated_ingestion/eessi_data_object.py diff --git a/scripts/automated_ingestion/eessi_data_object.py b/scripts/automated_ingestion/eessi_data_object.py new file mode 100644 index 00000000..4989f24c --- /dev/null +++ b/scripts/automated_ingestion/eessi_data_object.py @@ -0,0 +1,344 @@ +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + +import configparser +import subprocess + +from eessi_logging import log_function_entry_exit, log_message, LoggingScope +from eessi_remote_storage_client import DownloadMode, EESSIRemoteStorageClient + + +@dataclass +class EESSIDataAndSignatureObject: + """Class representing an EESSI data file and its signature in remote storage and locally.""" + + # configuration + config: configparser.ConfigParser + + # remote paths + remote_file_path: str # path to data file in remote storage + remote_sig_path: str # path to signature file in remote storage + + # local paths + local_file_path: Path # path to local data file + local_sig_path: Path # path to local signature file + + # remote storage client + remote_client: EESSIRemoteStorageClient + + @log_function_entry_exit() + def __init__( + self, + config: configparser.ConfigParser, + remote_file_path: str, + remote_client: EESSIRemoteStorageClient, + ): + """ + Initialize an EESSI data and signature object handler. 
+ + Args: + config: configuration object containing remote storage and local directory information + remote_file_path: path to data file in remote storage + remote_client: remote storage client implementing the EESSIRemoteStorageClient protocol + """ + self.config = config + self.remote_file_path = remote_file_path + sig_ext = config["signatures"]["signature_file_extension"] + self.remote_sig_path = remote_file_path + sig_ext + + # set up local paths + local_dir = Path(config["paths"]["download_dir"]) + # use the full remote path structure, removing any leading slashes + remote_path = remote_file_path.lstrip("/") + self.local_file_path = local_dir.joinpath(remote_path) + self.local_sig_path = local_dir.joinpath(remote_path + sig_ext) + self.remote_client = remote_client + + log_message(LoggingScope.DEBUG, "DEBUG", "Initialized EESSIDataAndSignatureObject for '%s'", remote_file_path) + log_message(LoggingScope.DEBUG, "DEBUG", "Local file path: '%s'", self.local_file_path) + log_message(LoggingScope.DEBUG, "DEBUG", "Local signature path: '%s'", self.local_sig_path) + + @log_function_entry_exit() + def _get_etag_file_path(self, local_path: Path) -> Path: + """Get the path to the .etag file for a given local file.""" + return local_path.with_suffix(".etag") + + @log_function_entry_exit() + def _get_local_etag(self, local_path: Path) -> Optional[str]: + """Get the ETag of a local file from its .etag file.""" + etag_path = self._get_etag_file_path(local_path) + if etag_path.exists(): + try: + with open(etag_path, "r") as f: + return f.read().strip() + except Exception as err: + log_message(LoggingScope.DEBUG, "WARNING", "Failed to read ETag file '%s': '%s'", etag_path, str(err)) + return None + return None + + @log_function_entry_exit() + def get_etags(self) -> tuple[Optional[str], Optional[str]]: + """ + Get the ETags of both the data file and its signature. + + Returns: + Tuple containing (data_file_etag, signature_file_etag) + """ + return ( + self._get_local_etag(self.local_file_path), + self._get_local_etag(self.local_sig_path) + ) + + @log_function_entry_exit() + def verify_signature(self) -> bool: + """ + Verify the signature of the data file using the corresponding signature file. 
+ + Returns: + bool: True if the signature is valid or if signatures are not required, False otherwise + """ + # check if signature file exists + if not self.local_sig_path.exists(): + log_message(LoggingScope.VERIFICATION, "WARNING", "Signature file '%s' is missing", + self.local_sig_path) + + # if signatures are required, return failure + if self.config["signatures"].getboolean("signatures_required", True): + log_message(LoggingScope.ERROR, "ERROR", "Signature file '%s' is missing and signatures are required", + self.local_sig_path) + return False + else: + log_message(LoggingScope.VERIFICATION, "INFO", + "Signature file '%s' is missing, but signatures are not required", + self.local_sig_path) + return True + + # if signatures are provided, we should always verify them, regardless of the signatures_required setting + verify_runenv = self.config["signatures"]["signature_verification_runenv"].split() + verify_script = self.config["signatures"]["signature_verification_script"] + allowed_signers_file = self.config["signatures"]["allowed_signers_file"] + + # check if verification tools exist + if not Path(verify_script).exists(): + log_message(LoggingScope.ERROR, "ERROR", + "Unable to verify signature: verification script '%s' does not exist", verify_script) + return False + + if not Path(allowed_signers_file).exists(): + log_message(LoggingScope.ERROR, "ERROR", + "Unable to verify signature: allowed signers file '%s' does not exist", allowed_signers_file) + return False + + # run the verification command with named parameters + cmd = verify_runenv + [ + verify_script, + "--verify", + "--allowed-signers-file", allowed_signers_file, + "--file", str(self.local_file_path), + "--signature-file", str(self.local_sig_path) + ] + log_message(LoggingScope.VERIFICATION, "INFO", "Running command: '%s'", " ".join(cmd)) + + try: + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode == 0: + log_message(LoggingScope.VERIFICATION, "INFO", + "Successfully verified signature for '%s'", self.local_file_path) + log_message(LoggingScope.VERIFICATION, "DEBUG", " stdout: '%s'", result.stdout) + log_message(LoggingScope.VERIFICATION, "DEBUG", " stderr: '%s'", result.stderr) + return True + else: + log_message(LoggingScope.ERROR, "ERROR", + "Signature verification failed for '%s'", self.local_file_path) + log_message(LoggingScope.ERROR, "ERROR", " stdout: '%s'", result.stdout) + log_message(LoggingScope.ERROR, "ERROR", " stderr: '%s'", result.stderr) + return False + except Exception as err: + log_message(LoggingScope.ERROR, "ERROR", + "Error during signature verification for '%s': '%s'", + self.local_file_path, str(err)) + return False + + @log_function_entry_exit() + def download(self, mode: DownloadMode = DownloadMode.CHECK_REMOTE) -> bool: + """ + Download data file and signature based on the specified mode. 
+ + Args: + mode: Download mode to use + + Returns: + True if files were downloaded, False otherwise + """ + # if mode is FORCE, we always download regardless of local or remote state + if mode == DownloadMode.FORCE: + should_download = True + log_message(LoggingScope.DOWNLOAD, "INFO", "Forcing download of '%s'", self.remote_file_path) + # for CHECK_REMOTE mode, check if we can optimize + elif mode == DownloadMode.CHECK_REMOTE: + # optimization: check if local files exist first + local_files_exist = ( + self.local_file_path.exists() and + self.local_sig_path.exists() + ) + + # if files don't exist locally, we can skip ETag checks + if not local_files_exist: + log_message(LoggingScope.DOWNLOAD, "INFO", + "Local files missing, skipping ETag checks and downloading '%s'", + self.remote_file_path) + should_download = True + else: + # first check if we have local ETags + try: + local_file_etag = self._get_local_etag(self.local_file_path) + local_sig_etag = self._get_local_etag(self.local_sig_path) + + if local_file_etag: + log_message(LoggingScope.DOWNLOAD, "DEBUG", "Local file ETag: '%s'", local_file_etag) + else: + log_message(LoggingScope.DOWNLOAD, "DEBUG", "No local file ETag found") + if local_sig_etag: + log_message(LoggingScope.DOWNLOAD, "DEBUG", "Local signature ETag: '%s'", local_sig_etag) + else: + log_message(LoggingScope.DOWNLOAD, "DEBUG", "No local signature ETag found") + + # if we don't have local ETags, we need to download + if not local_file_etag or not local_sig_etag: + should_download = True + log_message(LoggingScope.DOWNLOAD, "INFO", "Missing local ETags, downloading '%s'", + self.remote_file_path) + else: + # get remote ETags and compare + remote_file_etag = self.remote_client.get_metadata(self.remote_file_path)["ETag"] + remote_sig_etag = self.remote_client.get_metadata(self.remote_sig_path)["ETag"] + log_message(LoggingScope.DOWNLOAD, "DEBUG", "Remote file ETag: '%s'", remote_file_etag) + log_message(LoggingScope.DOWNLOAD, "DEBUG", "Remote signature ETag: '%s'", remote_sig_etag) + + should_download = ( + remote_file_etag != local_file_etag or + remote_sig_etag != local_sig_etag + ) + if should_download: + if remote_file_etag != local_file_etag: + log_message(LoggingScope.DOWNLOAD, "INFO", "File ETag changed from '%s' to '%s'", + local_file_etag, remote_file_etag) + if remote_sig_etag != local_sig_etag: + log_message(LoggingScope.DOWNLOAD, "INFO", "Signature ETag changed from '%s' to '%s'", + local_sig_etag, remote_sig_etag) + log_message(LoggingScope.DOWNLOAD, "INFO", "Remote files have changed, downloading '%s'", + self.remote_file_path) + else: + log_message(LoggingScope.DOWNLOAD, "INFO", + "Remote files unchanged, skipping download of '%s'", + self.remote_file_path) + except Exception as etag_err: + # if we get any error with ETags, we'll just download the files + log_message(LoggingScope.DOWNLOAD, "DEBUG", "Error handling ETags, will download files: '%s'", + str(etag_err)) + should_download = True + else: # check_local + should_download = ( + not self.local_file_path.exists() or + not self.local_sig_path.exists() + ) + if should_download: + if not self.local_file_path.exists(): + log_message(LoggingScope.DOWNLOAD, "INFO", "Local file missing: '%s'", self.local_file_path) + if not self.local_sig_path.exists(): + log_message(LoggingScope.DOWNLOAD, "INFO", "Local signature missing: '%s'", self.local_sig_path) + log_message(LoggingScope.DOWNLOAD, "INFO", "Local files missing, downloading '%s'", + self.remote_file_path) + else: + log_message(LoggingScope.DOWNLOAD, "INFO", 
"Local files exist, skipping download of '%s'", + self.remote_file_path) + + if not should_download: + return False + + # ensure local directory exists + self.local_file_path.parent.mkdir(parents=True, exist_ok=True) + + # download files + try: + # download the main file first + self.remote_client.download(self.remote_file_path, str(self.local_file_path)) + + # get and log the ETag of the downloaded file + try: + file_etag = self._get_local_etag(self.local_file_path) + log_message(LoggingScope.DOWNLOAD, "DEBUG", "Downloaded '%s' with ETag: '%s'", + self.remote_file_path, file_etag) + except Exception as etag_err: + log_message(LoggingScope.DOWNLOAD, "DEBUG", "Error getting ETag for '%s': '%s'", + self.remote_file_path, str(etag_err)) + + # try to download the signature file + try: + self.remote_client.download(self.remote_sig_path, str(self.local_sig_path)) + try: + sig_etag = self._get_local_etag(self.local_sig_path) + log_message(LoggingScope.DOWNLOAD, "DEBUG", "Downloaded '%s' with ETag: '%s'", + self.remote_sig_path, sig_etag) + except Exception as etag_err: + log_message(LoggingScope.DOWNLOAD, "DEBUG", "Error getting ETag for '%s': '%s'", + self.remote_sig_path, str(etag_err)) + log_message(LoggingScope.DOWNLOAD, "INFO", "Successfully downloaded '%s' and its signature", + self.remote_file_path) + except Exception as sig_err: + # check if signatures are required + if self.config["signatures"].getboolean("signatures_required", True): + # if signatures are required, clean up everything since we can't proceed + if self.local_file_path.exists(): + self.local_file_path.unlink() + # clean up etag files regardless of whether their data files exist + file_etag_path = self._get_etag_file_path(self.local_file_path) + if file_etag_path.exists(): + file_etag_path.unlink() + sig_etag_path = self._get_etag_file_path(self.local_sig_path) + if sig_etag_path.exists(): + sig_etag_path.unlink() + log_message(LoggingScope.ERROR, "ERROR", "Failed to download required signature for '%s': '%s'", + self.remote_file_path, str(sig_err)) + raise + else: + # if signatures are optional, just clean up any partial signature files + if self.local_sig_path.exists(): + self.local_sig_path.unlink() + sig_etag_path = self._get_etag_file_path(self.local_sig_path) + if sig_etag_path.exists(): + sig_etag_path.unlink() + log_message(LoggingScope.DOWNLOAD, "WARNING", + "Failed to download optional signature for '%s': '%s'", + self.remote_file_path, str(sig_err)) + log_message(LoggingScope.DOWNLOAD, "INFO", + "Successfully downloaded '%s' (signature optional)", + self.remote_file_path) + + return True + except Exception as err: + # this catch block is only for errors in the main file download + # clean up partially downloaded files and their etags + if self.local_file_path.exists(): + self.local_file_path.unlink() + if self.local_sig_path.exists(): + self.local_sig_path.unlink() + # clean up etag files regardless of whether their data files exist + file_etag_path = self._get_etag_file_path(self.local_file_path) + if file_etag_path.exists(): + file_etag_path.unlink() + sig_etag_path = self._get_etag_file_path(self.local_sig_path) + if sig_etag_path.exists(): + sig_etag_path.unlink() + log_message(LoggingScope.ERROR, "ERROR", "Failed to download '%s': '%s'", self.remote_file_path, str(err)) + raise + + @log_function_entry_exit() + def get_url(self) -> str: + """Get the URL of the data file.""" + return f"https://{self.remote_client.bucket}.s3.amazonaws.com/{self.remote_file_path}" + + def __str__(self) -> str: + """Return a 
string representation of the EESSI data and signature object.""" + return f"EESSIDataAndSignatureObject({self.remote_file_path})" diff --git a/scripts/automated_ingestion/ingest_bundles.py b/scripts/automated_ingestion/ingest_bundles.py index f8131213..fa40fb33 100644 --- a/scripts/automated_ingestion/ingest_bundles.py +++ b/scripts/automated_ingestion/ingest_bundles.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# from eessi_data_object import EESSIDataAndSignatureObject +from eessi_data_object import EESSIDataAndSignatureObject # from eessi_task import EESSITask, TaskState # from eessi_task_description import EESSITaskDescription from eessi_s3_bucket import EESSIS3Bucket @@ -170,8 +170,9 @@ def main(): for task_path in tasks: log_message(LoggingScope.GROUP_OPS, "INFO", "Processing task: '%s'", task_path) -# try: -# # Create EESSITask for the task file + try: + _ = EESSIDataAndSignatureObject(config, task_path, s3_bucket) +# # create EESSITask for the task file # try: # task = EESSITask( # EESSITaskDescription(EESSIDataAndSignatureObject(config, task_path, s3_bucket)), @@ -201,9 +202,9 @@ def main(): # "Task '%s': AFTER handle(): previous state = '%s', current state = '%s'", # task_path, previous_state.name, current_state.name) # -# except Exception as err: -# log_message(LoggingScope.ERROR, "ERROR", "Failed to process task %s: %s", task_path, str(err)) -# continue + except Exception as err: + log_message(LoggingScope.ERROR, "ERROR", "Failed to process task '%s': '%s'", task_path, str(err)) + continue @log_function_entry_exit() From 79ef2671616bef2050e52a5ac24a97941806b0b1 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 17:49:32 +0200 Subject: [PATCH 15/26] add class to model the description of a task --- .../eessi_task_description.py | 188 ++++++++++++++++++ scripts/automated_ingestion/ingest_bundles.py | 4 +- 2 files changed, 190 insertions(+), 2 deletions(-) create mode 100644 scripts/automated_ingestion/eessi_task_description.py diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py new file mode 100644 index 00000000..24cc6df7 --- /dev/null +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -0,0 +1,188 @@ +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Tuple + +import json + +from eessi_data_object import EESSIDataAndSignatureObject +from eessi_logging import log_function_entry_exit, log_message, LoggingScope +from eessi_remote_storage_client import DownloadMode + + +@dataclass +class EESSITaskDescription: + """Class representing an EESSI task to be performed, including its metadata and associated data files.""" + + # The EESSI data and signature object associated with this task + task_object: EESSIDataAndSignatureObject + + # Whether the signature was successfully verified + signature_verified: bool = False + + # Metadata from the task description file + metadata: Dict[str, Any] = None + + # task element + task: Dict[str, Any] = None + + # source element + source: Dict[str, Any] = None + + @log_function_entry_exit() + def __init__(self, task_object: EESSIDataAndSignatureObject): + """ + Initialize an EESSITaskDescription object. 
+ + Args: + task_object: The EESSI data and signature object associated with this task + """ + self.task_object = task_object + self.metadata = {} + + self.task_object.download(mode=DownloadMode.CHECK_REMOTE) + + # verify signature and set initial state + self.signature_verified = self.task_object.verify_signature() + + # try to read metadata (will only succeed if signature is verified) + try: + self._read_metadata() + except RuntimeError: + # expected if signature is not verified yet + pass + + # check if the task file contains a task field and add that to self + if "task" in self.metadata: + self.task = self.metadata["task"] + else: + self.task = None + + # check if the task file contains a link2pr field and add that to source element + if "link2pr" in self.metadata: + self.source = self.metadata["link2pr"] + else: + self.source = None + + @log_function_entry_exit() + def get_contents(self) -> str: + """ + Get the contents of the task description / metadata file. + """ + return self.raw_contents + + @log_function_entry_exit() + def get_metadata_filename_components(self) -> Tuple[str, str, str, str, str, str]: + """ + Get the components of the metadata file name. + + An example of the metadata file name is: + eessi-2023.06-software-linux-x86_64-amd-zen2-1745557626.tar.gz.meta.txt + + The components are: + eessi: some prefix + VERSION: 2023.06 + COMPONENT: software + OS: linux + ARCHITECTURE: x86_64-amd-zen2 + TIMESTAMP: 1745557626 + SUFFIX: tar.gz.meta.txt + + The ARCHITECTURE component can include one to two hyphens. + The SUFFIX is the part after the first dot (no other components should include dots). + """ + # obtain file name from local file path using basename + file_name = Path(self.task_object.local_file_path).name + # split file_name into part before suffix and the suffix + # idea: split on last hyphen, then split on first dot + suffix = file_name.split("-")[-1].split(".", 1)[1] + file_name_without_suffix = file_name.strip(f".{suffix}") + # from file_name_without_suffix determine VERSION (2nd element), COMPONENT (3rd element), OS (4th element), + # ARCHITECTURE (5th to second last elements) and TIMESTAMP (last element) + components = file_name_without_suffix.split("-") + version = components[1] + component = components[2] + os = components[3] + architecture = "-".join(components[4:-1]) + timestamp = components[-1] + return version, component, os, architecture, timestamp, suffix + + @log_function_entry_exit() + def get_metadata_value(self, key: str) -> str: + """ + Get the value of a key from the task description / metadata file. 
+ """ + # check that key is defined and has a length > 0 + if not key or len(key) == 0: + raise ValueError("get_metadata_value: key is not defined or has a length of 0") + + value = None + task = self.task + source = self.source + # check if key is in task or source + if task and key in task: + value = task[key] + log_message(LoggingScope.TASK_OPS, "INFO", + f"Value '{value}' for key '{key}' found in information from task metadata: {task}") + elif source and key in source: + value = source[key] + log_message(LoggingScope.TASK_OPS, "INFO", + f"Value '{value}' for key '{key}' found in information from source metadata: {source}") + else: + log_message(LoggingScope.TASK_OPS, "INFO", + f"Value for key '{key}' neither found in task metadata nor source metadata") + raise ValueError(f"Value for key '{key}' neither found in task metadata nor source metadata") + return value + + @log_function_entry_exit() + def get_pr_number(self) -> str: + """ + Get the PR number from the task description / metadata file. + """ + return self.get_metadata_value("pr") + + @log_function_entry_exit() + def get_repo_name(self) -> str: + """ + Get the repository name from the task description / metadata file. + """ + return self.get_metadata_value("repo") + + @log_function_entry_exit() + def get_task_file_name(self) -> str: + """ + Get the file name from the task description / metadata file. + """ + # get file name from remote file path using basename + file_name = Path(self.task_object.remote_file_path).name + return file_name + + @log_function_entry_exit() + def _read_metadata(self) -> None: + """ + Internal method to read and parse the metadata from the task description file. + Only reads metadata if the signature has been verified. + """ + if not self.signature_verified: + log_message(LoggingScope.ERROR, "ERROR", "Cannot read metadata: signature not verified for '%s'", + self.task_object.local_file_path) + raise RuntimeError("Cannot read metadata: signature not verified") + + try: + with open(self.task_object.local_file_path, "r") as file: + self.raw_contents = file.read() + self.metadata = json.loads(self.raw_contents) + log_message(LoggingScope.DEBUG, "DEBUG", "Successfully read metadata from '%s'", + self.task_object.local_file_path) + except json.JSONDecodeError as err: + log_message(LoggingScope.ERROR, "ERROR", "Failed to parse JSON in task description file '%s': '%s'", + self.task_object.local_file_path, str(err)) + raise + except Exception as err: + log_message(LoggingScope.ERROR, "ERROR", "Failed to read task description file '%s': '%s'", + self.task_object.local_file_path, str(err)) + raise + + @log_function_entry_exit() + def __str__(self) -> str: + """Return a string representation of the EESSITaskDescription object.""" + return f"EESSITaskDescription({self.task_object.local_file_path}, verified={self.signature_verified})" diff --git a/scripts/automated_ingestion/ingest_bundles.py b/scripts/automated_ingestion/ingest_bundles.py index fa40fb33..0662fc83 100644 --- a/scripts/automated_ingestion/ingest_bundles.py +++ b/scripts/automated_ingestion/ingest_bundles.py @@ -2,7 +2,7 @@ from eessi_data_object import EESSIDataAndSignatureObject # from eessi_task import EESSITask, TaskState -# from eessi_task_description import EESSITaskDescription +from eessi_task_description import EESSITaskDescription from eessi_s3_bucket import EESSIS3Bucket from eessi_logging import error, log_function_entry_exit, log_message, LoggingScope, LOG_LEVELS, set_logging_scopes from pid.decorator import pidfile # noqa: F401 @@ -171,7 
+171,7 @@ def main(): log_message(LoggingScope.GROUP_OPS, "INFO", "Processing task: '%s'", task_path) try: - _ = EESSIDataAndSignatureObject(config, task_path, s3_bucket) + _ = EESSITaskDescription(EESSIDataAndSignatureObject(config, task_path, s3_bucket)) # # create EESSITask for the task file # try: # task = EESSITask( From 519b94f56e77dd4d7d46e6e071acd65adf7d13e9 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 17:56:29 +0200 Subject: [PATCH 16/26] add class to model different types of actions on CVMFS repo --- scripts/automated_ingestion/eessi_task_action.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 scripts/automated_ingestion/eessi_task_action.py diff --git a/scripts/automated_ingestion/eessi_task_action.py b/scripts/automated_ingestion/eessi_task_action.py new file mode 100644 index 00000000..6f141435 --- /dev/null +++ b/scripts/automated_ingestion/eessi_task_action.py @@ -0,0 +1,12 @@ +from enum import Enum, auto + + +class EESSITaskAction(Enum): + NOP = auto() # perform no action + DELETE = auto() # perform a delete operation + ADD = auto() # perform an add operation + UPDATE = auto() # perform an update operation + UNKNOWN = auto() # unknown action + + def __str__(self): + return self.name.lower() From f3fce42cbcafbf8df234a089949ad6774c5fe03c Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 18:02:18 +0200 Subject: [PATCH 17/26] add class to model payload of a deployment task --- .../automated_ingestion/eessi_task_payload.py | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 scripts/automated_ingestion/eessi_task_payload.py diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py new file mode 100644 index 00000000..fe0db162 --- /dev/null +++ b/scripts/automated_ingestion/eessi_task_payload.py @@ -0,0 +1,109 @@ +from dataclasses import dataclass +from pathlib import PurePosixPath +from typing import Dict + +import os +import tarfile + +from eessi_data_object import EESSIDataAndSignatureObject +from eessi_logging import log_function_entry_exit +from eessi_remote_storage_client import DownloadMode + + +@dataclass +class EESSITaskPayload: + """Class representing an EESSI task payload (tarball/artifact) and its signature.""" + + # The EESSI data and signature object associated with this payload + payload_object: EESSIDataAndSignatureObject + + # Whether the signature was successfully verified + signature_verified: bool = False + + # possibly at a later point in time, we will add inferred metadata here + # such as the prefix in a tarball, the main elements, or which software + # package it includes + + @log_function_entry_exit() + def __init__(self, payload_object: EESSIDataAndSignatureObject): + """ + Initialize an EESSITaskPayload object. 
+ + Args: + payload_object: The EESSI data and signature object associated with this payload + """ + self.payload_object = payload_object + + # download the payload and its signature + self.payload_object.download(mode=DownloadMode.CHECK_REMOTE) + + # verify signature + self.signature_verified = self.payload_object.verify_signature() + + @log_function_entry_exit() + def analyse_contents(self, config: Dict) -> str: + """Analyse the contents of the payload and return a summary in a ready-to-use HTML format.""" + tar = tarfile.open(self.payload_object.local_file_path, "r") + members = tar.getmembers() + tar_num_members = len(members) + paths = sorted([m.path for m in members]) + + if tar_num_members < 100: + tar_members_desc = "Full listing of the contents of the tarball:" + members_list = paths + + else: + tar_members_desc = "Summarized overview of the contents of the tarball:" + # determine prefix after filtering out '/init' subdirectory, + # to get actual prefix for specific CPU target (like '2023.06/software/linux/aarch64/neoverse_v1') + init_subdir = os.path.join("*", "init") + non_init_paths = sorted( + [path for path in paths if not any(parent.match(init_subdir) for parent in PurePosixPath(path).parents)] + ) + if non_init_paths: + prefix = os.path.commonprefix(non_init_paths) + else: + prefix = os.path.commonprefix(paths) + + # TODO: this only works for software tarballs, how to handle compat layer tarballs? + swdirs = [ # all directory names with the pattern: /software// + member.path + for member in members + if member.isdir() and PurePosixPath(member.path).match(os.path.join(prefix, "software", "*", "*")) + ] + modfiles = [ # all filenames with the pattern: /modules///*.lua + member.path + for member in members + if member.isfile() + and PurePosixPath(member.path).match(os.path.join(prefix, "modules", "*", "*", "*.lua")) + ] + other = [ # anything that is not in /software nor /modules + member.path + for member in members + if ( + not PurePosixPath(prefix).joinpath("software") in PurePosixPath(member.path).parents + and not PurePosixPath(prefix).joinpath("modules") in PurePosixPath(member.path).parents + ) + # if not fnmatch.fnmatch(m.path, os.path.join(prefix, 'software', '*')) + # and not fnmatch.fnmatch(m.path, os.path.join(prefix, 'modules', '*')) + ] + members_list = sorted(swdirs + modfiles + other) + + # construct the overview + overview = config["github"]["task_summary_payload_overview_template"].format( + tar_num_members=tar_num_members, + bucket_url=self.payload_object.remote_client.get_bucket_url(), + remote_file_path=self.payload_object.remote_file_path, + tar_members_desc=tar_members_desc, + tar_members="\n".join(members_list) + ) + + # make sure that the overview does not exceed Github's maximum length (65536 characters) + if len(overview) > 60000: + overview = overview[:60000] + "\n\nWARNING: output exceeded the maximum length and was truncated!\n```" + return overview + + @log_function_entry_exit() + def __str__(self) -> str: + """Return a string representation of the EESSITaskPayload object.""" + return f"EESSITaskPayload({self.payload_object.local_file_path}, verified={self.signature_verified})" From b37901229268d7f380d64d89600cab500e3ae853 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 19:04:28 +0200 Subject: [PATCH 18/26] add class modelling a deployment task --- scripts/automated_ingestion/eessi_task.py | 1393 +++++++++++++++++ scripts/automated_ingestion/ingest_bundles.py | 38 +- 2 files changed, 1411 insertions(+), 20 deletions(-) 
create mode 100644 scripts/automated_ingestion/eessi_task.py diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py new file mode 100644 index 00000000..f2369db6 --- /dev/null +++ b/scripts/automated_ingestion/eessi_task.py @@ -0,0 +1,1393 @@ +from enum import Enum, auto +from functools import total_ordering +from typing import Dict, List, Tuple, Optional, Any + +import base64 +import os +import subprocess +import traceback + +from eessi_data_object import EESSIDataAndSignatureObject +from eessi_logging import log_function_entry_exit, log_message, LoggingScope +from eessi_task_action import EESSITaskAction +from eessi_task_description import EESSITaskDescription +from eessi_task_payload import EESSITaskPayload +from utils import send_slack_message + +from github import Github, GithubException, InputGitTreeElement, UnknownObjectException +from github.Branch import Branch +from github.PullRequest import PullRequest + + +@total_ordering +class EESSITaskState(Enum): + UNDETERMINED = auto() # The task state was not determined yet + NEW_TASK = auto() # The task has been created but not yet processed + PAYLOAD_STAGED = auto() # The task's payload has been staged to the Stratum-0 + PULL_REQUEST = auto() # A PR for the task has been created or updated in some staging repository + APPROVED = auto() # The PR for the task has been approved + REJECTED = auto() # The PR for the task has been rejected + INGESTED = auto() # The task's payload has been applied to the target CernVM-FS repository + DONE = auto() # The task has been completed + + @classmethod + def from_string( + cls, name: str, default: Optional["EESSITaskState"] = None, case_sensitive: bool = False + ) -> "EESSITaskState": + log_message(LoggingScope.TASK_OPS, "INFO", "from_string: '%s'", name) + if case_sensitive: + to_return = cls.__members__.get(name, default) + log_message(LoggingScope.TASK_OPS, "INFO", "from_string will return: '%s'", to_return) + return to_return + + try: + to_return = cls[name.upper()] + log_message(LoggingScope.TASK_OPS, "INFO", "from_string will return: '%s'", to_return) + return to_return + except KeyError: + return default + + def __lt__(self, other): + if self.__class__ is other.__class__: + return self.value < other.value + return NotImplemented + + def __str__(self): + return self.name.upper() + + +class EESSITask: + description: EESSITaskDescription + payload: EESSITaskPayload + action: EESSITaskAction + git_repo: Github + config: Dict + + @log_function_entry_exit() + def __init__(self, description: EESSITaskDescription, config: Dict, cvmfs_repo: str, git_repo: Github): + self.description = description + self.config = config + self.cvmfs_repo = cvmfs_repo + self.git_repo = git_repo + self.action = self._determine_task_action() + + # define valid state transitions for all actions + # NOTE, for EESSITaskState.PULL_REQUEST, EESSITaskState.APPROVED must be the first element or + # _next_state() will not work correctly + self.valid_transitions = { + EESSITaskState.UNDETERMINED: [ + EESSITaskState.NEW_TASK, + EESSITaskState.PAYLOAD_STAGED, + EESSITaskState.PULL_REQUEST, + EESSITaskState.APPROVED, + EESSITaskState.REJECTED, + EESSITaskState.INGESTED, + EESSITaskState.DONE, + ], + EESSITaskState.NEW_TASK: [EESSITaskState.PAYLOAD_STAGED], + EESSITaskState.PAYLOAD_STAGED: [EESSITaskState.PULL_REQUEST], + EESSITaskState.PULL_REQUEST: [EESSITaskState.APPROVED, EESSITaskState.REJECTED], + EESSITaskState.APPROVED: [EESSITaskState.INGESTED], + EESSITaskState.REJECTED: [], # 
terminal state + EESSITaskState.INGESTED: [], # terminal state + EESSITaskState.DONE: [] # virtual terminal state, not used to write on GitHub + } + + self.payload = None + state = self.determine_state() + if state >= EESSITaskState.PAYLOAD_STAGED: + log_message(LoggingScope.TASK_OPS, "INFO", "initializing payload object in constructor for EESSITask") + self._init_payload_object() + + @log_function_entry_exit() + def _determine_task_action(self) -> EESSITaskAction: + """ + Determine the action type based on task description metadata. + """ + if "task" in self.description.metadata and "action" in self.description.metadata["task"]: + action_str = self.description.metadata["task"]["action"].lower() + if action_str == "nop": + return EESSITaskAction.NOP + elif action_str == "delete": + return EESSITaskAction.DELETE + elif action_str == "add": + return EESSITaskAction.ADD + elif action_str == "update": + return EESSITaskAction.UPDATE + return EESSITaskAction.UNKNOWN + + @log_function_entry_exit() + def _state_file_with_prefix_exists_in_repo_branch(self, file_path_prefix: str, branch_name: str = None) -> bool: + """ + Check if a file exists in a repository branch. + + Args: + file_path_prefix: the prefix of the file path + branch_name: the branch to check + + Returns: + True if a file with the prefix exists in the branch, False otherwise + """ + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + # branch = self._get_branch_from_name(branch_name) + try: + # get all files in directory part of file_path_prefix + directory_part = os.path.dirname(file_path_prefix) + files = self.git_repo.get_contents(directory_part, ref=branch_name) + log_msg = "Found files %s in directory %s in branch %s" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, files, directory_part, branch_name) + # check if any of the files has file_path_prefix as prefix + for file in files: + if file.path.startswith(file_path_prefix): + log_msg = "Found file %s in directory %s in branch %s" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file.path, directory_part, branch_name) + return True + log_msg = "No file with prefix %s found in directory %s in branch %s" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file_path_prefix, directory_part, branch_name) + return False + except UnknownObjectException: + # file_path does not exist in branch + log_msg = "Directory %s or file with prefix %s does not exist in branch %s" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, directory_part, file_path_prefix, branch_name) + return False + except GithubException as err: + if err.status == 404: + # file_path does not exist in branch + log_msg = "Directory %s or file with prefix %s does not exist in branch %s" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, directory_part, file_path_prefix, branch_name) + return False + else: + # if there was some other (e.g. connection) issue, log message and return False + log_msg = 'Unable to determine the state of %s, the GitHub API returned status %s!' + log_message(LoggingScope.ERROR, 'WARNING', log_msg, self.object, err.status) + return False + return False + + @log_function_entry_exit() + def _determine_sequence_numbers_including_task_file(self, repo: str, pr: str) -> Dict[int, bool]: + """ + Determines in which sequence numbers the metadata/task file is included and in which it is not. 
+ NOTE, we only need to check the default branch of the repository, because a for a new task a file + is added to the default branch and for the subsequent processing of the task we use a different branch. + Thus, until the PR is closed, the task file stays in the default branch. + + Args: + repo: the repository name + pr: the pull request number + + Returns: + A dictionary with the sequence numbers as keys and a boolean value indicating if the metadata/task file is + included in that sequence number. + + Idea: + - The deployment for a single source PR could be split into multiple staging PRs each is assigned a unique + sequence number. + - For a given source PR (identified by the repo name and the PR number), a staging PR using a branch named + `REPO/PR_NUM/SEQ_NUM` is created. + - In the staging repo we create a corresponding directory `REPO/PR_NUM/SEQ_NUM`. + - If a metadata/task file is handled by the staging PR with sequence number, it is included in that directory. + - We iterate over all directories under `REPO/PR_NUM`: + - If the metadata/task file is available in the directory, we add the sequence number to the list. + + Note: this is a placeholder for now, as we do not know yet if we need to use a sequence number. + """ + sequence_numbers = {} + repo_pr_dir = f"{repo}/{pr}" + # iterate over all directories under repo_pr_dir + try: + directories = self._list_directory_contents(repo_pr_dir) + for dir in directories: + # check if the directory is a number + if dir.name.isdigit(): + # determine if a state file with prefix exists in the sequence number directory + # we need to use the basename of the remote file path + remote_file_path_basename = os.path.basename(self.description.task_object.remote_file_path) + state_file_name_prefix = f"{repo_pr_dir}/{dir.name}/{remote_file_path_basename}" + if self._state_file_with_prefix_exists_in_repo_branch(state_file_name_prefix): + sequence_numbers[int(dir.name)] = True + else: + sequence_numbers[int(dir.name)] = False + else: + # directory is not a number, so we skip it + continue + except FileNotFoundError: + # repo_pr_dir does not exist, so we return an empty dictionary + return {} + except GithubException as err: + if err.status != 404: # 404 is catched by FileNotFoundError + # some other error than the directory not existing + return {} + return sequence_numbers + + @log_function_entry_exit() + def _find_highest_number(self, str_list: List[str]) -> int: + """ + Find the highest number in a list of strings. + """ + # Convert all strings to integers + int_list = [int(num) for num in str_list] + return max(int_list) + + @log_function_entry_exit() + def _get_sequence_number_for_task_file(self) -> int: + """ + Get the sequence number this task is assigned to at the moment. + NOTE, should only be called if the task is actually assigned to a sequence number. 
+ """ + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + sequence_numbers = self._determine_sequence_numbers_including_task_file(repo_name, pr_number) + if len(sequence_numbers) == 0: + raise ValueError("Found no sequence numbers at all") + else: + # get all entries with value True, there should be only one, so we return the first one + sequence_numbers_true = [key for key, value in sequence_numbers.items() if value is True] + if len(sequence_numbers_true) == 0: + raise ValueError("Found no sequence numbers that include the task file for task %s", + self.description) + else: + return sequence_numbers_true[0] + + @log_function_entry_exit() + def _get_current_sequence_number(self, sequence_numbers: Dict[int, bool] = None) -> int: + """ + Get the current sequence number based on the sequence numbers. + If sequence_numbers is not provided, we determine the sequence numbers from the task description. + """ + if sequence_numbers is None: + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + sequence_numbers = self._determine_sequence_numbers_including_task_file(repo_name, pr_number) + if len(sequence_numbers) == 0: + return 0 + return self._find_highest_number(sequence_numbers.keys()) + + @log_function_entry_exit() + def _get_fixed_sequence_number(self) -> int: + """ + Get a fixed sequence number. + """ + return 11 + + @log_function_entry_exit() + def _find_staging_pr(self) -> Tuple[Optional[PullRequest], Optional[str], Optional[int]]: + """ + Find the staging PR for the task. + TODO: arg sequence number --> make function simpler + """ + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + try: + sequence_number = self._get_sequence_number_for_task_file() + except ValueError: + # no sequence number found, so we return None + log_message(LoggingScope.ERROR, "ERROR", "no sequence number found for task '%s'", self.description) + return None, None, None + except Exception as err: + # some other error + log_message(LoggingScope.ERROR, "ERROR", "error finding staging PR for task '%s': '%s'", + self.description, err) + return None, None, None + branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" + if branch_name in [branch.name for branch in self.git_repo.get_branches()]: + find_pr = [pr for pr in self.git_repo.get_pulls(head=branch_name, state="all")] + if find_pr: + pr = find_pr.pop(0) + return pr, branch_name, sequence_number + else: + return None, branch_name, sequence_number + else: + return None, None, None + + @log_function_entry_exit() + def _create_staging_pr(self, sequence_number: int) -> Tuple[PullRequest, str]: + """ + Create a staging PR for the task. + NOTE, SHALL only be called if no staging PR for the task exists yet. + """ + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" + default_branch_name = self.git_repo.default_branch + pr = self.git_repo.create_pull(title=f"Add task for {repo_name} PR {pr_number} seq {sequence_number}", + body=f"Add task for {repo_name} PR {pr_number} seq {sequence_number}", + head=branch_name, base=default_branch_name) + return pr, branch_name + + @log_function_entry_exit() + def _find_state(self) -> EESSITaskState: + """ + Determine the state of the task based on the task description metadata. + + Returns: + The state of the task. 
+ """ + # obtain repo and pr from metadata + log_message(LoggingScope.TASK_OPS, "INFO", "finding state of task '%s'", self.description.task_object) + repo = self.description.get_repo_name() + pr = self.description.get_pr_number() + log_message(LoggingScope.TASK_OPS, "INFO", "repo: '%s', pr: '%s'", repo, pr) + + # obtain all sequence numbers in repo/pr dir which include a state file for this task + sequence_numbers = self._determine_sequence_numbers_including_task_file(repo, pr) + if len(sequence_numbers) == 0: + # no sequence numbers found, so we return NEW_TASK + log_message(LoggingScope.TASK_OPS, "INFO", "no sequence numbers found, state: NEW_TASK") + return EESSITaskState.NEW_TASK + # we got at least one sequence number + # if one value for a sequence number is True, we can determine the state from the file in the directory + sequence_including_task = [key for key, value in sequence_numbers.items() if value is True] + if len(sequence_including_task) == 0: + # no sequence number includes the task file, so we return NEW_TASK + log_message(LoggingScope.TASK_OPS, "INFO", "no sequence number includes the task file, state: NEW_TASK") + return EESSITaskState.NEW_TASK + # we got at least one sequence number which includes the task file + # we can determine the state from the filename in the directory + # NOTE, we use the first element in sequence_including_task (there should be only one) + # we ignore other elements in sequence_including_task + sequence_number = sequence_including_task[0] + task_file_name = self.description.get_task_file_name() + metadata_file_state_path_prefix = f"{repo}/{pr}/{sequence_number}/{task_file_name}." + state = self._get_state_for_metadata_file_prefix(metadata_file_state_path_prefix, sequence_number) + log_message(LoggingScope.TASK_OPS, "INFO", "state: '%s'", state) + return state + + @log_function_entry_exit() + def _get_state_for_metadata_file_prefix(self, metadata_file_state_path_prefix: str, + sequence_number: int) -> EESSITaskState: + """ + Get the state from the file in the metadata_file_state_path_prefix. 
+ """ + # depending on the state of the deployment (NEW_TASK, PAYLOAD_STAGED, PULL_REQUEST, APPROVED, REJECTED, + # INGESTED, DONE) + # we need to check the task file in the default branch or in the branch corresponding to the sequence number + directory_part = os.path.dirname(metadata_file_state_path_prefix) + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + default_branch_name = self.git_repo.default_branch + branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" + all_branch_names = [branch.name for branch in self.git_repo.get_branches()] + states = [] + for branch in [default_branch_name, branch_name]: + if branch in all_branch_names: + # first get all files in directory part of metadata_file_state_path_prefix + files = self._list_directory_contents(directory_part, branch) + # check if any of the files has metadata_file_state_path_prefix as prefix + for file in files: + if file.path.startswith(metadata_file_state_path_prefix): + # get state from file name taking only the suffix + state = EESSITaskState.from_string(file.name.split(".")[-1]) + log_message(LoggingScope.TASK_OPS, "INFO", "state: '%s'", state) + states.append(state) + if len(states) == 0: + # did not find any file with metadata_file_state_path_prefix as prefix + log_message(LoggingScope.TASK_OPS, "INFO", "did not find any file with prefix '%s'", + metadata_file_state_path_prefix) + return EESSITaskState.NEW_TASK + # sort the states and return the last one + states.sort() + state = states[-1] + log_message(LoggingScope.TASK_OPS, "INFO", "state: '%s'", state) + return state + + @log_function_entry_exit() + def _list_directory_contents(self, directory_path: str, branch_name: str = None) -> List[Any]: + """ + List the contents of a directory in a branch. + """ + try: + # Get contents of the directory + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + log_message(LoggingScope.TASK_OPS, "INFO", "listing contents of '%s' in branch '%s'", + directory_path, branch_name) + contents = self.git_repo.get_contents(directory_path, ref=branch_name) + + # If contents is a list, it means we successfully got directory contents + if isinstance(contents, list): + return contents + else: + # If it's not a list, it means the path is not a directory + raise ValueError(f"'{directory_path}' is not a directory") + except GithubException as err: + if err.status == 404: + raise FileNotFoundError(f"Directory not found: '{directory_path}'") + raise err + + @log_function_entry_exit() + def _next_state(self, state: EESSITaskState = None) -> EESSITaskState: + """ + Determine the next state based on the current state using the valid_transitions dictionary. + + NOTE, it assumes that function is only called for non-terminal states and that the next state is the first + element of the list returned by the valid_transitions dictionary. + """ + the_state = state if state is not None else self.determine_state() + return self.valid_transitions[the_state][0] + + @log_function_entry_exit() + def _path_exists_in_branch(self, path: str, branch_name: str = None) -> bool: + """ + Check if a path exists in a branch. 
+ """ + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + try: + self.git_repo.get_contents(path, ref=branch_name) + return True + except GithubException as err: + if err.status == 404: + return False + else: + raise err + + @log_function_entry_exit() + def _read_dict_from_string(self, content: str) -> dict: + """ + Read the dictionary from the string. + """ + config_dict = {} + for line in content.strip().split("\n"): + if "=" in line and not line.strip().startswith("#"): # Skip comments + key, value = line.split("=", 1) # Split only on first '=' + config_dict[key.strip()] = value.strip() + return config_dict + + @log_function_entry_exit() + def _read_pull_request_dir_from_file(self, task_pointer_file: str = None, branch_name: str = None) -> str: + """ + Read the pull request directory from the file in the given branch. + """ + # set default values for task pointer file and branch name + if task_pointer_file is None: + task_pointer_file = self.description.task_object.remote_file_path + if branch_name is None: + branch_name = self.git_repo.default_branch + log_message(LoggingScope.TASK_OPS, "INFO", "reading pull request directory from file '%s' in branch '%s'", + task_pointer_file, branch_name) + + # read the pull request directory from the file in the given branch + content = self.git_repo.get_contents(task_pointer_file, ref=branch_name) + + # Decode the content from base64 + content_str = content.decoded_content.decode("utf-8") + + # Parse into dictionary + config_dict = self._read_dict_from_string(content_str) + + target_dir = config_dict.get("target_dir", None) + return config_dict.get("pull_request_dir", target_dir) + + @log_function_entry_exit() + def _determine_pull_request_dir(self, task_pointer_file: str = None, branch_name: str = None) -> str: + """Determine the pull request directory via the task pointer file""" + return self._read_pull_request_dir_from_file(task_pointer_file=task_pointer_file, branch_name=branch_name) + + @log_function_entry_exit() + def _get_branch_from_name(self, branch_name: str = None) -> Optional[Branch]: + """ + Get a branch object from its name. + """ + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + + try: + branch = self.git_repo.get_branch(branch_name) + log_message(LoggingScope.TASK_OPS, "INFO", "branch '%s' exists: '%s'", branch_name, branch) + return branch + except Exception as err: + log_message(LoggingScope.TASK_OPS, "ERROR", "error checking if branch '%s' exists: '%s'", + branch_name, err) + return None + + @log_function_entry_exit() + def _read_task_state_from_file(self, path: str, branch_name: str = None) -> EESSITaskState: + """ + Read the task state from the file in the given branch. + """ + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + content = self.git_repo.get_contents(path, ref=branch_name) + + # Decode the content from base64 + content_str = content.decoded_content.decode("utf-8").strip() + log_message(LoggingScope.TASK_OPS, "INFO", "content in TaskState file: '%s'", content_str) + + task_state = EESSITaskState.from_string(content_str) + log_message(LoggingScope.TASK_OPS, "INFO", "task state: '%s'", task_state) + + return task_state + + @log_function_entry_exit() + def determine_state(self, branch: str = None) -> EESSITaskState: + """ + Determine the state of the task based on the state of the staging repository. 
+ """ + # check if path representing the task file exists in the default branch or the "feature" branch + task_pointer_file = self.description.task_object.remote_file_path + branch_to_use = self.git_repo.default_branch if branch is None else branch + + if self._path_exists_in_branch(task_pointer_file, branch_name=branch_to_use): + log_message(LoggingScope.TASK_OPS, "INFO", "path '%s' exists in branch '%s'", + task_pointer_file, branch_to_use) + + # get state from task file in branch to use + # - read the EESSITaskState file in pull request directory + pull_request_dir = self._determine_pull_request_dir(branch_name=branch_to_use) + log_message(LoggingScope.TASK_OPS, "INFO", "pull request directory: '%s'", pull_request_dir) + task_state_file_path = f"{pull_request_dir}/TaskState" + log_message(LoggingScope.TASK_OPS, "INFO", "task state file path: '%s'", task_state_file_path) + task_state = self._read_task_state_from_file(task_state_file_path, branch_to_use) + + log_message(LoggingScope.TASK_OPS, "INFO", "task state in branch '%s': %s", + branch_to_use, task_state) + return task_state + else: + log_message(LoggingScope.TASK_OPS, "INFO", "path '%s' does not exist in branch '%s'", + task_pointer_file, branch_to_use) + return EESSITaskState.UNDETERMINED + + @log_function_entry_exit() + def handle(self): + """ + Dynamically find and execute the appropriate handler based on action and state. + """ + state_before_handle = self.determine_state() + + # Construct handler method name + handler_name = f"_handle_{self.action}_{str(state_before_handle).lower()}" + + # Check if the handler exists + handler = getattr(self, handler_name, None) + + if handler and callable(handler): + # Execute the handler if it exists + return handler() + else: + # Default behavior for missing handlers + log_message(LoggingScope.TASK_OPS, "ERROR", + "No handler for action '%s' and state '%s' implemented; nothing to be done", + self.action, state_before_handle) + return state_before_handle + + # Implement handlers for ADD action + @log_function_entry_exit() + def _safe_create_file(self, path: str, message: str, content: str, branch_name: str = None): + """Create a file in the given branch.""" + try: + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + existing_file = self.git_repo.get_contents(path, ref=branch_name) + log_message(LoggingScope.TASK_OPS, "INFO", "File '%s' already exists", path) + return existing_file + except GithubException as err: + if err.status == 404: # File doesn't exist + # Safe to create + return self.git_repo.create_file(path, message, content, branch=branch_name) + else: + raise err # Some other error + + @log_function_entry_exit() + def _create_multi_file_commit(self, files_data, commit_message, branch_name: str = None): + """ + Create a commit with multiple file changes + + files_data: dict with structure: + { + "path/to/file1.txt": { + "content": "file content", + "mode": "100644" # optional, defaults to 100644 + }, + "path/to/file2.py": { + "content": "print('hello')", + "mode": "100644" + } + } + """ + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + ref = self.git_repo.get_git_ref(f"heads/{branch_name}") + current_commit = self.git_repo.get_git_commit(ref.object.sha) + base_tree = current_commit.tree + + # Create tree elements + tree_elements = [] + for file_path, file_info in files_data.items(): + content = file_info["content"] + if isinstance(content, str): + content = content.encode("utf-8") + + blob = 
self.git_repo.create_git_blob( + base64.b64encode(content).decode("utf-8"), + "base64" + ) + tree_elements.append(InputGitTreeElement( + path=file_path, + mode=file_info.get("mode", "100644"), + type="blob", + sha=blob.sha + )) + + # Create new tree + new_tree = self.git_repo.create_git_tree(tree_elements, base_tree) + + # Create commit + new_commit = self.git_repo.create_git_commit( + commit_message, + new_tree, + [current_commit] + ) + + # Update branch reference + ref.edit(new_commit.sha) + + return new_commit + + @log_function_entry_exit() + def _update_file( + self, file_path: str, new_content: str, commit_message: str, branch_name: str = None + ) -> Optional[Dict]: + try: + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + + # get the current file + file = self.git_repo.get_contents(file_path, ref=branch_name) + + # update the file + result = self.git_repo.update_file( + path=file_path, + message=commit_message, + content=new_content, + sha=file.sha, + branch=branch_name + ) + + log_message(LoggingScope.TASK_OPS, "INFO", + "File updated successfully. Commit SHA: '%s'", result["commit"].sha) + return result + + except Exception as err: + log_message(LoggingScope.TASK_OPS, "ERROR", "Error updating file: '%s'", err) + return None + + @log_function_entry_exit() + def _sorted_list_of_sequence_numbers(self) -> List[int]: + """Create a sorted list of sequence numbers from the pull requests directory""" + # a pull request's directory is of the form REPO/PR/SEQ + # hence, we can get all sequence numbers from the pull requests directory REPO/PR + sequence_numbers = [] + repo_pr_dir = f"{self.description.get_repo_name()}/{self.description.get_pr_number()}" + + # iterate over all directories under repo_pr_dir + try: + directories = self._list_directory_contents(repo_pr_dir) + for dir in directories: + # check if the directory is a number + if dir.name.isdigit(): + sequence_numbers.append(int(dir.name)) + else: + # directory is not a number, so we skip it + continue + except FileNotFoundError: + # repo_pr_dir does not exist, so we return an empty dictionary + log_message(LoggingScope.TASK_OPS, "ERROR", "Pull requests directory '%s' does not exist", repo_pr_dir) + except GithubException as err: + if err.status != 404: # 404 is catched by FileNotFoundError + # some other error than the directory not existing + log_message(LoggingScope.TASK_OPS, "ERROR", + "Some other error than the directory not existing: '%s'", err) + except Exception as err: + log_message(LoggingScope.TASK_OPS, "ERROR", "Unexpected error: '%s'", err) + + return sorted(sequence_numbers) + + @log_function_entry_exit() + def _determine_sequence_number(self) -> int: + """Determine the sequence number for the task""" + + sequence_numbers = self._sorted_list_of_sequence_numbers() + log_message(LoggingScope.TASK_OPS, "INFO", "number of sequence numbers: %d", len(sequence_numbers)) + if len(sequence_numbers) == 0: + return 0 + + log_message(LoggingScope.TASK_OPS, "INFO", "sequence numbers: [%s]", ", ".join(map(str, sequence_numbers))) + + # get the highest sequence number + highest_sequence_number = sequence_numbers[-1] + log_message(LoggingScope.TASK_OPS, "INFO", "highest sequence number: %d", highest_sequence_number) + + pull_request = self._find_pr_for_sequence_number(highest_sequence_number) + log_message(LoggingScope.TASK_OPS, "INFO", "pull request: '%s'", pull_request) + + if pull_request is None: + log_message(LoggingScope.TASK_OPS, "INFO", "Did not find pull request for sequence number %d", + 
highest_sequence_number) + # the directory for the sequence number exists but no PR yet + return highest_sequence_number + else: + log_message(LoggingScope.TASK_OPS, "INFO", "pull request found: '%s'", pull_request) + log_message(LoggingScope.TASK_OPS, "INFO", "pull request state/merged: '%s/%s'", + pull_request.state, str(pull_request.is_merged())) + if pull_request.is_merged(): + # the PR is merged, so we use the next sequence number + return highest_sequence_number + 1 + else: + # the PR is not merged, so we can use the current sequence number + return highest_sequence_number + + @log_function_entry_exit() + def _handle_add_undetermined(self): + """Handler for ADD action in UNDETERMINED state""" + log_message(LoggingScope.TASK_OPS, "INFO", "Handling ADD action in UNDETERMINED state: '%s'", + self.description.get_task_file_name()) + # task is in state UNDETERMINED if there is no pull request directory for the task yet + # + # create pull request directory (REPO/PR/SEQ/TASK_FILE_NAME/) + # create task file in pull request directory (PULL_REQUEST_DIR/TaskDescription) + # create task status file in pull request directory (PULL_REQUEST_DIR/TaskState) + # create pointer file from task file path to pull request directory (remote_file_path -> PULL_REQUEST_DIR) + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + sequence_number = self._determine_sequence_number() # corresponds to an open or yet to be created PR + task_file_name = self.description.get_task_file_name() + # we cannot use self._determine_pull_request_dir() here because it requires a task pointer file + # and we don't have one yet + pull_request_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" + task_description_file_path = f"{pull_request_dir}/TaskDescription" + task_state_file_path = f"{pull_request_dir}/TaskState" + remote_file_path = self.description.task_object.remote_file_path + + files_to_commit = { + task_description_file_path: { + "content": self.description.get_contents(), + "mode": "100644" + }, + task_state_file_path: { + "content": f"{EESSITaskState.NEW_TASK.name}\n", + "mode": "100644" + }, + remote_file_path: { + "content": f"remote_file_path = {remote_file_path}\npull_request_dir = {pull_request_dir}", + "mode": "100644" + } + } + + branch_name = self.git_repo.default_branch + try: + commit = self._create_multi_file_commit( + files_to_commit, + f"new task for {repo_name} PR {pr_number} seq {sequence_number}", + branch_name=branch_name + ) + log_message(LoggingScope.TASK_OPS, "INFO", "commit created: '%s'", commit) + except Exception as err: + log_message(LoggingScope.TASK_OPS, "ERROR", "Error creating commit: '%s'", err) + # TODO: rollback previous changes (task description file, task state file) + return EESSITaskState.UNDETERMINED + + # TODO: verify that the sequence number is still valid (PR corresponding to the sequence number + # is still open or yet to be created); if it is not valid, perform corrective actions + return EESSITaskState.NEW_TASK + + @log_function_entry_exit() + def _update_task_state_file(self, next_state: EESSITaskState, branch_name: str = None) -> Optional[Dict]: + """Update the TaskState file content in default or given branch""" + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + + task_pointer_file = self.description.task_object.remote_file_path + pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, branch_name) + task_state_file_path = f"{pull_request_dir}/TaskState" + 
arch = self.description.get_metadata_filename_components()[3] + commit_message = f"change task state to {next_state} in {branch_name} for {arch}" + result = self._update_file(task_state_file_path, + f"{next_state.name}\n", + commit_message, + branch_name=branch_name) + return result + + @log_function_entry_exit() + def _init_payload_object(self): + """Initialize the payload object""" + if self.payload is not None: + log_message(LoggingScope.TASK_OPS, "INFO", "payload object already initialized") + return + + # get name of of payload from metadata + payload_name = self.description.metadata["payload"]["filename"] + log_message(LoggingScope.TASK_OPS, "INFO", "payload_name: '%s'", payload_name) + + # get config and remote_client from self.description.task_object + config = self.description.task_object.config + remote_client = self.description.task_object.remote_client + + # determine remote_file_path by replacing basename of remote_file_path in self.description.task_object + # with payload_name + description_remote_file_path = self.description.task_object.remote_file_path + payload_remote_file_path = os.path.join(os.path.dirname(description_remote_file_path), payload_name) + log_message(LoggingScope.TASK_OPS, "INFO", "payload_remote_file_path: '%s'", payload_remote_file_path) + + # initialize payload object + payload_object = EESSIDataAndSignatureObject(config, payload_remote_file_path, remote_client) + self.payload = EESSITaskPayload(payload_object) + log_message(LoggingScope.TASK_OPS, "INFO", "payload: '%s'", self.payload) + + @log_function_entry_exit() + def _handle_add_new_task(self): + """Handler for ADD action in NEW_TASK state""" + log_message(LoggingScope.TASK_OPS, "INFO", "Handling ADD action in NEW_TASK state: '%s'", + self.description.get_task_file_name()) + # determine next state + next_state = self._next_state(EESSITaskState.NEW_TASK) + log_message(LoggingScope.TASK_OPS, "INFO", "next_state: '%s'", next_state) + + # initialize payload object + self._init_payload_object() + + # update TaskState file content + self._update_task_state_file(next_state) + + # TODO: verify that the sequence number is still valid (PR corresponding to the sequence number + # is still open or yet to be created); if it is not valid, perform corrective actions + return next_state + + @log_function_entry_exit() + def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: + """ + Find the single PR for the given branch in any state. 
+ + Args: + repo: GitHub repository + branch_name: Name of the branch + + Returns: + PullRequest object if found, None otherwise + """ + try: + prs = [pr for pr in list(self.git_repo.get_pulls(state="all")) + if pr.head.ref == branch_name] + log_message(LoggingScope.TASK_OPS, "INFO", "number of PRs found: %d", len(prs)) + if len(prs): + log_message(LoggingScope.TASK_OPS, "INFO", "1st PR found: %d, '%s'", prs[0].number, prs[0].head.ref) + return prs[0] if prs else None + except Exception as err: + log_message(LoggingScope.TASK_OPS, "ERROR", "Error finding PR for branch '%s': '%s'", branch_name, err) + return None + + @log_function_entry_exit() + def _find_pr_for_sequence_number(self, sequence_number: int) -> Optional[PullRequest]: + """Find the PR for the given sequence number""" + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + feature_branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" + + # list all PRs with head_ref starting with the feature branch name without the sequence number + last_dash = feature_branch_name.rfind("-") + if last_dash != -1: + head_ref_wout_seq_num = feature_branch_name[:last_dash + 1] # +1 to include the separator + else: + head_ref_wout_seq_num = feature_branch_name + + log_message(LoggingScope.TASK_OPS, "INFO", + "searching for PRs whose head_ref starts with: '%s'", head_ref_wout_seq_num) + + all_prs = [pr for pr in list(self.git_repo.get_pulls(state="all")) + if pr.head.ref.startswith(head_ref_wout_seq_num)] + log_message(LoggingScope.TASK_OPS, "INFO", " number of PRs found: %d", len(all_prs)) + for pr in all_prs: + log_message(LoggingScope.TASK_OPS, "INFO", " PR #%d: '%s'", pr.number, pr.head.ref) + + # now, find the PR for the feature branch name (if any) + log_message(LoggingScope.TASK_OPS, "INFO", + "searching PR for feature branch name: '%s'", feature_branch_name) + pull_request = self._find_pr_for_branch(feature_branch_name) + log_message(LoggingScope.TASK_OPS, "INFO", "pull request for branch '%s': '%s'", + feature_branch_name, pull_request) + return pull_request + + @log_function_entry_exit() + def _determine_sequence_number_from_pull_request_directory(self) -> int: + """Determine the sequence number from the pull request directory name""" + task_pointer_file = self.description.task_object.remote_file_path + pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, self.git_repo.default_branch) + # pull_request_dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ (REPO contains a '/' separating the org and repo) + _, _, _, seq, _ = pull_request_dir.split("/") + return int(seq) + + @log_function_entry_exit() + def _determine_feature_branch_name(self) -> str: + """Determine the feature branch name from the pull request directory name""" + task_pointer_file = self.description.task_object.remote_file_path + pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, self.git_repo.default_branch) + # pull_request_dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ (REPO contains a '/' separating the org and repo) + org, repo, pr, seq, _ = pull_request_dir.split("/") + return f"{org}-{repo}-PR-{pr}-SEQ-{seq}" + + @log_function_entry_exit() + def _sync_task_state_file(self, source_branch: str, target_branch: str): + """Update task state file from source to target branch""" + task_pointer_file = self.description.task_object.remote_file_path + pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, self.git_repo.default_branch) + 
task_state_file_path = f"{pull_request_dir}/TaskState" + + try: + # get content from source branch + source_content = self.git_repo.get_contents(task_state_file_path, ref=source_branch) + + # get current file in target branch + target_file = self.git_repo.get_contents(task_state_file_path, ref=target_branch) + + # update if content is different + if source_content.sha != target_file.sha: + result = self.git_repo.update_file( + path=task_state_file_path, + message=f"Sync {task_state_file_path} from {source_branch} to {target_branch}", + content=source_content.decoded_content, + sha=target_file.sha, + branch=target_branch + ) + log_message(LoggingScope.TASK_OPS, "INFO", "Updated '%s'", task_state_file_path) + return result + else: + log_message(LoggingScope.TASK_OPS, "INFO", "No changes needed for '%s'", task_state_file_path) + return None + + except Exception as err: + log_message(LoggingScope.TASK_OPS, "ERROR", "Error syncing task state file: '%s'", err) + return None + + @log_function_entry_exit() + def _update_task_states(self, next_state: EESSITaskState, default_branch_name: str, + approved_state: EESSITaskState, feature_branch_name: str): + """ + Update task states in default and feature branches + + States have to be updated in a specific order and in particular the default branch has to be + merged into the feature branch before the feature branch can be updated to avoid a merge conflict. + + Args: + next_state: next state to be applied to the default branch + default_branch_name: name of the default branch + approved_state: state to be applied to the feature branch + feature_branch_name: name of the feature branch + """ + # TODO: add failure handling (capture failures and return them somehow) + + # update TaskState file content + # - next_state in default branch (interpreted as current state) + # - approved_state in feature branch (interpreted as future state, ie, after + # the PR corresponding to the feature branch will be merged) + + # first, update the task state file in the default branch + self._update_task_state_file(next_state, branch_name=default_branch_name) + + # second, merge default branch into feature branch (to avoid a merge conflict) + # TODO: store arch info (CPU+ACCEL) in task/metdata file and then access that rather + # than using a part of the file name + arch = self.description.get_metadata_filename_components()[3] + commit_message = f"merge {default_branch_name} into {feature_branch_name} for {arch}" + self.git_repo.merge( + head=default_branch_name, + base=feature_branch_name, + commit_message=commit_message + ) + + # last, update task state file in feature branch + self._update_task_state_file(approved_state, branch_name=feature_branch_name) + log_message(LoggingScope.TASK_OPS, "INFO", + "TaskState file updated to '%s' in default branch '%s' and to '%s' in feature branch '%s'", + next_state, default_branch_name, approved_state, feature_branch_name) + + @log_function_entry_exit() + def _create_task_summary(self) -> str: + """Analyse contents of current task and create a file for it in the REPO-PR-SEQ directory.""" + + # determine task summary file path in feature branch on GitHub + feature_branch_name = self._determine_feature_branch_name() + pull_request_dir = self._determine_pull_request_dir(branch_name=feature_branch_name) + task_summary_file_path = f"{pull_request_dir}/TaskSummary.html" + + # check if task summary file already exists in repo on GitHub + if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): + 
log_message(LoggingScope.TASK_OPS, "INFO", "task summary file already exists: '%s'", task_summary_file_path) + task_summary = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) + # return task_summary.decoded_content + return task_summary + + # create task summary + payload_name = self.description.metadata["payload"]["filename"] + payload_summary = self.payload.analyse_contents(self.config) + metadata_contents = self.description.get_contents() + + task_summary = self.config["github"]["task_summary_payload_template"].format( + payload_name=payload_name, + metadata_contents=metadata_contents, + payload_overview=payload_summary + ) + + # create HTML file with task summary in REPO-PR-SEQ directory + # TODO: add failure handling (capture result and act on it) + task_file_name = self.description.get_task_file_name() + commit_message = f"create summary for {task_file_name} in {feature_branch_name}" + self._safe_create_file(task_summary_file_path, commit_message, task_summary, + branch_name=feature_branch_name) + log_message(LoggingScope.TASK_OPS, "INFO", "task summary file created: '%s'", task_summary_file_path) + + # return task summary + return task_summary + + @log_function_entry_exit() + def _create_pr_contents_overview(self) -> str: + """Create a contents overview for the pull request""" + # TODO: implement + feature_branch_name = self._determine_feature_branch_name() + task_pointer_file = self.description.task_object.remote_file_path + pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, feature_branch_name) + pr_dir = os.path.dirname(pull_request_dir) + directories = self._list_directory_contents(pr_dir, feature_branch_name) + contents_overview = "" + if directories: + contents_overview += "\n" + for directory in directories: + task_summary_file_path = f"{pr_dir}/{directory.name}/TaskSummary.html" + if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): + file_contents = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) + task_summary = base64.b64decode(file_contents.content).decode("utf-8") + contents_overview += f"{task_summary}\n" + else: + contents_overview += f"Task summary file not found: {task_summary_file_path}\n" + contents_overview += "\n" + else: + contents_overview += "No tasks found in this PR\n" + + print(f"contents_overview: {contents_overview}") + return contents_overview + + @log_function_entry_exit() + def _create_pull_request(self, feature_branch_name: str, default_branch_name: str): + """ + Create a PR from the feature branch to the default branch + + Args: + feature_branch_name: name of the feature branch + default_branch_name: name of the default branch + """ + pr_title_format = self.config["github"]["grouped_pr_title"] + pr_body_format = self.config["github"]["grouped_pr_body"] + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + pr_url = f"https://github.com/{repo_name}/pull/{pr_number}" + seq_num = self._determine_sequence_number_from_pull_request_directory() + pr_title = pr_title_format.format( + cvmfs_repo=self.cvmfs_repo, + pr=pr_number, + repo=repo_name, + seq_num=seq_num, + ) + self._create_task_summary() + contents_overview = self._create_pr_contents_overview() + pr_body = pr_body_format.format( + cvmfs_repo=self.cvmfs_repo, + pr=pr_number, + pr_url=pr_url, + repo=repo_name, + seq_num=seq_num, + contents=contents_overview, + analysis="
TO BE DONE
", + action="
TO BE DONE
", + ) + pr = self.git_repo.create_pull( + title=pr_title, + body=pr_body, + head=feature_branch_name, + base=default_branch_name + ) + log_message(LoggingScope.TASK_OPS, "INFO", "PR created: '%s'", pr) + + @log_function_entry_exit() + def _update_pull_request(self, pull_request: PullRequest): + """ + Update the pull request + + Args: + pull_request: instance of the pull request + """ + # TODO: update sections (contents analysis, action) + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + pr_url = f"https://github.com/{repo_name}/pull/{pr_number}" + seq_num = self._determine_sequence_number_from_pull_request_directory() + + self._create_task_summary() + contents_overview = self._create_pr_contents_overview() + pr_body_format = self.config["github"]["grouped_pr_body"] + pr_body = pr_body_format.format( + cvmfs_repo=self.cvmfs_repo, + pr=pr_number, + pr_url=pr_url, + repo=repo_name, + seq_num=seq_num, + contents=contents_overview, + analysis="
TO BE DONE
", + action="
TO BE DONE
", + ) + pull_request.edit(body=pr_body) + + log_message(LoggingScope.TASK_OPS, "INFO", "PR updated: '%s'", pull_request) + + @log_function_entry_exit() + def _handle_add_payload_staged(self): + """Handler for ADD action in PAYLOAD_STAGED state""" + log_message(LoggingScope.TASK_OPS, "INFO", "Handling ADD action in PAYLOAD_STAGED state: '%s'", + self.description.get_task_file_name()) + next_state = self._next_state(EESSITaskState.PAYLOAD_STAGED) + approved_state = EESSITaskState.APPROVED + log_message(LoggingScope.TASK_OPS, "INFO", "next_state: '%s', approved_state: '%s'", next_state, approved_state) + + default_branch_name = self.git_repo.default_branch + default_branch = self._get_branch_from_name(default_branch_name) + default_sha = default_branch.commit.sha + feature_branch_name = self._determine_feature_branch_name() + feature_branch = self._get_branch_from_name(feature_branch_name) + if not feature_branch: + # feature branch does not exist + # TODO: could have been merged already --> check if PR corresponding to the feature branch exists + # ASSUME: it has not existed before --> create it + log_message(LoggingScope.TASK_OPS, "INFO", + "branch '%s' does not exist, creating it", feature_branch_name) + + feature_branch = self.git_repo.create_git_ref(f"refs/heads/{feature_branch_name}", default_sha) + log_message(LoggingScope.TASK_OPS, "INFO", + "branch '%s' created: '%s'", feature_branch_name, feature_branch) + else: + log_message(LoggingScope.TASK_OPS, "INFO", + "found existing branch for '%s': '%s'", feature_branch_name, feature_branch) + + pull_request = self._find_pr_for_branch(feature_branch_name) + if not pull_request: + log_message(LoggingScope.TASK_OPS, "INFO", + "no PR found for branch '%s'", feature_branch_name) + + # TODO: add failure handling (capture result and act on it) + self._update_task_states(next_state, default_branch_name, approved_state, feature_branch_name) + + # TODO: add failure handling (capture result and act on it) + self._create_pull_request(feature_branch_name, default_branch_name) + + return EESSITaskState.PULL_REQUEST + else: + log_message(LoggingScope.TASK_OPS, "INFO", + "found existing PR for branch '%s': '%s'", feature_branch_name, pull_request) + # TODO: check if PR is open or closed + if pull_request.state == "closed": + log_message(LoggingScope.TASK_OPS, "INFO", + "PR '%s' is closed, creating issue", pull_request) + # TODO: create issue + return EESSITaskState.PAYLOAD_STAGED + else: + log_message(LoggingScope.TASK_OPS, "INFO", + "PR '%s' is open, updating task states", pull_request) + # TODO: add failure handling (capture result and act on it) + # THINK about what a failure would mean and what to do about it. 
+ self._update_task_states(next_state, default_branch_name, approved_state, feature_branch_name) + + # TODO: add failure handling (capture result and act on it) + self._update_pull_request(pull_request) + + return EESSITaskState.PULL_REQUEST + + @log_function_entry_exit() + def _handle_add_pull_request(self): + """Handler for ADD action in PULL_REQUEST state""" + log_message(LoggingScope.TASK_OPS, "INFO", "Handling ADD action in PULL_REQUEST state: '%s'", + self.description.get_task_file_name()) + # Implementation for adding in PULL_REQUEST state + # we got here because the state of the task is PULL_REQUEST in the default branch + # determine branch and PR and state of PR + # PR is open --> just return EESSITaskState.PULL_REQUEST + # PR is closed & merged --> deployment is approved + # PR is closed & not merged --> deployment is rejected + feature_branch_name = self._determine_feature_branch_name() + # TODO: check if feature branch exists, for now ASSUME it does + pull_request = self._find_pr_for_branch(feature_branch_name) + if pull_request: + log_message(LoggingScope.TASK_OPS, "INFO", + "found PR for branch '%s': '%s'", feature_branch_name, pull_request) + if pull_request.state == "closed": + if pull_request.merged: + log_message(LoggingScope.TASK_OPS, "INFO", + "PR '%s' is closed and merged, returning APPROVED state", pull_request) + # TODO: How could we ended up here? state in default branch is PULL_REQUEST but + # PR is merged, hence it should have been in the APPROVED state + # ==> for now, just return EESSITaskState.PULL_REQUEST + # + # there is the possibility that the PR was updated just before the + # PR was merged + # WHY is it a problem? because a task may have been accepted that wouldn't + # have been accepted or worse shouldn't been accepted + # WHAT to do? ACCEPT/IGNORE THE ISSUE FOR NOw + # HOWEVER, the contents of the PR directory may be inconsistent with + # respect to the TaskState file and missing TaskSummary.html file + # WE could create an issue and only return EESSITaskState.APPROVED if the + # issue is closed + # WE could also defer all handling of this to the handler for the + # APPROVED state + # NOPE, we have to do some handling here, at least for the tasks where their + # state file did + # --> check if we could have ended up here? If so, create an issue. + # Do we need a state ISSUE_OPENED to avoid processing the task again? + return EESSITaskState.PULL_REQUEST + else: + log_message(LoggingScope.TASK_OPS, "INFO", + "PR '%s' is closed and not merged, returning REJECTED state", pull_request) + # TODO: there is the possibility that the PR was updated just before the + # PR was closed + # WHY is it a problem? because a task may have been rejected that wouldn't + # have been rejected or worse shouldn't been rejected + # WHAT to do? 
ACCEPT/IGNORE THE ISSUE FOR NOw + # HOWEVER, the contents of the PR directory may be inconsistent with + # respect to the TaskState file and missing TaskSummary.html file + # WE could create an issue and only return EESSITaskState.REJECTED if the + # issue is closed + # WE could also defer all handling of this to the handler for the + # REJECTED state + # FOR NOW, we assume that the task was rejected on purpose + # we need to change the state of the task in the default branch to REJECTED + self._update_task_state_file(EESSITaskState.REJECTED) + return EESSITaskState.REJECTED + else: + log_message(LoggingScope.TASK_OPS, "INFO", + "PR '%s' is open, returning PULL_REQUEST state", pull_request) + return EESSITaskState.PULL_REQUEST + else: + log_message(LoggingScope.TASK_OPS, "INFO", + "no PR found for branch '%s'", feature_branch_name) + # the method was called because the state of the task is PULL_REQUEST in the default branch + # however, it's weird that the PR was not found for the feature branch + # TODO: may create or update an issue for the task or deployment + return EESSITaskState.PULL_REQUEST + + return EESSITaskState.PULL_REQUEST + + @log_function_entry_exit() + def _perform_task_action(self) -> bool: + """Perform the task action""" + # TODO: support other actions than ADD + if self.action == EESSITaskAction.ADD: + return self._perform_task_add() + else: + raise ValueError(f"Task action '{self.action}' not supported (yet)") + + @log_function_entry_exit() + def _issue_exists(self, title: str, state: str = "open") -> bool: + """ + Check if an issue with the given title and state already exists. + """ + issues = self.git_repo.get_issues(state=state) + for issue in issues: + if issue.title == title and issue.state == state: + return True + else: + return False + + @log_function_entry_exit() + def _perform_task_add(self) -> bool: + """Perform the ADD task action""" + # TODO: verify checksum here or before? 
+ script = self.config["paths"]["ingestion_script"] + sudo = ["sudo"] if self.config["cvmfs"].getboolean("ingest_as_root", True) else [] + log_message(LoggingScope.STATE_OPS, "INFO", + "Running the ingestion script for '%s'...\n with script: '%s'\n with sudo: '%s'", + self.description.get_task_file_name(), + script, "no" if sudo == [] else "yes") + ingest_cmd = subprocess.run( + sudo + [script, self.cvmfs_repo, str(self.payload.payload_object.local_file_path)], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + log_message(LoggingScope.STATE_OPS, "INFO", + "Ingestion script returned code '%s'", ingest_cmd.returncode) + log_message(LoggingScope.STATE_OPS, "INFO", + "Ingestion script stdout: '%s'", ingest_cmd.stdout.decode("UTF-8")) + log_message(LoggingScope.STATE_OPS, "INFO", + "Ingestion script stderr: '%s'", ingest_cmd.stderr.decode("UTF-8")) + if ingest_cmd.returncode == 0: + next_state = self._next_state(EESSITaskState.APPROVED) + self._update_task_state_file(next_state) + if self.config.has_section("slack") and self.config["slack"].getboolean("ingestion_notification", False): + send_slack_message( + self.config["secrets"]["slack_webhook"], + self.config["slack"]["ingestion_message"].format( + tarball=os.path.basename(self.payload.payload_object.local_file_path), + cvmfs_repo=self.cvmfs_repo) + ) + return True + else: + tarball = os.path.basename(self.payload.payload_object.local_file_path) + log_message(LoggingScope.STATE_OPS, "ERROR", + "Failed to add '%s', return code '%s'", + tarball, + ingest_cmd.returncode) + + issue_title = f"Failed to add '{tarball}'" + log_message(LoggingScope.STATE_OPS, "INFO", + "Creating issue for failed ingestion: title: '%s'", + issue_title) + + command = " ".join(ingest_cmd.args) + failed_ingestion_issue_body = self.config["github"]["failed_ingestion_issue_body"] + issue_body = failed_ingestion_issue_body.format( + command=command, + tarball=tarball, + return_code=ingest_cmd.returncode, + stdout=ingest_cmd.stdout.decode("UTF-8"), + stderr=ingest_cmd.stderr.decode("UTF-8") + ) + log_message(LoggingScope.STATE_OPS, "INFO", + "Creating issue for failed ingestion: body: '%s'", + issue_body) + + if self._issue_exists(issue_title, state="open"): + log_message(LoggingScope.STATE_OPS, "INFO", + "Failed to add '%s', but an open issue already exists, skipping...", + os.path.basename(self.payload.payload_object.local_file_path)) + else: + log_message(LoggingScope.STATE_OPS, "INFO", + "Failed to add '%s', but an open issue does not exist, creating one...", + os.path.basename(self.payload.payload_object.local_file_path)) + self.git_repo.create_issue(title=issue_title, body=issue_body) + return False + + @log_function_entry_exit() + def _handle_add_approved(self): + """Handler for ADD action in APPROVED state""" + log_message(LoggingScope.TASK_OPS, "INFO", "Handling ADD action in APPROVED state: '%s'", + self.description.get_task_file_name()) + # Implementation for adding in APPROVED state + # If successful, _perform_task_action() will change the state + # to INGESTED on GitHub + try: + if self._perform_task_action(): + return EESSITaskState.INGESTED + else: + return EESSITaskState.APPROVED + except Exception as err: + log_message(LoggingScope.TASK_OPS, "ERROR", + "Error performing task action: '%s'\nTraceback:\n%s", err, traceback.format_exc()) + return EESSITaskState.APPROVED + + @log_function_entry_exit() + def _handle_add_ingested(self): + """Handler for ADD action in INGESTED state""" + log_message(LoggingScope.TASK_OPS, "INFO", "Handling ADD action in 
INGESTED state: '%s'", + self.description.get_task_file_name()) + # Implementation for adding in INGESTED state + # DONT change state on GitHub, because the result + # (INGESTED/REJECTED) would be overwritten + return EESSITaskState.DONE + + @log_function_entry_exit() + def _handle_add_rejected(self): + """Handler for ADD action in REJECTED state""" + log_message(LoggingScope.TASK_OPS, "INFO", "Handling ADD action in REJECTED state: '%s'", + self.description.get_task_file_name()) + # Implementation for adding in REJECTED state + # DONT change state on GitHub, because the result + # (INGESTED/REJECTED) would be overwritten + return EESSITaskState.DONE + + @log_function_entry_exit() + def __str__(self): + return f"EESSITask(description={self.description}, action={self.action}, state={self.determine_state()})" diff --git a/scripts/automated_ingestion/ingest_bundles.py b/scripts/automated_ingestion/ingest_bundles.py index 0662fc83..15fb416c 100644 --- a/scripts/automated_ingestion/ingest_bundles.py +++ b/scripts/automated_ingestion/ingest_bundles.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 from eessi_data_object import EESSIDataAndSignatureObject -# from eessi_task import EESSITask, TaskState +from eessi_task import EESSITask from eessi_task_description import EESSITaskDescription from eessi_s3_bucket import EESSIS3Bucket from eessi_logging import error, log_function_entry_exit, log_message, LoggingScope, LOG_LEVELS, set_logging_scopes @@ -10,10 +10,9 @@ import argparse import configparser -# import github +import github import json import logging -# import os import sys from pathlib import Path from typing import List @@ -152,8 +151,8 @@ def main(): # TODO: check configuration: secrets, paths, permissions on dirs, etc extensions = args.extensions.split(",") - # gh_pat = config["secrets"]["github_pat"] - # gh_staging_repo = github.Github(gh_pat).get_repo(config["github"]["staging_repo"]) + gh_pat = config["secrets"]["github_pat"] + gh_staging_repo = github.Github(gh_pat).get_repo(config["github"]["staging_repo"]) buckets = json.loads(config["aws"]["staging_buckets"]) for bucket, cvmfs_repo in buckets.items(): @@ -171,21 +170,20 @@ def main(): log_message(LoggingScope.GROUP_OPS, "INFO", "Processing task: '%s'", task_path) try: - _ = EESSITaskDescription(EESSIDataAndSignatureObject(config, task_path, s3_bucket)) -# # create EESSITask for the task file -# try: -# task = EESSITask( -# EESSITaskDescription(EESSIDataAndSignatureObject(config, task_path, s3_bucket)), -# config, cvmfs_repo, gh_staging_repo -# ) -# -# except Exception as err: -# log_message(LoggingScope.ERROR, "ERROR", "Failed to create EESSITask for task %s: %s", -# task_path, str(err)) -# continue -# -# log_message(LoggingScope.GROUP_OPS, "INFO", "Task: %s", task) -# + # create EESSITask for the task file + try: + task = EESSITask( + EESSITaskDescription(EESSIDataAndSignatureObject(config, task_path, s3_bucket)), + config, cvmfs_repo, gh_staging_repo + ) + + except Exception as err: + log_message(LoggingScope.ERROR, "ERROR", "Failed to create EESSITask for task '%s': '%s'", + task_path, str(err)) + continue + + log_message(LoggingScope.GROUP_OPS, "INFO", "Task: %s", task) + # previous_state = None # current_state = task.determine_state() # log_message(LoggingScope.GROUP_OPS, "INFO", "Task '%s' is in state '%s'", From 530ac6dd769d121b070ca25127141dfcabc3614f Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 20:33:08 +0200 Subject: [PATCH 19/26] add TASK_OPS_DETAILS log scope and reduce log noise --- 
scripts/automated_ingestion/eessi_logging.py | 21 +-- scripts/automated_ingestion/eessi_task.py | 120 +++++++++--------- scripts/automated_ingestion/ingest_bundles.py | 2 +- 3 files changed, 75 insertions(+), 68 deletions(-) diff --git a/scripts/automated_ingestion/eessi_logging.py b/scripts/automated_ingestion/eessi_logging.py index 857d92ca..f94c8495 100644 --- a/scripts/automated_ingestion/eessi_logging.py +++ b/scripts/automated_ingestion/eessi_logging.py @@ -21,17 +21,18 @@ class LoggingScope(IntFlag): """Enumeration of different logging scopes.""" NONE = 0 - FUNC_ENTRY_EXIT = auto() # Function entry/exit logging - DOWNLOAD = auto() # Logging related to file downloads - VERIFICATION = auto() # Logging related to signature and checksum verification - STATE_OPS = auto() # Logging related to tarball state operations - GITHUB_OPS = auto() # Logging related to GitHub operations (PRs, issues, etc.) - GROUP_OPS = auto() # Logging related to tarball group operations - TASK_OPS = auto() # Logging related to task operations - ERROR = auto() # Error logging (separate from other scopes for easier filtering) - DEBUG = auto() # Debug-level logging (separate from other scopes for easier filtering) + FUNC_ENTRY_EXIT = auto() # Function entry/exit logging + DOWNLOAD = auto() # Logging related to file downloads + VERIFICATION = auto() # Logging related to signature and checksum verification + STATE_OPS = auto() # Logging related to tarball state operations + GITHUB_OPS = auto() # Logging related to GitHub operations (PRs, issues, etc.) + GROUP_OPS = auto() # Logging related to tarball group operations + TASK_OPS = auto() # Logging related to task operations + TASK_OPS_DETAILS = auto() # Logging related to task operations (detailed) + ERROR = auto() # Error logging (separate from other scopes for easier filtering) + DEBUG = auto() # Debug-level logging (separate from other scopes for easier filtering) ALL = (FUNC_ENTRY_EXIT | DOWNLOAD | VERIFICATION | STATE_OPS | - GITHUB_OPS | GROUP_OPS | TASK_OPS | ERROR | DEBUG) + GITHUB_OPS | GROUP_OPS | TASK_OPS | TASK_OPS_DETAILS | ERROR | DEBUG) # Global setting for logging scopes diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index f2369db6..da7e49e6 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -34,15 +34,15 @@ class EESSITaskState(Enum): def from_string( cls, name: str, default: Optional["EESSITaskState"] = None, case_sensitive: bool = False ) -> "EESSITaskState": - log_message(LoggingScope.TASK_OPS, "INFO", "from_string: '%s'", name) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "from_string: '%s'", name) if case_sensitive: to_return = cls.__members__.get(name, default) - log_message(LoggingScope.TASK_OPS, "INFO", "from_string will return: '%s'", to_return) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "from_string will return: '%s'", to_return) return to_return try: to_return = cls[name.upper()] - log_message(LoggingScope.TASK_OPS, "INFO", "from_string will return: '%s'", to_return) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "from_string will return: '%s'", to_return) return to_return except KeyError: return default @@ -134,32 +134,32 @@ def _state_file_with_prefix_exists_in_repo_branch(self, file_path_prefix: str, b # get all files in directory part of file_path_prefix directory_part = os.path.dirname(file_path_prefix) files = self.git_repo.get_contents(directory_part, ref=branch_name) - log_msg = "Found files %s in 
directory %s in branch %s" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, files, directory_part, branch_name) + log_msg = "Found files '%s' in directory '%s' in branch '%s'" + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", log_msg, files, directory_part, branch_name) # check if any of the files has file_path_prefix as prefix for file in files: if file.path.startswith(file_path_prefix): - log_msg = "Found file %s in directory %s in branch %s" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file.path, directory_part, branch_name) + log_msg = "Found file '%s' in directory '%s' in branch '%s'" + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", log_msg, file.path, directory_part, branch_name) return True - log_msg = "No file with prefix %s found in directory %s in branch %s" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file_path_prefix, directory_part, branch_name) + log_msg = "No file with prefix '%s' found in directory '%s' in branch '%s'" + log_message(LoggingScope.TASK_OPS, "INFO", log_msg, file_path_prefix, directory_part, branch_name) return False except UnknownObjectException: # file_path does not exist in branch - log_msg = "Directory %s or file with prefix %s does not exist in branch %s" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, directory_part, file_path_prefix, branch_name) + log_msg = "Directory '%s' or file with prefix '%s' does not exist in branch '%s'" + log_message(LoggingScope.TASK_OPS, "INFO", log_msg, directory_part, file_path_prefix, branch_name) return False except GithubException as err: if err.status == 404: # file_path does not exist in branch - log_msg = "Directory %s or file with prefix %s does not exist in branch %s" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, directory_part, file_path_prefix, branch_name) + log_msg = "Directory '%s' or file with prefix '%s' does not exist in branch '%s'" + log_message(LoggingScope.TASK_OPS, "INFO", log_msg, directory_part, file_path_prefix, branch_name) return False else: # if there was some other (e.g. connection) issue, log message and return False - log_msg = 'Unable to determine the state of %s, the GitHub API returned status %s!' - log_message(LoggingScope.ERROR, 'WARNING', log_msg, self.object, err.status) + log_msg = "Unable to determine the state of '%s', the GitHub API returned status '%s'!" + log_message(LoggingScope.ERROR, "WARNING", log_msg, self.object, err.status) return False return False @@ -323,23 +323,24 @@ def _find_state(self) -> EESSITaskState: The state of the task. 
""" # obtain repo and pr from metadata - log_message(LoggingScope.TASK_OPS, "INFO", "finding state of task '%s'", self.description.task_object) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "finding state of task '%s'", self.description.task_object) repo = self.description.get_repo_name() pr = self.description.get_pr_number() - log_message(LoggingScope.TASK_OPS, "INFO", "repo: '%s', pr: '%s'", repo, pr) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "repo: '%s', pr: '%s'", repo, pr) # obtain all sequence numbers in repo/pr dir which include a state file for this task sequence_numbers = self._determine_sequence_numbers_including_task_file(repo, pr) if len(sequence_numbers) == 0: # no sequence numbers found, so we return NEW_TASK - log_message(LoggingScope.TASK_OPS, "INFO", "no sequence numbers found, state: NEW_TASK") + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "no sequence numbers found, state: NEW_TASK") return EESSITaskState.NEW_TASK # we got at least one sequence number # if one value for a sequence number is True, we can determine the state from the file in the directory sequence_including_task = [key for key, value in sequence_numbers.items() if value is True] if len(sequence_including_task) == 0: # no sequence number includes the task file, so we return NEW_TASK - log_message(LoggingScope.TASK_OPS, "INFO", "no sequence number includes the task file, state: NEW_TASK") + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", + "no sequence number includes the task file, state: NEW_TASK") return EESSITaskState.NEW_TASK # we got at least one sequence number which includes the task file # we can determine the state from the filename in the directory @@ -349,7 +350,7 @@ def _find_state(self) -> EESSITaskState: task_file_name = self.description.get_task_file_name() metadata_file_state_path_prefix = f"{repo}/{pr}/{sequence_number}/{task_file_name}." 
state = self._get_state_for_metadata_file_prefix(metadata_file_state_path_prefix, sequence_number) - log_message(LoggingScope.TASK_OPS, "INFO", "state: '%s'", state) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "state: '%s'", state) return state @log_function_entry_exit() @@ -377,7 +378,7 @@ def _get_state_for_metadata_file_prefix(self, metadata_file_state_path_prefix: s if file.path.startswith(metadata_file_state_path_prefix): # get state from file name taking only the suffix state = EESSITaskState.from_string(file.name.split(".")[-1]) - log_message(LoggingScope.TASK_OPS, "INFO", "state: '%s'", state) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "state: '%s'", state) states.append(state) if len(states) == 0: # did not find any file with metadata_file_state_path_prefix as prefix @@ -387,7 +388,7 @@ def _get_state_for_metadata_file_prefix(self, metadata_file_state_path_prefix: s # sort the states and return the last one states.sort() state = states[-1] - log_message(LoggingScope.TASK_OPS, "INFO", "state: '%s'", state) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "state: '%s'", state) return state @log_function_entry_exit() @@ -398,7 +399,7 @@ def _list_directory_contents(self, directory_path: str, branch_name: str = None) try: # Get contents of the directory branch_name = self.git_repo.default_branch if branch_name is None else branch_name - log_message(LoggingScope.TASK_OPS, "INFO", "listing contents of '%s' in branch '%s'", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "listing contents of '%s' in branch '%s'", directory_path, branch_name) contents = self.git_repo.get_contents(directory_path, ref=branch_name) @@ -524,15 +525,15 @@ def determine_state(self, branch: str = None) -> EESSITaskState: branch_to_use = self.git_repo.default_branch if branch is None else branch if self._path_exists_in_branch(task_pointer_file, branch_name=branch_to_use): - log_message(LoggingScope.TASK_OPS, "INFO", "path '%s' exists in branch '%s'", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "path '%s' exists in branch '%s'", task_pointer_file, branch_to_use) # get state from task file in branch to use # - read the EESSITaskState file in pull request directory pull_request_dir = self._determine_pull_request_dir(branch_name=branch_to_use) - log_message(LoggingScope.TASK_OPS, "INFO", "pull request directory: '%s'", pull_request_dir) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "pull request directory: '%s'", pull_request_dir) task_state_file_path = f"{pull_request_dir}/TaskState" - log_message(LoggingScope.TASK_OPS, "INFO", "task state file path: '%s'", task_state_file_path) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "task state file path: '%s'", task_state_file_path) task_state = self._read_task_state_from_file(task_state_file_path, branch_to_use) log_message(LoggingScope.TASK_OPS, "INFO", "task state in branch '%s': %s", @@ -573,7 +574,7 @@ def _safe_create_file(self, path: str, message: str, content: str, branch_name: try: branch_name = self.git_repo.default_branch if branch_name is None else branch_name existing_file = self.git_repo.get_contents(path, ref=branch_name) - log_message(LoggingScope.TASK_OPS, "INFO", "File '%s' already exists", path) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "File '%s' already exists", path) return existing_file except GithubException as err: if err.status == 404: # File doesn't exist @@ -656,7 +657,7 @@ def _update_file( branch=branch_name ) - log_message(LoggingScope.TASK_OPS, "INFO", + 
log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "File updated successfully. Commit SHA: '%s'", result["commit"].sha) return result @@ -700,18 +701,20 @@ def _determine_sequence_number(self) -> int: """Determine the sequence number for the task""" sequence_numbers = self._sorted_list_of_sequence_numbers() - log_message(LoggingScope.TASK_OPS, "INFO", "number of sequence numbers: %d", len(sequence_numbers)) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "number of sequence numbers: %d", len(sequence_numbers)) if len(sequence_numbers) == 0: + log_message(LoggingScope.TASK_OPS, "INFO", "no sequence numbers found, returning 0") return 0 - log_message(LoggingScope.TASK_OPS, "INFO", "sequence numbers: [%s]", ", ".join(map(str, sequence_numbers))) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", + "sequence numbers: [%s]", ", ".join(map(str, sequence_numbers))) # get the highest sequence number highest_sequence_number = sequence_numbers[-1] - log_message(LoggingScope.TASK_OPS, "INFO", "highest sequence number: %d", highest_sequence_number) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "highest sequence number: %d", highest_sequence_number) pull_request = self._find_pr_for_sequence_number(highest_sequence_number) - log_message(LoggingScope.TASK_OPS, "INFO", "pull request: '%s'", pull_request) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "pull request: '%s'", pull_request) if pull_request is None: log_message(LoggingScope.TASK_OPS, "INFO", "Did not find pull request for sequence number %d", @@ -773,7 +776,7 @@ def _handle_add_undetermined(self): f"new task for {repo_name} PR {pr_number} seq {sequence_number}", branch_name=branch_name ) - log_message(LoggingScope.TASK_OPS, "INFO", "commit created: '%s'", commit) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "commit created: '%s'", commit) except Exception as err: log_message(LoggingScope.TASK_OPS, "ERROR", "Error creating commit: '%s'", err) # TODO: rollback previous changes (task description file, task state file) @@ -803,12 +806,12 @@ def _update_task_state_file(self, next_state: EESSITaskState, branch_name: str = def _init_payload_object(self): """Initialize the payload object""" if self.payload is not None: - log_message(LoggingScope.TASK_OPS, "INFO", "payload object already initialized") + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "payload object already initialized") return # get name of of payload from metadata payload_name = self.description.metadata["payload"]["filename"] - log_message(LoggingScope.TASK_OPS, "INFO", "payload_name: '%s'", payload_name) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "payload_name: '%s'", payload_name) # get config and remote_client from self.description.task_object config = self.description.task_object.config @@ -818,7 +821,7 @@ def _init_payload_object(self): # with payload_name description_remote_file_path = self.description.task_object.remote_file_path payload_remote_file_path = os.path.join(os.path.dirname(description_remote_file_path), payload_name) - log_message(LoggingScope.TASK_OPS, "INFO", "payload_remote_file_path: '%s'", payload_remote_file_path) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "payload_remote_file_path: '%s'", payload_remote_file_path) # initialize payload object payload_object = EESSIDataAndSignatureObject(config, payload_remote_file_path, remote_client) @@ -832,7 +835,7 @@ def _handle_add_new_task(self): self.description.get_task_file_name()) # determine next state next_state = self._next_state(EESSITaskState.NEW_TASK) - 
log_message(LoggingScope.TASK_OPS, "INFO", "next_state: '%s'", next_state) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "next_state: '%s'", next_state) # initialize payload object self._init_payload_object() @@ -859,9 +862,10 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: try: prs = [pr for pr in list(self.git_repo.get_pulls(state="all")) if pr.head.ref == branch_name] - log_message(LoggingScope.TASK_OPS, "INFO", "number of PRs found: %d", len(prs)) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "number of PRs found: %d", len(prs)) if len(prs): - log_message(LoggingScope.TASK_OPS, "INFO", "1st PR found: %d, '%s'", prs[0].number, prs[0].head.ref) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", + "1st PR found: %d, '%s'", prs[0].number, prs[0].head.ref) return prs[0] if prs else None except Exception as err: log_message(LoggingScope.TASK_OPS, "ERROR", "Error finding PR for branch '%s': '%s'", branch_name, err) @@ -881,20 +885,20 @@ def _find_pr_for_sequence_number(self, sequence_number: int) -> Optional[PullReq else: head_ref_wout_seq_num = feature_branch_name - log_message(LoggingScope.TASK_OPS, "INFO", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "searching for PRs whose head_ref starts with: '%s'", head_ref_wout_seq_num) all_prs = [pr for pr in list(self.git_repo.get_pulls(state="all")) if pr.head.ref.startswith(head_ref_wout_seq_num)] - log_message(LoggingScope.TASK_OPS, "INFO", " number of PRs found: %d", len(all_prs)) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", " number of PRs found: %d", len(all_prs)) for pr in all_prs: - log_message(LoggingScope.TASK_OPS, "INFO", " PR #%d: '%s'", pr.number, pr.head.ref) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", " PR #%d: '%s'", pr.number, pr.head.ref) # now, find the PR for the feature branch name (if any) - log_message(LoggingScope.TASK_OPS, "INFO", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "searching PR for feature branch name: '%s'", feature_branch_name) pull_request = self._find_pr_for_branch(feature_branch_name) - log_message(LoggingScope.TASK_OPS, "INFO", "pull request for branch '%s': '%s'", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "pull request for branch '%s': '%s'", feature_branch_name, pull_request) return pull_request @@ -1002,7 +1006,8 @@ def _create_task_summary(self) -> str: # check if task summary file already exists in repo on GitHub if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): - log_message(LoggingScope.TASK_OPS, "INFO", "task summary file already exists: '%s'", task_summary_file_path) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", + "task summary file already exists: '%s'", task_summary_file_path) task_summary = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) # return task_summary.decoded_content return task_summary @@ -1024,7 +1029,7 @@ def _create_task_summary(self) -> str: commit_message = f"create summary for {task_file_name} in {feature_branch_name}" self._safe_create_file(task_summary_file_path, commit_message, task_summary, branch_name=feature_branch_name) - log_message(LoggingScope.TASK_OPS, "INFO", "task summary file created: '%s'", task_summary_file_path) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "task summary file created: '%s'", task_summary_file_path) # return task summary return task_summary @@ -1135,7 +1140,8 @@ def _handle_add_payload_staged(self): self.description.get_task_file_name()) next_state = 
self._next_state(EESSITaskState.PAYLOAD_STAGED) approved_state = EESSITaskState.APPROVED - log_message(LoggingScope.TASK_OPS, "INFO", "next_state: '%s', approved_state: '%s'", next_state, approved_state) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", + "next_state: '%s', approved_state: '%s'", next_state, approved_state) default_branch_name = self.git_repo.default_branch default_branch = self._get_branch_from_name(default_branch_name) @@ -1146,19 +1152,19 @@ def _handle_add_payload_staged(self): # feature branch does not exist # TODO: could have been merged already --> check if PR corresponding to the feature branch exists # ASSUME: it has not existed before --> create it - log_message(LoggingScope.TASK_OPS, "INFO", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "branch '%s' does not exist, creating it", feature_branch_name) feature_branch = self.git_repo.create_git_ref(f"refs/heads/{feature_branch_name}", default_sha) - log_message(LoggingScope.TASK_OPS, "INFO", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "branch '%s' created: '%s'", feature_branch_name, feature_branch) else: - log_message(LoggingScope.TASK_OPS, "INFO", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "found existing branch for '%s': '%s'", feature_branch_name, feature_branch) pull_request = self._find_pr_for_branch(feature_branch_name) if not pull_request: - log_message(LoggingScope.TASK_OPS, "INFO", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "no PR found for branch '%s'", feature_branch_name) # TODO: add failure handling (capture result and act on it) @@ -1169,7 +1175,7 @@ def _handle_add_payload_staged(self): return EESSITaskState.PULL_REQUEST else: - log_message(LoggingScope.TASK_OPS, "INFO", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "found existing PR for branch '%s': '%s'", feature_branch_name, pull_request) # TODO: check if PR is open or closed if pull_request.state == "closed": @@ -1178,7 +1184,7 @@ def _handle_add_payload_staged(self): # TODO: create issue return EESSITaskState.PAYLOAD_STAGED else: - log_message(LoggingScope.TASK_OPS, "INFO", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "PR '%s' is open, updating task states", pull_request) # TODO: add failure handling (capture result and act on it) # THINK about what a failure would mean and what to do about it. @@ -1204,12 +1210,12 @@ def _handle_add_pull_request(self): # TODO: check if feature branch exists, for now ASSUME it does pull_request = self._find_pr_for_branch(feature_branch_name) if pull_request: - log_message(LoggingScope.TASK_OPS, "INFO", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "found PR for branch '%s': '%s'", feature_branch_name, pull_request) if pull_request.state == "closed": if pull_request.merged: log_message(LoggingScope.TASK_OPS, "INFO", - "PR '%s' is closed and merged, returning APPROVED state", pull_request) + "PR '%s' is closed and merged (strange that state is PULL_REQUEST)", pull_request) # TODO: How could we ended up here? 
state in default branch is PULL_REQUEST but # PR is merged, hence it should have been in the APPROVED state # ==> for now, just return EESSITaskState.PULL_REQUEST @@ -1249,11 +1255,11 @@ def _handle_add_pull_request(self): self._update_task_state_file(EESSITaskState.REJECTED) return EESSITaskState.REJECTED else: - log_message(LoggingScope.TASK_OPS, "INFO", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "PR '%s' is open, returning PULL_REQUEST state", pull_request) return EESSITaskState.PULL_REQUEST else: - log_message(LoggingScope.TASK_OPS, "INFO", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "no PR found for branch '%s'", feature_branch_name) # the method was called because the state of the task is PULL_REQUEST in the default branch # however, it's weird that the PR was not found for the feature branch diff --git a/scripts/automated_ingestion/ingest_bundles.py b/scripts/automated_ingestion/ingest_bundles.py index 15fb416c..b4df5f2f 100644 --- a/scripts/automated_ingestion/ingest_bundles.py +++ b/scripts/automated_ingestion/ingest_bundles.py @@ -182,7 +182,7 @@ def main(): task_path, str(err)) continue - log_message(LoggingScope.GROUP_OPS, "INFO", "Task: %s", task) + log_message(LoggingScope.GROUP_OPS, "INFO", "Created EESSITask: '%s'", task) # previous_state = None # current_state = task.determine_state() From a945ece21e6c6e8ec40a150d2f5a15700e072b00 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 20:55:32 +0200 Subject: [PATCH 20/26] remove unused functions --- scripts/automated_ingestion/eessi_task.py | 205 ---------------------- 1 file changed, 205 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index da7e49e6..850c426a 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -219,178 +219,6 @@ def _determine_sequence_numbers_including_task_file(self, repo: str, pr: str) -> return {} return sequence_numbers - @log_function_entry_exit() - def _find_highest_number(self, str_list: List[str]) -> int: - """ - Find the highest number in a list of strings. - """ - # Convert all strings to integers - int_list = [int(num) for num in str_list] - return max(int_list) - - @log_function_entry_exit() - def _get_sequence_number_for_task_file(self) -> int: - """ - Get the sequence number this task is assigned to at the moment. - NOTE, should only be called if the task is actually assigned to a sequence number. - """ - repo_name = self.description.get_repo_name() - pr_number = self.description.get_pr_number() - sequence_numbers = self._determine_sequence_numbers_including_task_file(repo_name, pr_number) - if len(sequence_numbers) == 0: - raise ValueError("Found no sequence numbers at all") - else: - # get all entries with value True, there should be only one, so we return the first one - sequence_numbers_true = [key for key, value in sequence_numbers.items() if value is True] - if len(sequence_numbers_true) == 0: - raise ValueError("Found no sequence numbers that include the task file for task %s", - self.description) - else: - return sequence_numbers_true[0] - - @log_function_entry_exit() - def _get_current_sequence_number(self, sequence_numbers: Dict[int, bool] = None) -> int: - """ - Get the current sequence number based on the sequence numbers. - If sequence_numbers is not provided, we determine the sequence numbers from the task description. 
- """ - if sequence_numbers is None: - repo_name = self.description.get_repo_name() - pr_number = self.description.get_pr_number() - sequence_numbers = self._determine_sequence_numbers_including_task_file(repo_name, pr_number) - if len(sequence_numbers) == 0: - return 0 - return self._find_highest_number(sequence_numbers.keys()) - - @log_function_entry_exit() - def _get_fixed_sequence_number(self) -> int: - """ - Get a fixed sequence number. - """ - return 11 - - @log_function_entry_exit() - def _find_staging_pr(self) -> Tuple[Optional[PullRequest], Optional[str], Optional[int]]: - """ - Find the staging PR for the task. - TODO: arg sequence number --> make function simpler - """ - repo_name = self.description.get_repo_name() - pr_number = self.description.get_pr_number() - try: - sequence_number = self._get_sequence_number_for_task_file() - except ValueError: - # no sequence number found, so we return None - log_message(LoggingScope.ERROR, "ERROR", "no sequence number found for task '%s'", self.description) - return None, None, None - except Exception as err: - # some other error - log_message(LoggingScope.ERROR, "ERROR", "error finding staging PR for task '%s': '%s'", - self.description, err) - return None, None, None - branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" - if branch_name in [branch.name for branch in self.git_repo.get_branches()]: - find_pr = [pr for pr in self.git_repo.get_pulls(head=branch_name, state="all")] - if find_pr: - pr = find_pr.pop(0) - return pr, branch_name, sequence_number - else: - return None, branch_name, sequence_number - else: - return None, None, None - - @log_function_entry_exit() - def _create_staging_pr(self, sequence_number: int) -> Tuple[PullRequest, str]: - """ - Create a staging PR for the task. - NOTE, SHALL only be called if no staging PR for the task exists yet. - """ - repo_name = self.description.get_repo_name() - pr_number = self.description.get_pr_number() - branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" - default_branch_name = self.git_repo.default_branch - pr = self.git_repo.create_pull(title=f"Add task for {repo_name} PR {pr_number} seq {sequence_number}", - body=f"Add task for {repo_name} PR {pr_number} seq {sequence_number}", - head=branch_name, base=default_branch_name) - return pr, branch_name - - @log_function_entry_exit() - def _find_state(self) -> EESSITaskState: - """ - Determine the state of the task based on the task description metadata. - - Returns: - The state of the task. 
- """ - # obtain repo and pr from metadata - log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "finding state of task '%s'", self.description.task_object) - repo = self.description.get_repo_name() - pr = self.description.get_pr_number() - log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "repo: '%s', pr: '%s'", repo, pr) - - # obtain all sequence numbers in repo/pr dir which include a state file for this task - sequence_numbers = self._determine_sequence_numbers_including_task_file(repo, pr) - if len(sequence_numbers) == 0: - # no sequence numbers found, so we return NEW_TASK - log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "no sequence numbers found, state: NEW_TASK") - return EESSITaskState.NEW_TASK - # we got at least one sequence number - # if one value for a sequence number is True, we can determine the state from the file in the directory - sequence_including_task = [key for key, value in sequence_numbers.items() if value is True] - if len(sequence_including_task) == 0: - # no sequence number includes the task file, so we return NEW_TASK - log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", - "no sequence number includes the task file, state: NEW_TASK") - return EESSITaskState.NEW_TASK - # we got at least one sequence number which includes the task file - # we can determine the state from the filename in the directory - # NOTE, we use the first element in sequence_including_task (there should be only one) - # we ignore other elements in sequence_including_task - sequence_number = sequence_including_task[0] - task_file_name = self.description.get_task_file_name() - metadata_file_state_path_prefix = f"{repo}/{pr}/{sequence_number}/{task_file_name}." - state = self._get_state_for_metadata_file_prefix(metadata_file_state_path_prefix, sequence_number) - log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "state: '%s'", state) - return state - - @log_function_entry_exit() - def _get_state_for_metadata_file_prefix(self, metadata_file_state_path_prefix: str, - sequence_number: int) -> EESSITaskState: - """ - Get the state from the file in the metadata_file_state_path_prefix. 
- """ - # depending on the state of the deployment (NEW_TASK, PAYLOAD_STAGED, PULL_REQUEST, APPROVED, REJECTED, - # INGESTED, DONE) - # we need to check the task file in the default branch or in the branch corresponding to the sequence number - directory_part = os.path.dirname(metadata_file_state_path_prefix) - repo_name = self.description.get_repo_name() - pr_number = self.description.get_pr_number() - default_branch_name = self.git_repo.default_branch - branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" - all_branch_names = [branch.name for branch in self.git_repo.get_branches()] - states = [] - for branch in [default_branch_name, branch_name]: - if branch in all_branch_names: - # first get all files in directory part of metadata_file_state_path_prefix - files = self._list_directory_contents(directory_part, branch) - # check if any of the files has metadata_file_state_path_prefix as prefix - for file in files: - if file.path.startswith(metadata_file_state_path_prefix): - # get state from file name taking only the suffix - state = EESSITaskState.from_string(file.name.split(".")[-1]) - log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "state: '%s'", state) - states.append(state) - if len(states) == 0: - # did not find any file with metadata_file_state_path_prefix as prefix - log_message(LoggingScope.TASK_OPS, "INFO", "did not find any file with prefix '%s'", - metadata_file_state_path_prefix) - return EESSITaskState.NEW_TASK - # sort the states and return the last one - states.sort() - state = states[-1] - log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "state: '%s'", state) - return state - @log_function_entry_exit() def _list_directory_contents(self, directory_path: str, branch_name: str = None) -> List[Any]: """ @@ -920,39 +748,6 @@ def _determine_feature_branch_name(self) -> str: org, repo, pr, seq, _ = pull_request_dir.split("/") return f"{org}-{repo}-PR-{pr}-SEQ-{seq}" - @log_function_entry_exit() - def _sync_task_state_file(self, source_branch: str, target_branch: str): - """Update task state file from source to target branch""" - task_pointer_file = self.description.task_object.remote_file_path - pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, self.git_repo.default_branch) - task_state_file_path = f"{pull_request_dir}/TaskState" - - try: - # get content from source branch - source_content = self.git_repo.get_contents(task_state_file_path, ref=source_branch) - - # get current file in target branch - target_file = self.git_repo.get_contents(task_state_file_path, ref=target_branch) - - # update if content is different - if source_content.sha != target_file.sha: - result = self.git_repo.update_file( - path=task_state_file_path, - message=f"Sync {task_state_file_path} from {source_branch} to {target_branch}", - content=source_content.decoded_content, - sha=target_file.sha, - branch=target_branch - ) - log_message(LoggingScope.TASK_OPS, "INFO", "Updated '%s'", task_state_file_path) - return result - else: - log_message(LoggingScope.TASK_OPS, "INFO", "No changes needed for '%s'", task_state_file_path) - return None - - except Exception as err: - log_message(LoggingScope.TASK_OPS, "ERROR", "Error syncing task state file: '%s'", err) - return None - @log_function_entry_exit() def _update_task_states(self, next_state: EESSITaskState, default_branch_name: str, approved_state: EESSITaskState, feature_branch_name: str): From ea89ee0e9e01b86eb0a9c94eb58bab4a52c081ef Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 
2025 21:01:22 +0200 Subject: [PATCH 21/26] enable task handling --- scripts/automated_ingestion/ingest_bundles.py | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/scripts/automated_ingestion/ingest_bundles.py b/scripts/automated_ingestion/ingest_bundles.py index b4df5f2f..363d7060 100644 --- a/scripts/automated_ingestion/ingest_bundles.py +++ b/scripts/automated_ingestion/ingest_bundles.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 from eessi_data_object import EESSIDataAndSignatureObject -from eessi_task import EESSITask +from eessi_task import EESSITask, EESSITaskState from eessi_task_description import EESSITaskDescription from eessi_s3_bucket import EESSIS3Bucket from eessi_logging import error, log_function_entry_exit, log_message, LoggingScope, LOG_LEVELS, set_logging_scopes @@ -184,22 +184,22 @@ def main(): log_message(LoggingScope.GROUP_OPS, "INFO", "Created EESSITask: '%s'", task) -# previous_state = None -# current_state = task.determine_state() -# log_message(LoggingScope.GROUP_OPS, "INFO", "Task '%s' is in state '%s'", -# task_path, current_state.name) -# while (current_state is not None and -# current_state != TaskState.DONE and -# previous_state != current_state): -# previous_state = current_state -# log_message(LoggingScope.GROUP_OPS, "INFO", -# "Task '%s': BEFORE handle(): previous state = '%s', current state = '%s'", -# task_path, previous_state.name, current_state.name) -# current_state = task.handle() -# log_message(LoggingScope.GROUP_OPS, "INFO", -# "Task '%s': AFTER handle(): previous state = '%s', current state = '%s'", -# task_path, previous_state.name, current_state.name) -# + previous_state = None + current_state = task.determine_state() + log_message(LoggingScope.GROUP_OPS, "INFO", "Task '%s' is in state '%s'", + task_path, current_state.name) + while (current_state is not None and + current_state != EESSITaskState.DONE and + previous_state != current_state): + previous_state = current_state + log_message(LoggingScope.GROUP_OPS, "INFO", + "Task '%s': BEFORE handle(): previous state = '%s', current state = '%s'", + task_path, previous_state.name, current_state.name) + current_state = task.handle() + log_message(LoggingScope.GROUP_OPS, "INFO", + "Task '%s': AFTER handle(): previous state = '%s', current state = '%s'", + task_path, previous_state.name, current_state.name) + except Exception as err: log_message(LoggingScope.ERROR, "ERROR", "Failed to process task '%s': '%s'", task_path, str(err)) continue From 0bb3fc1a1473d22c59c688cc872fd2f8c412d606 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Fri, 4 Jul 2025 20:40:32 +0200 Subject: [PATCH 22/26] fix determining next sequence number when last PR was closed --- scripts/automated_ingestion/eessi_task.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 850c426a..34ed2b24 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -557,8 +557,13 @@ def _determine_sequence_number(self) -> int: # the PR is merged, so we use the next sequence number return highest_sequence_number + 1 else: - # the PR is not merged, so we can use the current sequence number - return highest_sequence_number + # the PR is not merged, it may be closed though + if pull_request.state == 'closed': + # PR has been closed, so we return the next sequence number + return highest_sequence_number + 1 + else: + # PR is not closed, so we return the 
current highest sequence number + return highest_sequence_number @log_function_entry_exit() def _handle_add_undetermined(self): From b1663f1c245285e3e1252eebec8f37069d8f45ab Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 6 Aug 2025 13:54:14 +0200 Subject: [PATCH 23/26] return action ADD if metadata file doesn't define it --- scripts/automated_ingestion/eessi_task.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 34ed2b24..bd2b2320 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -114,7 +114,9 @@ def _determine_task_action(self) -> EESSITaskAction: return EESSITaskAction.ADD elif action_str == "update": return EESSITaskAction.UPDATE - return EESSITaskAction.UNKNOWN + # temporarily return EESSITaskAction.ADD as default because the metadata + # file does not yet have an action defined yet + return EESSITaskAction.ADD @log_function_entry_exit() def _state_file_with_prefix_exists_in_repo_branch(self, file_path_prefix: str, branch_name: str = None) -> bool: From d7952d8dc006bc6ace0a3d3da85a0cff652e2ce2 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 6 Aug 2025 14:01:57 +0200 Subject: [PATCH 24/26] process reprod dirs when generating tarball overview --- .../automated_ingestion/eessi_task_payload.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py index fe0db162..26d56ef9 100644 --- a/scripts/automated_ingestion/eessi_task_payload.py +++ b/scripts/automated_ingestion/eessi_task_payload.py @@ -69,25 +69,31 @@ def analyse_contents(self, config: Dict) -> str: swdirs = [ # all directory names with the pattern: /software// member.path for member in members - if member.isdir() and PurePosixPath(member.path).match(os.path.join(prefix, "software", "*", "*")) + if member.isdir() and PurePosixPath(member.path).match(os.path.join(prefix, 'software', '*', '*')) ] modfiles = [ # all filenames with the pattern: /modules///*.lua member.path for member in members if member.isfile() - and PurePosixPath(member.path).match(os.path.join(prefix, "modules", "*", "*", "*.lua")) + and PurePosixPath(member.path).match(os.path.join(prefix, 'modules', '*', '*', '*.lua')) ] - other = [ # anything that is not in /software nor /modules + reprod_dirs = [ + member.path + for member in members + if member.isdir() and PurePosixPath(member.path).match(os.path.join(prefix, 'reprod', '*', '*', '*')) + ] + other = [ # anything that is not in /software nor /modules nor /reprod member.path for member in members if ( - not PurePosixPath(prefix).joinpath("software") in PurePosixPath(member.path).parents - and not PurePosixPath(prefix).joinpath("modules") in PurePosixPath(member.path).parents + not PurePosixPath(prefix).joinpath('software') in PurePosixPath(member.path).parents + and not PurePosixPath(prefix).joinpath('modules') in PurePosixPath(member.path).parents + and not PurePosixPath(prefix).joinpath('reprod') in PurePosixPath(member.path).parents ) # if not fnmatch.fnmatch(m.path, os.path.join(prefix, 'software', '*')) # and not fnmatch.fnmatch(m.path, os.path.join(prefix, 'modules', '*')) ] - members_list = sorted(swdirs + modfiles + other) + members_list = sorted(swdirs + modfiles + reprod_dirs + other) # construct the overview overview = config["github"]["task_summary_payload_overview_template"].format( 
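Illustrative note on the reprod matching added above (not part of the patches): PurePosixPath.match() with a relative pattern anchors on the right, so a tarball member only ends up in reprod_dirs when it sits exactly three levels below <prefix>/reprod. A small sketch with a made-up prefix:

    import os
    from pathlib import PurePosixPath

    prefix = "2023.06/software/linux/x86_64/generic"  # hypothetical prefix, for illustration only
    pattern = os.path.join(prefix, "reprod", "*", "*", "*")

    # three components below <prefix>/reprod -> matches
    PurePosixPath(prefix + "/reprod/GROMACS/2024.1/easybuild").match(pattern)   # True

    # too shallow -> does not match, so it would not be listed as a reprod dir
    PurePosixPath(prefix + "/reprod/GROMACS").match(pattern)                    # False
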
From 80897fc86a231103d4caa448c9c96bc46a4ab7ea Mon Sep 17 00:00:00 2001
From: Thomas Roeblitz
Date: Wed, 6 Aug 2025 14:05:31 +0200
Subject: [PATCH 25/26] remove unused type

---
 scripts/automated_ingestion/eessi_task.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index bd2b2320..86fbd8df 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -1,6 +1,6 @@
 from enum import Enum, auto
 from functools import total_ordering
-from typing import Dict, List, Tuple, Optional, Any
+from typing import Dict, List, Optional, Any
 import base64
 import os

From 76b3d3434cacce46336bc02df91874bd6aecf88d Mon Sep 17 00:00:00 2001
From: Thomas Roeblitz
Date: Fri, 8 Aug 2025 10:12:20 +0200
Subject: [PATCH 26/26] reduce limit to less than 3

---
 scripts/automated_ingestion/eessi_task_payload.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py
index 26d56ef9..112fcfd1 100644
--- a/scripts/automated_ingestion/eessi_task_payload.py
+++ b/scripts/automated_ingestion/eessi_task_payload.py
@@ -48,7 +48,11 @@ def analyse_contents(self, config: Dict) -> str:
         tar_num_members = len(members)
         paths = sorted([m.path for m in members])

-        if tar_num_members < 100:
+        # reduce the limit for a full listing from 100 to 3: the PR description can
+        # include tens of tarballs, so even 100 entries per tarball may be too many;
+        # a very small limit is still useful when a tarball contains only a handful
+        # of files, say an architecture-specific configuration file
+        if tar_num_members < 3:
             tar_members_desc = "Full listing of the contents of the tarball:"
             members_list = paths