From 9ceaf6eb0007ebf36918dbb0e4055a07486d2444 Mon Sep 17 00:00:00 2001
From: Thomas Roeblitz
Date: Sun, 29 Jun 2025 13:19:45 +0200
Subject: [PATCH 01/26] add CI for pytests and code style checks

---
 .flake8                                       | 14 ++++++
 .github/workflows/test-ingest-python-code.yml | 49 +++++++++++++++++++
 scripts/automated_ingestion/pytest.sh         | 10 ++++
 3 files changed, 73 insertions(+)
 create mode 100644 .flake8
 create mode 100644 .github/workflows/test-ingest-python-code.yml
 create mode 100644 scripts/automated_ingestion/pytest.sh

diff --git a/.flake8 b/.flake8
new file mode 100644
index 00000000..b6b309e3
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,14 @@
+# This file is part of the EESSI filesystem layer,
+# see https://github.com/EESSI/filesystem-layer
+#
+# author: Thomas Roeblitz (@trz42)
+#
+# license: GPLv2
+#
+
+[flake8]
+max-line-length = 120
+
+# ignore "Black would make changes" produced by flake8-black
+# see also https://github.com/houndci/hound/issues/1769
+extend-ignore = BLK100
diff --git a/.github/workflows/test-ingest-python-code.yml b/.github/workflows/test-ingest-python-code.yml
new file mode 100644
index 00000000..3ed0e692
--- /dev/null
+++ b/.github/workflows/test-ingest-python-code.yml
@@ -0,0 +1,49 @@
+# This file is part of the EESSI filesystem layer,
+# see https://github.com/EESSI/filesystem-layer
+#
+# author: Thomas Roeblitz (@trz42)
+#
+# license: GPLv2
+#
+
+name: Run tests
+on: [push, pull_request]
+# Declare default permissions as read only.
+permissions: read-all
+jobs:
+  test:
+    runs-on: ubuntu-24.04
+    strategy:
+      matrix:
+        # for now, only test with Python 3.9+ (since we're testing in Ubuntu 24.04)
+        #python: [3.6, 3.7, 3.8, 3.9, '3.10', '3.11']
+        python: ['3.9', '3.10', '3.11']
+      fail-fast: false
+    steps:
+      - name: checkout
+        uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0
+
+      - name: set up Python
+        uses: actions/setup-python@13ae5bb136fac2878aff31522b9efb785519f984 # v4.3.0
+        with:
+          python-version: ${{matrix.python}}
+
+      - name: Install required Python packages + pytest + flake8
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install -r requirements.txt
+          python -m pip install pytest
+          python -m pip install --upgrade flake8
+
+      - name: Run test suite (without coverage)
+        run: |
+          ./scripts/automated_ingestion/pytest.sh scripts/automated_ingestion -verbose
+
+      - name: Run test suite (with coverage)
+        run: |
+          python -m pip install pytest-cov
+          ./scripts/automated_ingestion/pytest.sh scripts/automated_ingestion -q --cov=$PWD
+
+      - name: Run flake8 to verify PEP8-compliance of Python code
+        run: |
+          flake8 scripts/automated_ingestion
\ No newline at end of file
diff --git a/scripts/automated_ingestion/pytest.sh b/scripts/automated_ingestion/pytest.sh
new file mode 100644
index 00000000..f8b4e170
--- /dev/null
+++ b/scripts/automated_ingestion/pytest.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+#
+# This file is part of the EESSI filesystem layer,
+# see https://github.com/EESSI/filesystem-layer
+#
+# author: Thomas Roeblitz (@trz42)
+#
+# license: GPLv2
+#
+PYTHONPATH=$PWD:$PYTHONPATH pytest --capture=no "$@"
\ No newline at end of file
From e974676f6d3fbbf5c13c0467b9ab4cb98ba87e6a Mon Sep 17 00:00:00 2001
From: Thomas Roeblitz
Date: Sun, 29 Jun 2025 13:23:36 +0200
Subject: [PATCH 02/26] exclude existing *.py files from flake8 tests

---
 .github/workflows/test-ingest-python-code.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test-ingest-python-code.yml b/.github/workflows/test-ingest-python-code.yml
index 3ed0e692..da6d28d6 100644
--- a/.github/workflows/test-ingest-python-code.yml
+++ b/.github/workflows/test-ingest-python-code.yml
@@ -46,4 +46,4 @@ jobs:
 
       - name: Run flake8 to verify PEP8-compliance of Python code
         run: |
-          flake8 scripts/automated_ingestion
\ No newline at end of file
+          flake8 scripts/automated_ingestion --exclude=scripts/automated_ingestion/automated_ingestion.py,scripts/automated_ingestion/eessitarball.py
\ No newline at end of file
From c213d704c67bb117e6b5f9bd4c533ca0bb5a4e55 Mon Sep 17 00:00:00 2001
From: Thomas Roeblitz
Date: Sun, 29 Jun 2025 13:24:23 +0200
Subject: [PATCH 03/26] change permission for pytest.sh script

---
 scripts/automated_ingestion/pytest.sh | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 scripts/automated_ingestion/pytest.sh

diff --git a/scripts/automated_ingestion/pytest.sh b/scripts/automated_ingestion/pytest.sh
old mode 100644
new mode 100755
From cfce28d39b8d0f8ef64dfcddd7239283de35a5b4 Mon Sep 17 00:00:00 2001
From: Thomas Roeblitz
Date: Sun, 29 Jun 2025 13:26:31 +0200
Subject: [PATCH 04/26] logging functions in separate module

---
 scripts/automated_ingestion/eessi_logging.py | 246 +++++++++++++++++++
 1 file changed, 246 insertions(+)
 create mode 100644 scripts/automated_ingestion/eessi_logging.py

diff --git a/scripts/automated_ingestion/eessi_logging.py b/scripts/automated_ingestion/eessi_logging.py
new file mode 100644
index 00000000..ab5947c8
--- /dev/null
+++ b/scripts/automated_ingestion/eessi_logging.py
@@ -0,0 +1,246 @@
+import functools
+import inspect
+import logging
+import os
+import sys
+import time
+
+from enum import IntFlag, auto
+from typing import Callable, Union
+
+
+class LoggingScope(IntFlag):
+    """Enumeration of different logging scopes."""
+    NONE = 0
+    FUNC_ENTRY_EXIT = auto()  # Function entry/exit logging
+    DOWNLOAD = auto()  # Logging related to file downloads
+    VERIFICATION = auto()  # Logging related to signature and checksum verification
+    STATE_OPS = auto()  # Logging related to tarball state operations
+    GITHUB_OPS = auto()  # Logging related to GitHub operations (PRs, issues, etc.)
+    GROUP_OPS = auto()  # Logging related to tarball group operations
+    TASK_OPS = auto()  # Logging related to task operations
+    ERROR = auto()  # Error logging (separate from other scopes for easier filtering)
+    DEBUG = auto()  # Debug-level logging (separate from other scopes for easier filtering)
+    ALL = (FUNC_ENTRY_EXIT | DOWNLOAD | VERIFICATION | STATE_OPS |
+           GITHUB_OPS | GROUP_OPS | TASK_OPS | ERROR | DEBUG)
+
+
+# Global setting for logging scopes
+ENABLED_LOGGING_SCOPES = LoggingScope.NONE
+
+
+# Global variable to track call stack depth
+_call_stack_depth = 0
+
+
+def is_logging_scope_enabled(scope: LoggingScope) -> bool:
+    """Check if a specific logging scope is enabled."""
+    return bool(ENABLED_LOGGING_SCOPES & scope)
+
+
+def log_function_entry_exit(logger: logging.Logger = None) -> Callable:
+    """
+    Decorator that logs function entry and exit with timing information.
+    Only logs if the FUNC_ENTRY_EXIT scope is enabled.
+
+    Args:
+        logger: Optional logger instance. If not provided, uses the module's logger.
+ """ + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + global _call_stack_depth + + if not is_logging_scope_enabled(LoggingScope.FUNC_ENTRY_EXIT): + return func(*args, **kwargs) + + if logger is None: + log = logging.getLogger(func.__module__) + else: + log = logger + + # Get context information if available + context = "" + if len(args) > 0 and hasattr(args[0], 'object'): + # For EessiTarball methods, show the tarball name and state + tarball = args[0] + filename = os.path.basename(tarball.object) + + # Format filename to show important parts + if len(filename) > 30: + parts = filename.split('-') + if len(parts) >= 6: # Ensure we have all required parts + # Get version, component, last part of architecture, and epoch + version = parts[1] + component = parts[2] + arch_last = parts[-2].split('-')[-1] # Last part of architecture + epoch = parts[-1] # includes file extension + filename = f"{version}-{component}-{arch_last}-{epoch}" + else: + # Fallback to simple truncation if format doesn't match + filename = f"{filename[:15]}...{filename[-12:]}" + + context = f" [{filename}" + if hasattr(tarball, 'state'): + context += f" in {tarball.state}" + context += "]" + + # Create indentation based on call stack depth + indent = " " * _call_stack_depth + + # Get file name and line number where the function is defined + file_name = os.path.basename(inspect.getsourcefile(func)) + source_lines, start_line = inspect.getsourcelines(func) + # Find the line with the actual function definition + def_line = next(i for i, line in enumerate(source_lines) if line.strip().startswith('def ')) + def_line_no = start_line + def_line + # Find the last non-empty line of the function + last_line = next(i for i, line in enumerate(reversed(source_lines)) if line.strip()) + last_line_no = start_line + len(source_lines) - 1 - last_line + + start_time = time.time() + log.info(f"{indent}[FUNC_ENTRY_EXIT] Entering {func.__name__} at {file_name}:{def_line_no}{context}") + _call_stack_depth += 1 + try: + result = func(*args, **kwargs) + _call_stack_depth -= 1 + end_time = time.time() + # For normal returns, show the last line of the function + log.info(f"{indent}[FUNC_ENTRY_EXIT] Leaving {func.__name__} at {file_name}:{last_line_no}" + f"{context} (took {end_time - start_time:.2f}s)") + return result + except Exception as err: + _call_stack_depth -= 1 + end_time = time.time() + # For exceptions, try to get the line number from the exception + try: + exc_line_no = err.__traceback__.tb_lineno + except AttributeError: + exc_line_no = last_line_no + log.info(f"{indent}[FUNC_ENTRY_EXIT] Leaving {func.__name__} at {file_name}:{exc_line_no}" + f"{context} with exception (took {end_time - start_time:.2f}s)") + raise err + return wrapper + return decorator + + +def log_message(scope, level, msg, *args, logger=None, **kwargs): + """ + Log a message if either: + 1. The specified scope is enabled, OR + 2. The current log level is equal to or higher than the specified level + + Args: + scope: LoggingScope value indicating which scope this logging belongs to + level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) + msg: Message to log + logger: Optional logger instance. If not provided, uses the root logger. 
+ *args, **kwargs: Additional arguments to pass to the logging function + """ + log = logger or logging.getLogger() + log_level = getattr(logging, level.upper()) + + # Check if either condition is met + if not (is_logging_scope_enabled(scope) or log_level >= log.getEffectiveLevel()): + return + + # Create indentation based on call stack depth + indent = " " * _call_stack_depth + # Add scope to the message + scoped_msg = f"[{scope.name}] {msg}" + indented_msg = f"{indent}{scoped_msg}" + + # If scope is enabled, use the temporary handler + if is_logging_scope_enabled(scope): + # Save original handlers + original_handlers = list(log.handlers) + + # Create a temporary handler that accepts all levels + temp_handler = logging.StreamHandler(sys.stdout) + temp_handler.setLevel(logging.DEBUG) + temp_handler.setFormatter(logging.Formatter('%(levelname)-8s: %(message)s')) + + try: + # Remove existing handlers temporarily + for handler in original_handlers: + log.removeHandler(handler) + + # Add temporary handler + log.addHandler(temp_handler) + + # Log the message + log_func = getattr(log, level.lower()) + log_func(indented_msg, *args, **kwargs) + finally: + log.removeHandler(temp_handler) + # Restore original handlers + for handler in original_handlers: + if handler not in log.handlers: + log.addHandler(handler) + # Only use normal logging if scope is not enabled AND level is high enough + elif not is_logging_scope_enabled(scope) and log_level >= log.getEffectiveLevel(): + # Use normal logging with level check + log_func = getattr(log, level.lower()) + log_func(indented_msg, *args, **kwargs) + + +def set_logging_scopes(scopes: Union[LoggingScope, str, list[str]]) -> None: + """ + Set the enabled logging scopes. + + Args: + scopes: Can be + - A LoggingScope value + - A string with comma-separated values using +/- syntax: + - "+SCOPE" to enable a scope + - "-SCOPE" to disable a scope + - "ALL" or "+ALL" to enable all scopes + - "-ALL" to disable all scopes + Examples: + "+FUNC_ENTRY_EXIT" # Enable only function entry/exit + "+FUNC_ENTRY_EXIT,-EXAMPLE_SCOPE" # Enable function entry/exit but disable example + "+ALL,-FUNC_ENTRY_EXIT" # Enable all scopes except function entry/exit + """ + global ENABLED_LOGGING_SCOPES + + if isinstance(scopes, LoggingScope): + ENABLED_LOGGING_SCOPES = scopes + return + + if isinstance(scopes, str): + # Start with no scopes enabled + ENABLED_LOGGING_SCOPES = LoggingScope.NONE + + # Split into individual scope specifications + scope_specs = [s.strip() for s in scopes.split(",")] + + for spec in scope_specs: + if not spec: + continue + + # Check for ALL special case + if spec.upper() in ["ALL", "+ALL"]: + ENABLED_LOGGING_SCOPES = LoggingScope.ALL + continue + elif spec.upper() == "-ALL": + ENABLED_LOGGING_SCOPES = LoggingScope.NONE + continue + + # Parse scope name and operation + operation = spec[0] + scope_name = spec[1:].strip().upper() + + try: + scope_enum = LoggingScope[scope_name] + if operation == '+': + ENABLED_LOGGING_SCOPES |= scope_enum + elif operation == '-': + ENABLED_LOGGING_SCOPES &= ~scope_enum + else: + logging.warning(f"Invalid operation '{operation}' in scope specification: {spec}") + except KeyError: + logging.warning(f"Unknown logging scope: {scope_name}") + + elif isinstance(scopes, list): + # Convert list to comma-separated string and process + set_logging_scopes(",".join(scopes)) From 1d2c17f3db773c75789d91b1b091635763a9fda4 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 13:53:17 +0200 Subject: [PATCH 05/26] add relative path 
to requirements.txt --- .github/workflows/test-ingest-python-code.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-ingest-python-code.yml b/.github/workflows/test-ingest-python-code.yml index da6d28d6..c0fc979c 100644 --- a/.github/workflows/test-ingest-python-code.yml +++ b/.github/workflows/test-ingest-python-code.yml @@ -31,7 +31,7 @@ jobs: - name: Install required Python packages + pytest + flake8 run: | python -m pip install --upgrade pip - python -m pip install -r requirements.txt + python -m pip install -r scripts/automated_ingestion/requirements.txt python -m pip install pytest python -m pip install --upgrade flake8 From 8d0bb3527908d1703d97a124476a6fc9abcb46c3 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 14:19:20 +0200 Subject: [PATCH 06/26] fix test issues and limit coverage to new python code --- .github/workflows/test-ingest-python-code.yml | 4 ++-- .gitignore | 1 + scripts/automated_ingestion/.coveragerc | 5 +++++ 3 files changed, 8 insertions(+), 2 deletions(-) create mode 100644 scripts/automated_ingestion/.coveragerc diff --git a/.github/workflows/test-ingest-python-code.yml b/.github/workflows/test-ingest-python-code.yml index c0fc979c..9e341783 100644 --- a/.github/workflows/test-ingest-python-code.yml +++ b/.github/workflows/test-ingest-python-code.yml @@ -37,12 +37,12 @@ jobs: - name: Run test suite (without coverage) run: | - ./scripts/automated_ingestion/pytest.sh scripts/automated_ingestion -verbose + ./scripts/automated_ingestion/pytest.sh scripts/automated_ingestion --verbose - name: Run test suite (with coverage) run: | python -m pip install pytest-cov - ./scripts/automated_ingestion/pytest.sh scripts/automated_ingestion -q --cov=$PWD + ./scripts/automated_ingestion/pytest.sh scripts/automated_ingestion -q --cov=scripts/automated_ingestion/eessi_logging.py - name: Run flake8 to verify PEP8-compliance of Python code run: | diff --git a/.gitignore b/.gitignore index 39af2bac..7789e614 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ build hosts +.coverage diff --git a/scripts/automated_ingestion/.coveragerc b/scripts/automated_ingestion/.coveragerc new file mode 100644 index 00000000..ec1f100a --- /dev/null +++ b/scripts/automated_ingestion/.coveragerc @@ -0,0 +1,5 @@ +[run] +omit = + scripts/automated_ingestion/automated_ingestion.py + scripts/automated_ingestion/eessitarball.py + scripts/automated_ingestion/utils.py From 0ede3a8d2a8d5918a9eb69542bf44068847e3ca3 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 15:28:08 +0200 Subject: [PATCH 07/26] add basic tests to avoid coverage errors --- .gitignore | 1 + .../unit_tests/__init__.py | 1 + .../unit_tests/test_basic.py | 27 +++++++++++++++++++ 3 files changed, 29 insertions(+) create mode 100644 scripts/automated_ingestion/unit_tests/__init__.py create mode 100644 scripts/automated_ingestion/unit_tests/test_basic.py diff --git a/.gitignore b/.gitignore index 7789e614..893c00e4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ build hosts .coverage +**/__pycache__ diff --git a/scripts/automated_ingestion/unit_tests/__init__.py b/scripts/automated_ingestion/unit_tests/__init__.py new file mode 100644 index 00000000..467d5dfe --- /dev/null +++ b/scripts/automated_ingestion/unit_tests/__init__.py @@ -0,0 +1 @@ +# This file makes the unit_tests directory a Python package diff --git a/scripts/automated_ingestion/unit_tests/test_basic.py b/scripts/automated_ingestion/unit_tests/test_basic.py new file mode 100644 index 
00000000..7e382cbd --- /dev/null +++ b/scripts/automated_ingestion/unit_tests/test_basic.py @@ -0,0 +1,27 @@ +""" +Basic test file to prevent pytest from failing with exit code 5 when no tests are found. + +This file is part of the EESSI filesystem layer, +see https://github.com/EESSI/filesystem-layer + +author: Thomas Roeblitz (@trz42) + +license: GPLv2 +""" + +import pytest + + +def test_basic_placeholder(): + """Basic placeholder test that always passes.""" + assert True + + +def test_import_modules(): + """Test that we can import the main modules without errors.""" + try: + import eessi_logging + # Verify the modules were imported successfully + assert eessi_logging is not None + except ImportError as err: + pytest.skip(f"Module import failed: {err}") From 712c48c1c3cb043b6026a5ac7d8a11cada0da964 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 15:30:26 +0200 Subject: [PATCH 08/26] skip flake8 for existing files and unit tests --- .flake8 | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.flake8 b/.flake8 index b6b309e3..15beaa74 100644 --- a/.flake8 +++ b/.flake8 @@ -7,8 +7,15 @@ # [flake8] -max-line-length = 120 +exclude = + scripts/check-stratum-servers.py, + scripts/automated_ingestion/automated_ingestion.py, + scripts/automated_ingestion/eessitarball.py, + scripts/automated_ingestion/utils.py, + scripts/automated_ingestion/unit_tests/*.py # ignore "Black would make changes" produced by flake8-black # see also https://github.com/houndci/hound/issues/1769 extend-ignore = BLK100 + +max-line-length = 120 From 7c7d673e571382b162e32a55bb5d44c5009aa4a5 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 15:30:49 +0200 Subject: [PATCH 09/26] skip coverage for unit tests --- scripts/automated_ingestion/.coveragerc | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/automated_ingestion/.coveragerc b/scripts/automated_ingestion/.coveragerc index ec1f100a..2941a1ed 100644 --- a/scripts/automated_ingestion/.coveragerc +++ b/scripts/automated_ingestion/.coveragerc @@ -3,3 +3,4 @@ omit = scripts/automated_ingestion/automated_ingestion.py scripts/automated_ingestion/eessitarball.py scripts/automated_ingestion/utils.py + scripts/automated_ingestion/unit_tests/*.py From eea35ae74e7fbdedd1c9d1c6428ae0f37269679d Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 15:42:35 +0200 Subject: [PATCH 10/26] include unit tests in flake8 run --- .flake8 | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.flake8 b/.flake8 index 15beaa74..852c375b 100644 --- a/.flake8 +++ b/.flake8 @@ -11,8 +11,7 @@ exclude = scripts/check-stratum-servers.py, scripts/automated_ingestion/automated_ingestion.py, scripts/automated_ingestion/eessitarball.py, - scripts/automated_ingestion/utils.py, - scripts/automated_ingestion/unit_tests/*.py + scripts/automated_ingestion/utils.py # ignore "Black would make changes" produced by flake8-black # see also https://github.com/houndci/hound/issues/1769 From 34180df14371e98132e5302d17fc6c2440d7e9eb Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 15:53:49 +0200 Subject: [PATCH 11/26] move LOG_LEVELS and error func to logging module --- scripts/automated_ingestion/eessi_logging.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/scripts/automated_ingestion/eessi_logging.py b/scripts/automated_ingestion/eessi_logging.py index ab5947c8..857d92ca 100644 --- a/scripts/automated_ingestion/eessi_logging.py +++ b/scripts/automated_ingestion/eessi_logging.py 
@@ -9,6 +9,15 @@ from typing import Callable, Union +LOG_LEVELS = { + 'DEBUG': logging.DEBUG, + 'INFO': logging.INFO, + 'WARNING': logging.WARNING, + 'ERROR': logging.ERROR, + 'CRITICAL': logging.CRITICAL +} + + class LoggingScope(IntFlag): """Enumeration of different logging scopes.""" NONE = 0 @@ -33,6 +42,12 @@ class LoggingScope(IntFlag): _call_stack_depth = 0 +def error(msg, code=1): + """Print an error and exit.""" + log_message(LoggingScope.ERROR, 'ERROR', msg) + sys.exit(code) + + def is_logging_scope_enabled(scope: LoggingScope) -> bool: """Check if a specific logging scope is enabled.""" return bool(ENABLED_LOGGING_SCOPES & scope) From 397eca8a1fc9a67526c5b8ebda2fc6215d0ef0f3 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 16:53:49 +0200 Subject: [PATCH 12/26] use shared pid lock file to ensure at most one ingest is active --- scripts/automated_ingestion/automated_ingestion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 92dac552..7a7f9dc8 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -81,7 +81,7 @@ def parse_args(): return args -@pid.decorator.pidfile('automated_ingestion.pid') +@pidfile('shared_lock.pid') # noqa: F401 def main(): """Main function.""" args = parse_args() From a508c9e15987c00767d632a91a20573ee16f1507 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 16:55:17 +0200 Subject: [PATCH 13/26] add main script and modules to list task files --- .../eessi_remote_storage_client.py | 34 +++ .../automated_ingestion/eessi_s3_bucket.py | 191 +++++++++++++ scripts/automated_ingestion/ingest_bundles.py | 264 ++++++++++++++++++ 3 files changed, 489 insertions(+) create mode 100644 scripts/automated_ingestion/eessi_remote_storage_client.py create mode 100644 scripts/automated_ingestion/eessi_s3_bucket.py create mode 100644 scripts/automated_ingestion/ingest_bundles.py diff --git a/scripts/automated_ingestion/eessi_remote_storage_client.py b/scripts/automated_ingestion/eessi_remote_storage_client.py new file mode 100644 index 00000000..9f83d721 --- /dev/null +++ b/scripts/automated_ingestion/eessi_remote_storage_client.py @@ -0,0 +1,34 @@ +from enum import Enum +from typing import Protocol, runtime_checkable + + +class DownloadMode(Enum): + """Enum defining different modes for downloading files.""" + FORCE = 'force' # Always download and overwrite + CHECK_REMOTE = 'check-remote' # Download if remote files have changed + CHECK_LOCAL = 'check-local' # Download if files don't exist locally (default) + + +@runtime_checkable +class EESSIRemoteStorageClient(Protocol): + """Protocol defining the interface for remote storage clients.""" + + def get_metadata(self, remote_path: str) -> dict: + """Get metadata about a remote object. + + Args: + remote_path: Path to the object in remote storage + + Returns: + Dictionary containing object metadata, including 'ETag' key + """ + ... + + def download(self, remote_path: str, local_path: str) -> None: + """Download a remote file to a local location. + + Args: + remote_path: Path to the object in remote storage + local_path: Local path where to save the file + """ + ... 
diff --git a/scripts/automated_ingestion/eessi_s3_bucket.py b/scripts/automated_ingestion/eessi_s3_bucket.py new file mode 100644 index 00000000..bc5a8822 --- /dev/null +++ b/scripts/automated_ingestion/eessi_s3_bucket.py @@ -0,0 +1,191 @@ +import os +from pathlib import Path +from typing import Dict, Optional + +import boto3 +from botocore.exceptions import ClientError +from eessi_logging import log_function_entry_exit, log_message, LoggingScope +from eessi_remote_storage_client import EESSIRemoteStorageClient + + +class EESSIS3Bucket(EESSIRemoteStorageClient): + """EESSI-specific S3 bucket implementation of the EESSIRemoteStorageClient protocol.""" + + @log_function_entry_exit() + def __init__(self, config, bucket_name: str): + """ + Initialize the EESSI S3 bucket. + + Args: + config: Configuration object containing: + - aws.access_key_id: AWS access key ID (optional, can use AWS_ACCESS_KEY_ID env var) + - aws.secret_access_key: AWS secret access key (optional, can use AWS_SECRET_ACCESS_KEY env var) + - aws.endpoint_url: Custom endpoint URL for S3-compatible backends (optional) + - aws.verify: SSL verification setting (optional) + - True: Verify SSL certificates (default) + - False: Skip SSL certificate verification + - str: Path to CA bundle file + bucket_name: Name of the S3 bucket to use + """ + self.bucket = bucket_name + + # get AWS credentials from environment or config + aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID") or config.get("secrets", "aws_access_key_id") + aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY") or config.get("secrets", "aws_secret_access_key") + + # configure boto3 client + client_config = {} + + # add endpoint URL if specified in config + if config.has_option("aws", "endpoint_url"): + client_config["endpoint_url"] = config["aws"]["endpoint_url"] + log_message(LoggingScope.DEBUG, "DEBUG", "Using custom endpoint URL: '%s'", client_config["endpoint_url"]) + + # add SSL verification if specified in config + if config.has_option("aws", "verify"): + verify = config["aws"]["verify"] + if verify.lower() == "false": + client_config["verify"] = False + log_message(LoggingScope.DEBUG, "WARNING", "SSL verification disabled") + elif verify.lower() == "true": + client_config["verify"] = True + else: + client_config["verify"] = verify # assume it's a path to CA bundle + log_message(LoggingScope.DEBUG, "DEBUG", "Using custom CA bundle: '%s'", verify) + + self.client = boto3.client( + "s3", + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + **client_config + ) + log_message(LoggingScope.DEBUG, "INFO", "Initialized S3 client for bucket: '%s'", self.bucket) + + @log_function_entry_exit() + def download(self, remote_path: str, local_path: str) -> None: + """ + Download an S3 object to a local location and store its ETag. 
+ + Args: + remote_path: Path to the object in S3 + local_path: Local path where to save the file + """ + try: + log_message(LoggingScope.DOWNLOAD, "INFO", "Downloading '%s' to '%s'", remote_path, local_path) + self.client.download_file(Bucket=self.bucket, Key=remote_path, Filename=local_path) + log_message(LoggingScope.DOWNLOAD, "INFO", "Successfully downloaded '%s' to '%s'", remote_path, local_path) + except ClientError as err: + log_message(LoggingScope.ERROR, "ERROR", "Failed to download '%s': '%s'", remote_path, str(err)) + raise + + # get metadata first to obtain the ETag + metadata = self.get_metadata(remote_path) + etag = metadata["ETag"] + + # store the ETag + self._write_etag(local_path, etag) + + @log_function_entry_exit() + def download_file(self, key: str, filename: str) -> None: + """ + Download a file from S3 to a local file. + + Args: + key: The S3 key of the file to download + filename: The local path where the file should be saved + """ + self.client.download_file(self.bucket, key, filename) + + @log_function_entry_exit() + def get_bucket_url(self) -> str: + """ + Get the HTTPS URL for a bucket from an initialized boto3 client. + Works with both AWS S3 and MinIO/S3-compatible services. + """ + try: + # check if this is a custom endpoint (MinIO) or AWS S3 + endpoint_url = self.client.meta.endpoint_url + + if endpoint_url: + # custom endpoint (MinIO, DigitalOcean Spaces, etc.) + # most S3-compatible services use path-style URLs + bucket_url = f"{endpoint_url}/{self.bucket}" + else: + # AWS S3 (no custom endpoint specified) + region = self.client.meta.region_name or 'us-east-1' + + # AWS S3 virtual-hosted-style URLs + if region == "us-east-1": + bucket_url = f"https://{self.bucket}.s3.amazonaws.com" + else: + bucket_url = f"https://{self.bucket}.s3.{region}.amazonaws.com" + + return bucket_url + + except Exception as err: + log_message(LoggingScope.ERROR, "ERROR", "Error getting bucket URL: '%s'", str(err)) + return None + + @log_function_entry_exit() + def get_metadata(self, remote_path: str) -> Dict: + """ + Get metadata about an S3 object. + + Args: + remote_path: Path to the object in S3 + + Returns: + Dictionary containing object metadata, including 'ETag' key + """ + try: + log_message(LoggingScope.DEBUG, "DEBUG", "Getting metadata for S3 object: '%s'", remote_path) + response = self.client.head_object(Bucket=self.bucket, Key=remote_path) + log_message(LoggingScope.DEBUG, "DEBUG", "Retrieved metadata for '%s': '%s'", remote_path, response) + return response + except ClientError as err: + log_message(LoggingScope.ERROR, "ERROR", "Failed to get metadata for '%s': '%s'", remote_path, str(err)) + raise + + @log_function_entry_exit() + def _get_etag_file_path(self, local_path: str) -> Path: + """Get the path to the .etag file for a given local file.""" + return Path(local_path).with_suffix(".etag") + + @log_function_entry_exit() + def list_objects_v2(self, **kwargs): + """ + List objects in the bucket using the underlying boto3 client. 
+ + Args: + **kwargs: Additional arguments to pass to boto3.client.list_objects_v2 + + Returns: + Response from boto3.client.list_objects_v2 + """ + return self.client.list_objects_v2(Bucket=self.bucket, **kwargs) + + @log_function_entry_exit() + def _read_etag(self, local_path: str) -> Optional[str]: + """Read the ETag from the .etag file if it exists.""" + etag_path = self._get_etag_file_path(local_path) + if etag_path.exists(): + try: + with open(etag_path, "r") as f: + return f.read().strip() + except Exception as e: + log_message(LoggingScope.DEBUG, "WARNING", "Failed to read ETag file '%s': '%s'", etag_path, str(e)) + return None + return None + + @log_function_entry_exit() + def _write_etag(self, local_path: str, etag: str) -> None: + """Write the ETag to the .etag file.""" + etag_path = self._get_etag_file_path(local_path) + try: + with open(etag_path, "w") as f: + f.write(etag) + log_message(LoggingScope.DEBUG, "DEBUG", "Wrote ETag to '%s'", etag_path) + except Exception as err: + log_message(LoggingScope.ERROR, "ERROR", "Failed to write ETag file '%s': '%s'", etag_path, str(err)) + # if we can't write the etag file, it's not critical + # the file will just be downloaded again next time diff --git a/scripts/automated_ingestion/ingest_bundles.py b/scripts/automated_ingestion/ingest_bundles.py new file mode 100644 index 00000000..f8131213 --- /dev/null +++ b/scripts/automated_ingestion/ingest_bundles.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python3 + +# from eessi_data_object import EESSIDataAndSignatureObject +# from eessi_task import EESSITask, TaskState +# from eessi_task_description import EESSITaskDescription +from eessi_s3_bucket import EESSIS3Bucket +from eessi_logging import error, log_function_entry_exit, log_message, LoggingScope, LOG_LEVELS, set_logging_scopes +from pid.decorator import pidfile # noqa: F401 +from pid import PidFileError + +import argparse +import configparser +# import github +import json +import logging +# import os +import sys +from pathlib import Path +from typing import List + +REQUIRED_CONFIG = { + "secrets": ["aws_secret_access_key", "aws_access_key_id", "github_pat"], + "paths": ["download_dir", "ingestion_script", "metadata_file_extension"], + "aws": ["staging_buckets"], + "github": ["staging_repo", "failed_ingestion_issue_body", "pr_body"], +} + + +@log_function_entry_exit() +def parse_config(path): + """Parse the configuration file.""" + config = configparser.ConfigParser() + try: + config.read(path) + except Exception as err: + error(f"Unable to read configuration file '{path}'!\nException: '{err}'") + + # check if all required configuration parameters/sections can be found + for section in REQUIRED_CONFIG.keys(): + if section not in config: + error(f"Missing section '{section}' in configuration file '{path}'.") + for item in REQUIRED_CONFIG[section]: + if item not in config[section]: + error(f"Missing configuration item '{item}' in section '{section}' of configuration file '{path}'.") + + return config + + +@log_function_entry_exit() +def parse_args(): + """Parse the command-line arguments.""" + parser = argparse.ArgumentParser() + + # logging options + logging_group = parser.add_argument_group("Logging options") + logging_group.add_argument("--log-file", + help="Path to log file (overrides config file setting)") + logging_group.add_argument("--console-level", + choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], + help="Logging level for console output (overrides config file setting)") + logging_group.add_argument("--file-level", + 
choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], + help="Logging level for file output (overrides config file setting)") + logging_group.add_argument("--quiet", + action="store_true", + help="Suppress console output (overrides all other console settings)") + logging_group.add_argument("--log-scopes", + help="Comma-separated list of logging scopes using +/- syntax. " + "Examples: '+FUNC_ENTRY_EXIT' (enable only function entry/exit), " + "'+ALL,-FUNC_ENTRY_EXIT' (enable all except function entry/exit), " + "'+FUNC_ENTRY_EXIT,-EXAMPLE_SCOPE' (enable function entry/exit but disable example)") + + # existing arguments + parser.add_argument("-c", "--config", type=str, help="path to configuration file", + default="ingest_bundles.cfg", dest="config") + parser.add_argument("-d", "--debug", help="enable debug mode", action="store_true", dest="debug") + parser.add_argument("-l", "--list", help="only list available tasks", action="store_true", dest="list_only") + parser.add_argument("--extensions", help="comma-separated list of extensions to process (default: .task)", + nargs="?", const=".task", default=False) + + return parser.parse_args() + + +@log_function_entry_exit() +def setup_logging(config: configparser.ConfigParser, args: argparse.Namespace) -> logging.Logger: + """ + Configure logging based on configuration file and command line arguments. + Command line arguments take precedence over config file settings. + + Args: + config: Configuration parser + args: Parsed command line arguments + + Returns: + Logger instance + """ + # get settings from config file + log_file = config["logging"].get("log_file") + config_console_level = LOG_LEVELS.get(config["logging"].get("console_level", "INFO").upper(), logging.INFO) + config_file_level = LOG_LEVELS.get(config["logging"].get("file_level", "DEBUG").upper(), logging.DEBUG) + + # override with command line arguments if provided + log_file = args.log_file if args.log_file else log_file + console_level = getattr(logging, args.console_level) if args.console_level else config_console_level + file_level = getattr(logging, args.file_level) if args.file_level else config_file_level + + # debug mode overrides console level + if args.debug: + console_level = logging.DEBUG + + # set up logging scopes + if args.log_scopes: + set_logging_scopes(args.log_scopes) + log_message(LoggingScope.DEBUG, "DEBUG", "Enabled logging scopes: '%s'", args.log_scopes) + + # create logger + logger = logging.getLogger() + logger.setLevel(logging.DEBUG) # set root logger to lowest level + + # create formatters + console_formatter = logging.Formatter("%(levelname)-8s: %(message)s") + file_formatter = logging.Formatter("%(asctime)s - %(levelname)-8s: %(message)s") + + # console handler (only if not quiet) + if not args.quiet: + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setLevel(console_level) + console_handler.setFormatter(console_formatter) + logger.addHandler(console_handler) + + # file handler (if log file is specified) + if log_file: + # ensure log directory exists + log_path = Path(log_file) + log_path.parent.mkdir(parents=True, exist_ok=True) + + file_handler = logging.FileHandler(log_file) + file_handler.setLevel(file_level) + file_handler.setFormatter(file_formatter) + logger.addHandler(file_handler) + + return logger + + +@pidfile("shared_lock.pid") # noqa: F401 +@log_function_entry_exit() +def main(): + """Main function.""" + args = parse_args() + config = parse_config(args.config) + _ = setup_logging(config, args) # noqa: F841 + + # TODO: 
check configuration: secrets, paths, permissions on dirs, etc + extensions = args.extensions.split(",") + # gh_pat = config["secrets"]["github_pat"] + # gh_staging_repo = github.Github(gh_pat).get_repo(config["github"]["staging_repo"]) + + buckets = json.loads(config["aws"]["staging_buckets"]) + for bucket, cvmfs_repo in buckets.items(): + # create our custom S3 bucket for this bucket + s3_bucket = EESSIS3Bucket(config, bucket) + + tasks = find_deployment_tasks(s3_bucket, extensions) + if args.list_only: + log_message(LoggingScope.GROUP_OPS, "INFO", "#tasks: %d", len(tasks)) + for num, task in enumerate(tasks): + log_message(LoggingScope.GROUP_OPS, "INFO", "[%s] %d: '%s'", bucket, num, task) + else: + # process each task file + for task_path in tasks: + log_message(LoggingScope.GROUP_OPS, "INFO", "Processing task: '%s'", task_path) + +# try: +# # Create EESSITask for the task file +# try: +# task = EESSITask( +# EESSITaskDescription(EESSIDataAndSignatureObject(config, task_path, s3_bucket)), +# config, cvmfs_repo, gh_staging_repo +# ) +# +# except Exception as err: +# log_message(LoggingScope.ERROR, "ERROR", "Failed to create EESSITask for task %s: %s", +# task_path, str(err)) +# continue +# +# log_message(LoggingScope.GROUP_OPS, "INFO", "Task: %s", task) +# +# previous_state = None +# current_state = task.determine_state() +# log_message(LoggingScope.GROUP_OPS, "INFO", "Task '%s' is in state '%s'", +# task_path, current_state.name) +# while (current_state is not None and +# current_state != TaskState.DONE and +# previous_state != current_state): +# previous_state = current_state +# log_message(LoggingScope.GROUP_OPS, "INFO", +# "Task '%s': BEFORE handle(): previous state = '%s', current state = '%s'", +# task_path, previous_state.name, current_state.name) +# current_state = task.handle() +# log_message(LoggingScope.GROUP_OPS, "INFO", +# "Task '%s': AFTER handle(): previous state = '%s', current state = '%s'", +# task_path, previous_state.name, current_state.name) +# +# except Exception as err: +# log_message(LoggingScope.ERROR, "ERROR", "Failed to process task %s: %s", task_path, str(err)) +# continue + + +@log_function_entry_exit() +def find_deployment_tasks(s3_bucket: EESSIS3Bucket, extensions: List[str] = None) -> List[str]: + """ + Return a list of all task files in an S3 bucket with the given extensions, + but only if a corresponding payload file exists (same name without extension). 
+ + Args: + s3_bucket: EESSIS3Bucket instance + extensions: List of file extensions to look for (default: ['.task']) + + Returns: + List of task filenames found in the bucket that have a corresponding payload + """ + if extensions is None: + extensions = [".task"] + + files = [] + continuation_token = None + + while True: + # list objects with pagination + if continuation_token: + response = s3_bucket.list_objects_v2( + ContinuationToken=continuation_token + ) + else: + response = s3_bucket.list_objects_v2() + + # add files from this page + files.extend([obj["Key"] for obj in response.get("Contents", [])]) + + # check if there are more pages + if response.get("IsTruncated"): + continuation_token = response.get("NextContinuationToken") + else: + break + + # create a set of all files for faster lookup + file_set = set(files) + + # return only task files that have a corresponding payload + result = [] + for file in files: + for ext in extensions: + if file.endswith(ext) and file[:-len(ext)] in file_set: + result.append(file) + break # found a matching extension, no need to check other extensions + + return result + + +if __name__ == "__main__": + try: + main() + except PidFileError as err: + error(f"Another instance of this script is already running! Error: '{err}'") From 9847d866c147f68db316fc24271731bdfb5045e0 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 17:24:21 +0200 Subject: [PATCH 14/26] add class to model a (remote) file and its signature --- .../automated_ingestion/eessi_data_object.py | 344 ++++++++++++++++++ scripts/automated_ingestion/ingest_bundles.py | 13 +- 2 files changed, 351 insertions(+), 6 deletions(-) create mode 100644 scripts/automated_ingestion/eessi_data_object.py diff --git a/scripts/automated_ingestion/eessi_data_object.py b/scripts/automated_ingestion/eessi_data_object.py new file mode 100644 index 00000000..4989f24c --- /dev/null +++ b/scripts/automated_ingestion/eessi_data_object.py @@ -0,0 +1,344 @@ +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + +import configparser +import subprocess + +from eessi_logging import log_function_entry_exit, log_message, LoggingScope +from eessi_remote_storage_client import DownloadMode, EESSIRemoteStorageClient + + +@dataclass +class EESSIDataAndSignatureObject: + """Class representing an EESSI data file and its signature in remote storage and locally.""" + + # configuration + config: configparser.ConfigParser + + # remote paths + remote_file_path: str # path to data file in remote storage + remote_sig_path: str # path to signature file in remote storage + + # local paths + local_file_path: Path # path to local data file + local_sig_path: Path # path to local signature file + + # remote storage client + remote_client: EESSIRemoteStorageClient + + @log_function_entry_exit() + def __init__( + self, + config: configparser.ConfigParser, + remote_file_path: str, + remote_client: EESSIRemoteStorageClient, + ): + """ + Initialize an EESSI data and signature object handler. 
+ + Args: + config: configuration object containing remote storage and local directory information + remote_file_path: path to data file in remote storage + remote_client: remote storage client implementing the EESSIRemoteStorageClient protocol + """ + self.config = config + self.remote_file_path = remote_file_path + sig_ext = config["signatures"]["signature_file_extension"] + self.remote_sig_path = remote_file_path + sig_ext + + # set up local paths + local_dir = Path(config["paths"]["download_dir"]) + # use the full remote path structure, removing any leading slashes + remote_path = remote_file_path.lstrip("/") + self.local_file_path = local_dir.joinpath(remote_path) + self.local_sig_path = local_dir.joinpath(remote_path + sig_ext) + self.remote_client = remote_client + + log_message(LoggingScope.DEBUG, "DEBUG", "Initialized EESSIDataAndSignatureObject for '%s'", remote_file_path) + log_message(LoggingScope.DEBUG, "DEBUG", "Local file path: '%s'", self.local_file_path) + log_message(LoggingScope.DEBUG, "DEBUG", "Local signature path: '%s'", self.local_sig_path) + + @log_function_entry_exit() + def _get_etag_file_path(self, local_path: Path) -> Path: + """Get the path to the .etag file for a given local file.""" + return local_path.with_suffix(".etag") + + @log_function_entry_exit() + def _get_local_etag(self, local_path: Path) -> Optional[str]: + """Get the ETag of a local file from its .etag file.""" + etag_path = self._get_etag_file_path(local_path) + if etag_path.exists(): + try: + with open(etag_path, "r") as f: + return f.read().strip() + except Exception as err: + log_message(LoggingScope.DEBUG, "WARNING", "Failed to read ETag file '%s': '%s'", etag_path, str(err)) + return None + return None + + @log_function_entry_exit() + def get_etags(self) -> tuple[Optional[str], Optional[str]]: + """ + Get the ETags of both the data file and its signature. + + Returns: + Tuple containing (data_file_etag, signature_file_etag) + """ + return ( + self._get_local_etag(self.local_file_path), + self._get_local_etag(self.local_sig_path) + ) + + @log_function_entry_exit() + def verify_signature(self) -> bool: + """ + Verify the signature of the data file using the corresponding signature file. 
+ + Returns: + bool: True if the signature is valid or if signatures are not required, False otherwise + """ + # check if signature file exists + if not self.local_sig_path.exists(): + log_message(LoggingScope.VERIFICATION, "WARNING", "Signature file '%s' is missing", + self.local_sig_path) + + # if signatures are required, return failure + if self.config["signatures"].getboolean("signatures_required", True): + log_message(LoggingScope.ERROR, "ERROR", "Signature file '%s' is missing and signatures are required", + self.local_sig_path) + return False + else: + log_message(LoggingScope.VERIFICATION, "INFO", + "Signature file '%s' is missing, but signatures are not required", + self.local_sig_path) + return True + + # if signatures are provided, we should always verify them, regardless of the signatures_required setting + verify_runenv = self.config["signatures"]["signature_verification_runenv"].split() + verify_script = self.config["signatures"]["signature_verification_script"] + allowed_signers_file = self.config["signatures"]["allowed_signers_file"] + + # check if verification tools exist + if not Path(verify_script).exists(): + log_message(LoggingScope.ERROR, "ERROR", + "Unable to verify signature: verification script '%s' does not exist", verify_script) + return False + + if not Path(allowed_signers_file).exists(): + log_message(LoggingScope.ERROR, "ERROR", + "Unable to verify signature: allowed signers file '%s' does not exist", allowed_signers_file) + return False + + # run the verification command with named parameters + cmd = verify_runenv + [ + verify_script, + "--verify", + "--allowed-signers-file", allowed_signers_file, + "--file", str(self.local_file_path), + "--signature-file", str(self.local_sig_path) + ] + log_message(LoggingScope.VERIFICATION, "INFO", "Running command: '%s'", " ".join(cmd)) + + try: + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode == 0: + log_message(LoggingScope.VERIFICATION, "INFO", + "Successfully verified signature for '%s'", self.local_file_path) + log_message(LoggingScope.VERIFICATION, "DEBUG", " stdout: '%s'", result.stdout) + log_message(LoggingScope.VERIFICATION, "DEBUG", " stderr: '%s'", result.stderr) + return True + else: + log_message(LoggingScope.ERROR, "ERROR", + "Signature verification failed for '%s'", self.local_file_path) + log_message(LoggingScope.ERROR, "ERROR", " stdout: '%s'", result.stdout) + log_message(LoggingScope.ERROR, "ERROR", " stderr: '%s'", result.stderr) + return False + except Exception as err: + log_message(LoggingScope.ERROR, "ERROR", + "Error during signature verification for '%s': '%s'", + self.local_file_path, str(err)) + return False + + @log_function_entry_exit() + def download(self, mode: DownloadMode = DownloadMode.CHECK_REMOTE) -> bool: + """ + Download data file and signature based on the specified mode. 
+ + Args: + mode: Download mode to use + + Returns: + True if files were downloaded, False otherwise + """ + # if mode is FORCE, we always download regardless of local or remote state + if mode == DownloadMode.FORCE: + should_download = True + log_message(LoggingScope.DOWNLOAD, "INFO", "Forcing download of '%s'", self.remote_file_path) + # for CHECK_REMOTE mode, check if we can optimize + elif mode == DownloadMode.CHECK_REMOTE: + # optimization: check if local files exist first + local_files_exist = ( + self.local_file_path.exists() and + self.local_sig_path.exists() + ) + + # if files don't exist locally, we can skip ETag checks + if not local_files_exist: + log_message(LoggingScope.DOWNLOAD, "INFO", + "Local files missing, skipping ETag checks and downloading '%s'", + self.remote_file_path) + should_download = True + else: + # first check if we have local ETags + try: + local_file_etag = self._get_local_etag(self.local_file_path) + local_sig_etag = self._get_local_etag(self.local_sig_path) + + if local_file_etag: + log_message(LoggingScope.DOWNLOAD, "DEBUG", "Local file ETag: '%s'", local_file_etag) + else: + log_message(LoggingScope.DOWNLOAD, "DEBUG", "No local file ETag found") + if local_sig_etag: + log_message(LoggingScope.DOWNLOAD, "DEBUG", "Local signature ETag: '%s'", local_sig_etag) + else: + log_message(LoggingScope.DOWNLOAD, "DEBUG", "No local signature ETag found") + + # if we don't have local ETags, we need to download + if not local_file_etag or not local_sig_etag: + should_download = True + log_message(LoggingScope.DOWNLOAD, "INFO", "Missing local ETags, downloading '%s'", + self.remote_file_path) + else: + # get remote ETags and compare + remote_file_etag = self.remote_client.get_metadata(self.remote_file_path)["ETag"] + remote_sig_etag = self.remote_client.get_metadata(self.remote_sig_path)["ETag"] + log_message(LoggingScope.DOWNLOAD, "DEBUG", "Remote file ETag: '%s'", remote_file_etag) + log_message(LoggingScope.DOWNLOAD, "DEBUG", "Remote signature ETag: '%s'", remote_sig_etag) + + should_download = ( + remote_file_etag != local_file_etag or + remote_sig_etag != local_sig_etag + ) + if should_download: + if remote_file_etag != local_file_etag: + log_message(LoggingScope.DOWNLOAD, "INFO", "File ETag changed from '%s' to '%s'", + local_file_etag, remote_file_etag) + if remote_sig_etag != local_sig_etag: + log_message(LoggingScope.DOWNLOAD, "INFO", "Signature ETag changed from '%s' to '%s'", + local_sig_etag, remote_sig_etag) + log_message(LoggingScope.DOWNLOAD, "INFO", "Remote files have changed, downloading '%s'", + self.remote_file_path) + else: + log_message(LoggingScope.DOWNLOAD, "INFO", + "Remote files unchanged, skipping download of '%s'", + self.remote_file_path) + except Exception as etag_err: + # if we get any error with ETags, we'll just download the files + log_message(LoggingScope.DOWNLOAD, "DEBUG", "Error handling ETags, will download files: '%s'", + str(etag_err)) + should_download = True + else: # check_local + should_download = ( + not self.local_file_path.exists() or + not self.local_sig_path.exists() + ) + if should_download: + if not self.local_file_path.exists(): + log_message(LoggingScope.DOWNLOAD, "INFO", "Local file missing: '%s'", self.local_file_path) + if not self.local_sig_path.exists(): + log_message(LoggingScope.DOWNLOAD, "INFO", "Local signature missing: '%s'", self.local_sig_path) + log_message(LoggingScope.DOWNLOAD, "INFO", "Local files missing, downloading '%s'", + self.remote_file_path) + else: + log_message(LoggingScope.DOWNLOAD, "INFO", 
"Local files exist, skipping download of '%s'", + self.remote_file_path) + + if not should_download: + return False + + # ensure local directory exists + self.local_file_path.parent.mkdir(parents=True, exist_ok=True) + + # download files + try: + # download the main file first + self.remote_client.download(self.remote_file_path, str(self.local_file_path)) + + # get and log the ETag of the downloaded file + try: + file_etag = self._get_local_etag(self.local_file_path) + log_message(LoggingScope.DOWNLOAD, "DEBUG", "Downloaded '%s' with ETag: '%s'", + self.remote_file_path, file_etag) + except Exception as etag_err: + log_message(LoggingScope.DOWNLOAD, "DEBUG", "Error getting ETag for '%s': '%s'", + self.remote_file_path, str(etag_err)) + + # try to download the signature file + try: + self.remote_client.download(self.remote_sig_path, str(self.local_sig_path)) + try: + sig_etag = self._get_local_etag(self.local_sig_path) + log_message(LoggingScope.DOWNLOAD, "DEBUG", "Downloaded '%s' with ETag: '%s'", + self.remote_sig_path, sig_etag) + except Exception as etag_err: + log_message(LoggingScope.DOWNLOAD, "DEBUG", "Error getting ETag for '%s': '%s'", + self.remote_sig_path, str(etag_err)) + log_message(LoggingScope.DOWNLOAD, "INFO", "Successfully downloaded '%s' and its signature", + self.remote_file_path) + except Exception as sig_err: + # check if signatures are required + if self.config["signatures"].getboolean("signatures_required", True): + # if signatures are required, clean up everything since we can't proceed + if self.local_file_path.exists(): + self.local_file_path.unlink() + # clean up etag files regardless of whether their data files exist + file_etag_path = self._get_etag_file_path(self.local_file_path) + if file_etag_path.exists(): + file_etag_path.unlink() + sig_etag_path = self._get_etag_file_path(self.local_sig_path) + if sig_etag_path.exists(): + sig_etag_path.unlink() + log_message(LoggingScope.ERROR, "ERROR", "Failed to download required signature for '%s': '%s'", + self.remote_file_path, str(sig_err)) + raise + else: + # if signatures are optional, just clean up any partial signature files + if self.local_sig_path.exists(): + self.local_sig_path.unlink() + sig_etag_path = self._get_etag_file_path(self.local_sig_path) + if sig_etag_path.exists(): + sig_etag_path.unlink() + log_message(LoggingScope.DOWNLOAD, "WARNING", + "Failed to download optional signature for '%s': '%s'", + self.remote_file_path, str(sig_err)) + log_message(LoggingScope.DOWNLOAD, "INFO", + "Successfully downloaded '%s' (signature optional)", + self.remote_file_path) + + return True + except Exception as err: + # this catch block is only for errors in the main file download + # clean up partially downloaded files and their etags + if self.local_file_path.exists(): + self.local_file_path.unlink() + if self.local_sig_path.exists(): + self.local_sig_path.unlink() + # clean up etag files regardless of whether their data files exist + file_etag_path = self._get_etag_file_path(self.local_file_path) + if file_etag_path.exists(): + file_etag_path.unlink() + sig_etag_path = self._get_etag_file_path(self.local_sig_path) + if sig_etag_path.exists(): + sig_etag_path.unlink() + log_message(LoggingScope.ERROR, "ERROR", "Failed to download '%s': '%s'", self.remote_file_path, str(err)) + raise + + @log_function_entry_exit() + def get_url(self) -> str: + """Get the URL of the data file.""" + return f"https://{self.remote_client.bucket}.s3.amazonaws.com/{self.remote_file_path}" + + def __str__(self) -> str: + """Return a 
string representation of the EESSI data and signature object.""" + return f"EESSIDataAndSignatureObject({self.remote_file_path})" diff --git a/scripts/automated_ingestion/ingest_bundles.py b/scripts/automated_ingestion/ingest_bundles.py index f8131213..fa40fb33 100644 --- a/scripts/automated_ingestion/ingest_bundles.py +++ b/scripts/automated_ingestion/ingest_bundles.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# from eessi_data_object import EESSIDataAndSignatureObject +from eessi_data_object import EESSIDataAndSignatureObject # from eessi_task import EESSITask, TaskState # from eessi_task_description import EESSITaskDescription from eessi_s3_bucket import EESSIS3Bucket @@ -170,8 +170,9 @@ def main(): for task_path in tasks: log_message(LoggingScope.GROUP_OPS, "INFO", "Processing task: '%s'", task_path) -# try: -# # Create EESSITask for the task file + try: + _ = EESSIDataAndSignatureObject(config, task_path, s3_bucket) +# # create EESSITask for the task file # try: # task = EESSITask( # EESSITaskDescription(EESSIDataAndSignatureObject(config, task_path, s3_bucket)), @@ -201,9 +202,9 @@ def main(): # "Task '%s': AFTER handle(): previous state = '%s', current state = '%s'", # task_path, previous_state.name, current_state.name) # -# except Exception as err: -# log_message(LoggingScope.ERROR, "ERROR", "Failed to process task %s: %s", task_path, str(err)) -# continue + except Exception as err: + log_message(LoggingScope.ERROR, "ERROR", "Failed to process task '%s': '%s'", task_path, str(err)) + continue @log_function_entry_exit() From 79ef2671616bef2050e52a5ac24a97941806b0b1 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 17:49:32 +0200 Subject: [PATCH 15/26] add class to model the description of a task --- .../eessi_task_description.py | 188 ++++++++++++++++++ scripts/automated_ingestion/ingest_bundles.py | 4 +- 2 files changed, 190 insertions(+), 2 deletions(-) create mode 100644 scripts/automated_ingestion/eessi_task_description.py diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py new file mode 100644 index 00000000..24cc6df7 --- /dev/null +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -0,0 +1,188 @@ +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Tuple + +import json + +from eessi_data_object import EESSIDataAndSignatureObject +from eessi_logging import log_function_entry_exit, log_message, LoggingScope +from eessi_remote_storage_client import DownloadMode + + +@dataclass +class EESSITaskDescription: + """Class representing an EESSI task to be performed, including its metadata and associated data files.""" + + # The EESSI data and signature object associated with this task + task_object: EESSIDataAndSignatureObject + + # Whether the signature was successfully verified + signature_verified: bool = False + + # Metadata from the task description file + metadata: Dict[str, Any] = None + + # task element + task: Dict[str, Any] = None + + # source element + source: Dict[str, Any] = None + + @log_function_entry_exit() + def __init__(self, task_object: EESSIDataAndSignatureObject): + """ + Initialize an EESSITaskDescription object. 
+ + Args: + task_object: The EESSI data and signature object associated with this task + """ + self.task_object = task_object + self.metadata = {} + + self.task_object.download(mode=DownloadMode.CHECK_REMOTE) + + # verify signature and set initial state + self.signature_verified = self.task_object.verify_signature() + + # try to read metadata (will only succeed if signature is verified) + try: + self._read_metadata() + except RuntimeError: + # expected if signature is not verified yet + pass + + # check if the task file contains a task field and add that to self + if "task" in self.metadata: + self.task = self.metadata["task"] + else: + self.task = None + + # check if the task file contains a link2pr field and add that to source element + if "link2pr" in self.metadata: + self.source = self.metadata["link2pr"] + else: + self.source = None + + @log_function_entry_exit() + def get_contents(self) -> str: + """ + Get the contents of the task description / metadata file. + """ + return self.raw_contents + + @log_function_entry_exit() + def get_metadata_filename_components(self) -> Tuple[str, str, str, str, str, str]: + """ + Get the components of the metadata file name. + + An example of the metadata file name is: + eessi-2023.06-software-linux-x86_64-amd-zen2-1745557626.tar.gz.meta.txt + + The components are: + eessi: some prefix + VERSION: 2023.06 + COMPONENT: software + OS: linux + ARCHITECTURE: x86_64-amd-zen2 + TIMESTAMP: 1745557626 + SUFFIX: tar.gz.meta.txt + + The ARCHITECTURE component can include one to two hyphens. + The SUFFIX is the part after the first dot (no other components should include dots). + """ + # obtain file name from local file path using basename + file_name = Path(self.task_object.local_file_path).name + # split file_name into part before suffix and the suffix + # idea: split on last hyphen, then split on first dot + suffix = file_name.split("-")[-1].split(".", 1)[1] + file_name_without_suffix = file_name.strip(f".{suffix}") + # from file_name_without_suffix determine VERSION (2nd element), COMPONENT (3rd element), OS (4th element), + # ARCHITECTURE (5th to second last elements) and TIMESTAMP (last element) + components = file_name_without_suffix.split("-") + version = components[1] + component = components[2] + os = components[3] + architecture = "-".join(components[4:-1]) + timestamp = components[-1] + return version, component, os, architecture, timestamp, suffix + + @log_function_entry_exit() + def get_metadata_value(self, key: str) -> str: + """ + Get the value of a key from the task description / metadata file. 
+ """ + # check that key is defined and has a length > 0 + if not key or len(key) == 0: + raise ValueError("get_metadata_value: key is not defined or has a length of 0") + + value = None + task = self.task + source = self.source + # check if key is in task or source + if task and key in task: + value = task[key] + log_message(LoggingScope.TASK_OPS, "INFO", + f"Value '{value}' for key '{key}' found in information from task metadata: {task}") + elif source and key in source: + value = source[key] + log_message(LoggingScope.TASK_OPS, "INFO", + f"Value '{value}' for key '{key}' found in information from source metadata: {source}") + else: + log_message(LoggingScope.TASK_OPS, "INFO", + f"Value for key '{key}' neither found in task metadata nor source metadata") + raise ValueError(f"Value for key '{key}' neither found in task metadata nor source metadata") + return value + + @log_function_entry_exit() + def get_pr_number(self) -> str: + """ + Get the PR number from the task description / metadata file. + """ + return self.get_metadata_value("pr") + + @log_function_entry_exit() + def get_repo_name(self) -> str: + """ + Get the repository name from the task description / metadata file. + """ + return self.get_metadata_value("repo") + + @log_function_entry_exit() + def get_task_file_name(self) -> str: + """ + Get the file name from the task description / metadata file. + """ + # get file name from remote file path using basename + file_name = Path(self.task_object.remote_file_path).name + return file_name + + @log_function_entry_exit() + def _read_metadata(self) -> None: + """ + Internal method to read and parse the metadata from the task description file. + Only reads metadata if the signature has been verified. + """ + if not self.signature_verified: + log_message(LoggingScope.ERROR, "ERROR", "Cannot read metadata: signature not verified for '%s'", + self.task_object.local_file_path) + raise RuntimeError("Cannot read metadata: signature not verified") + + try: + with open(self.task_object.local_file_path, "r") as file: + self.raw_contents = file.read() + self.metadata = json.loads(self.raw_contents) + log_message(LoggingScope.DEBUG, "DEBUG", "Successfully read metadata from '%s'", + self.task_object.local_file_path) + except json.JSONDecodeError as err: + log_message(LoggingScope.ERROR, "ERROR", "Failed to parse JSON in task description file '%s': '%s'", + self.task_object.local_file_path, str(err)) + raise + except Exception as err: + log_message(LoggingScope.ERROR, "ERROR", "Failed to read task description file '%s': '%s'", + self.task_object.local_file_path, str(err)) + raise + + @log_function_entry_exit() + def __str__(self) -> str: + """Return a string representation of the EESSITaskDescription object.""" + return f"EESSITaskDescription({self.task_object.local_file_path}, verified={self.signature_verified})" diff --git a/scripts/automated_ingestion/ingest_bundles.py b/scripts/automated_ingestion/ingest_bundles.py index fa40fb33..0662fc83 100644 --- a/scripts/automated_ingestion/ingest_bundles.py +++ b/scripts/automated_ingestion/ingest_bundles.py @@ -2,7 +2,7 @@ from eessi_data_object import EESSIDataAndSignatureObject # from eessi_task import EESSITask, TaskState -# from eessi_task_description import EESSITaskDescription +from eessi_task_description import EESSITaskDescription from eessi_s3_bucket import EESSIS3Bucket from eessi_logging import error, log_function_entry_exit, log_message, LoggingScope, LOG_LEVELS, set_logging_scopes from pid.decorator import pidfile # noqa: F401 @@ -171,7 
+171,7 @@ def main(): log_message(LoggingScope.GROUP_OPS, "INFO", "Processing task: '%s'", task_path) try: - _ = EESSIDataAndSignatureObject(config, task_path, s3_bucket) + _ = EESSITaskDescription(EESSIDataAndSignatureObject(config, task_path, s3_bucket)) # # create EESSITask for the task file # try: # task = EESSITask( From 519b94f56e77dd4d7d46e6e071acd65adf7d13e9 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 17:56:29 +0200 Subject: [PATCH 16/26] add class to model different types of actions on CVMFS repo --- scripts/automated_ingestion/eessi_task_action.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 scripts/automated_ingestion/eessi_task_action.py diff --git a/scripts/automated_ingestion/eessi_task_action.py b/scripts/automated_ingestion/eessi_task_action.py new file mode 100644 index 00000000..6f141435 --- /dev/null +++ b/scripts/automated_ingestion/eessi_task_action.py @@ -0,0 +1,12 @@ +from enum import Enum, auto + + +class EESSITaskAction(Enum): + NOP = auto() # perform no action + DELETE = auto() # perform a delete operation + ADD = auto() # perform an add operation + UPDATE = auto() # perform an update operation + UNKNOWN = auto() # unknown action + + def __str__(self): + return self.name.lower() From f3fce42cbcafbf8df234a089949ad6774c5fe03c Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 18:02:18 +0200 Subject: [PATCH 17/26] add class to model payload of a deployment task --- .../automated_ingestion/eessi_task_payload.py | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 scripts/automated_ingestion/eessi_task_payload.py diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py new file mode 100644 index 00000000..fe0db162 --- /dev/null +++ b/scripts/automated_ingestion/eessi_task_payload.py @@ -0,0 +1,109 @@ +from dataclasses import dataclass +from pathlib import PurePosixPath +from typing import Dict + +import os +import tarfile + +from eessi_data_object import EESSIDataAndSignatureObject +from eessi_logging import log_function_entry_exit +from eessi_remote_storage_client import DownloadMode + + +@dataclass +class EESSITaskPayload: + """Class representing an EESSI task payload (tarball/artifact) and its signature.""" + + # The EESSI data and signature object associated with this payload + payload_object: EESSIDataAndSignatureObject + + # Whether the signature was successfully verified + signature_verified: bool = False + + # possibly at a later point in time, we will add inferred metadata here + # such as the prefix in a tarball, the main elements, or which software + # package it includes + + @log_function_entry_exit() + def __init__(self, payload_object: EESSIDataAndSignatureObject): + """ + Initialize an EESSITaskPayload object. 
+ + Args: + payload_object: The EESSI data and signature object associated with this payload + """ + self.payload_object = payload_object + + # download the payload and its signature + self.payload_object.download(mode=DownloadMode.CHECK_REMOTE) + + # verify signature + self.signature_verified = self.payload_object.verify_signature() + + @log_function_entry_exit() + def analyse_contents(self, config: Dict) -> str: + """Analyse the contents of the payload and return a summary in a ready-to-use HTML format.""" + tar = tarfile.open(self.payload_object.local_file_path, "r") + members = tar.getmembers() + tar_num_members = len(members) + paths = sorted([m.path for m in members]) + + if tar_num_members < 100: + tar_members_desc = "Full listing of the contents of the tarball:" + members_list = paths + + else: + tar_members_desc = "Summarized overview of the contents of the tarball:" + # determine prefix after filtering out '/init' subdirectory, + # to get actual prefix for specific CPU target (like '2023.06/software/linux/aarch64/neoverse_v1') + init_subdir = os.path.join("*", "init") + non_init_paths = sorted( + [path for path in paths if not any(parent.match(init_subdir) for parent in PurePosixPath(path).parents)] + ) + if non_init_paths: + prefix = os.path.commonprefix(non_init_paths) + else: + prefix = os.path.commonprefix(paths) + + # TODO: this only works for software tarballs, how to handle compat layer tarballs? + swdirs = [ # all directory names with the pattern: /software// + member.path + for member in members + if member.isdir() and PurePosixPath(member.path).match(os.path.join(prefix, "software", "*", "*")) + ] + modfiles = [ # all filenames with the pattern: /modules///*.lua + member.path + for member in members + if member.isfile() + and PurePosixPath(member.path).match(os.path.join(prefix, "modules", "*", "*", "*.lua")) + ] + other = [ # anything that is not in /software nor /modules + member.path + for member in members + if ( + not PurePosixPath(prefix).joinpath("software") in PurePosixPath(member.path).parents + and not PurePosixPath(prefix).joinpath("modules") in PurePosixPath(member.path).parents + ) + # if not fnmatch.fnmatch(m.path, os.path.join(prefix, 'software', '*')) + # and not fnmatch.fnmatch(m.path, os.path.join(prefix, 'modules', '*')) + ] + members_list = sorted(swdirs + modfiles + other) + + # construct the overview + overview = config["github"]["task_summary_payload_overview_template"].format( + tar_num_members=tar_num_members, + bucket_url=self.payload_object.remote_client.get_bucket_url(), + remote_file_path=self.payload_object.remote_file_path, + tar_members_desc=tar_members_desc, + tar_members="\n".join(members_list) + ) + + # make sure that the overview does not exceed Github's maximum length (65536 characters) + if len(overview) > 60000: + overview = overview[:60000] + "\n\nWARNING: output exceeded the maximum length and was truncated!\n```" + return overview + + @log_function_entry_exit() + def __str__(self) -> str: + """Return a string representation of the EESSITaskPayload object.""" + return f"EESSITaskPayload({self.payload_object.local_file_path}, verified={self.signature_verified})" From b37901229268d7f380d64d89600cab500e3ae853 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 19:04:28 +0200 Subject: [PATCH 18/26] add class modelling a deployment task --- scripts/automated_ingestion/eessi_task.py | 1393 +++++++++++++++++ scripts/automated_ingestion/ingest_bundles.py | 38 +- 2 files changed, 1411 insertions(+), 20 deletions(-) 
create mode 100644 scripts/automated_ingestion/eessi_task.py diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py new file mode 100644 index 00000000..f2369db6 --- /dev/null +++ b/scripts/automated_ingestion/eessi_task.py @@ -0,0 +1,1393 @@ +from enum import Enum, auto +from functools import total_ordering +from typing import Dict, List, Tuple, Optional, Any + +import base64 +import os +import subprocess +import traceback + +from eessi_data_object import EESSIDataAndSignatureObject +from eessi_logging import log_function_entry_exit, log_message, LoggingScope +from eessi_task_action import EESSITaskAction +from eessi_task_description import EESSITaskDescription +from eessi_task_payload import EESSITaskPayload +from utils import send_slack_message + +from github import Github, GithubException, InputGitTreeElement, UnknownObjectException +from github.Branch import Branch +from github.PullRequest import PullRequest + + +@total_ordering +class EESSITaskState(Enum): + UNDETERMINED = auto() # The task state was not determined yet + NEW_TASK = auto() # The task has been created but not yet processed + PAYLOAD_STAGED = auto() # The task's payload has been staged to the Stratum-0 + PULL_REQUEST = auto() # A PR for the task has been created or updated in some staging repository + APPROVED = auto() # The PR for the task has been approved + REJECTED = auto() # The PR for the task has been rejected + INGESTED = auto() # The task's payload has been applied to the target CernVM-FS repository + DONE = auto() # The task has been completed + + @classmethod + def from_string( + cls, name: str, default: Optional["EESSITaskState"] = None, case_sensitive: bool = False + ) -> "EESSITaskState": + log_message(LoggingScope.TASK_OPS, "INFO", "from_string: '%s'", name) + if case_sensitive: + to_return = cls.__members__.get(name, default) + log_message(LoggingScope.TASK_OPS, "INFO", "from_string will return: '%s'", to_return) + return to_return + + try: + to_return = cls[name.upper()] + log_message(LoggingScope.TASK_OPS, "INFO", "from_string will return: '%s'", to_return) + return to_return + except KeyError: + return default + + def __lt__(self, other): + if self.__class__ is other.__class__: + return self.value < other.value + return NotImplemented + + def __str__(self): + return self.name.upper() + + +class EESSITask: + description: EESSITaskDescription + payload: EESSITaskPayload + action: EESSITaskAction + git_repo: Github + config: Dict + + @log_function_entry_exit() + def __init__(self, description: EESSITaskDescription, config: Dict, cvmfs_repo: str, git_repo: Github): + self.description = description + self.config = config + self.cvmfs_repo = cvmfs_repo + self.git_repo = git_repo + self.action = self._determine_task_action() + + # define valid state transitions for all actions + # NOTE, for EESSITaskState.PULL_REQUEST, EESSITaskState.APPROVED must be the first element or + # _next_state() will not work correctly + self.valid_transitions = { + EESSITaskState.UNDETERMINED: [ + EESSITaskState.NEW_TASK, + EESSITaskState.PAYLOAD_STAGED, + EESSITaskState.PULL_REQUEST, + EESSITaskState.APPROVED, + EESSITaskState.REJECTED, + EESSITaskState.INGESTED, + EESSITaskState.DONE, + ], + EESSITaskState.NEW_TASK: [EESSITaskState.PAYLOAD_STAGED], + EESSITaskState.PAYLOAD_STAGED: [EESSITaskState.PULL_REQUEST], + EESSITaskState.PULL_REQUEST: [EESSITaskState.APPROVED, EESSITaskState.REJECTED], + EESSITaskState.APPROVED: [EESSITaskState.INGESTED], + EESSITaskState.REJECTED: [], # 
terminal state + EESSITaskState.INGESTED: [], # terminal state + EESSITaskState.DONE: [] # virtual terminal state, not used to write on GitHub + } + + self.payload = None + state = self.determine_state() + if state >= EESSITaskState.PAYLOAD_STAGED: + log_message(LoggingScope.TASK_OPS, "INFO", "initializing payload object in constructor for EESSITask") + self._init_payload_object() + + @log_function_entry_exit() + def _determine_task_action(self) -> EESSITaskAction: + """ + Determine the action type based on task description metadata. + """ + if "task" in self.description.metadata and "action" in self.description.metadata["task"]: + action_str = self.description.metadata["task"]["action"].lower() + if action_str == "nop": + return EESSITaskAction.NOP + elif action_str == "delete": + return EESSITaskAction.DELETE + elif action_str == "add": + return EESSITaskAction.ADD + elif action_str == "update": + return EESSITaskAction.UPDATE + return EESSITaskAction.UNKNOWN + + @log_function_entry_exit() + def _state_file_with_prefix_exists_in_repo_branch(self, file_path_prefix: str, branch_name: str = None) -> bool: + """ + Check if a file exists in a repository branch. + + Args: + file_path_prefix: the prefix of the file path + branch_name: the branch to check + + Returns: + True if a file with the prefix exists in the branch, False otherwise + """ + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + # branch = self._get_branch_from_name(branch_name) + try: + # get all files in directory part of file_path_prefix + directory_part = os.path.dirname(file_path_prefix) + files = self.git_repo.get_contents(directory_part, ref=branch_name) + log_msg = "Found files %s in directory %s in branch %s" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, files, directory_part, branch_name) + # check if any of the files has file_path_prefix as prefix + for file in files: + if file.path.startswith(file_path_prefix): + log_msg = "Found file %s in directory %s in branch %s" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file.path, directory_part, branch_name) + return True + log_msg = "No file with prefix %s found in directory %s in branch %s" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file_path_prefix, directory_part, branch_name) + return False + except UnknownObjectException: + # file_path does not exist in branch + log_msg = "Directory %s or file with prefix %s does not exist in branch %s" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, directory_part, file_path_prefix, branch_name) + return False + except GithubException as err: + if err.status == 404: + # file_path does not exist in branch + log_msg = "Directory %s or file with prefix %s does not exist in branch %s" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, directory_part, file_path_prefix, branch_name) + return False + else: + # if there was some other (e.g. connection) issue, log message and return False + log_msg = 'Unable to determine the state of %s, the GitHub API returned status %s!' + log_message(LoggingScope.ERROR, 'WARNING', log_msg, self.object, err.status) + return False + return False + + @log_function_entry_exit() + def _determine_sequence_numbers_including_task_file(self, repo: str, pr: str) -> Dict[int, bool]: + """ + Determines in which sequence numbers the metadata/task file is included and in which it is not. 
+ NOTE, we only need to check the default branch of the repository, because a for a new task a file + is added to the default branch and for the subsequent processing of the task we use a different branch. + Thus, until the PR is closed, the task file stays in the default branch. + + Args: + repo: the repository name + pr: the pull request number + + Returns: + A dictionary with the sequence numbers as keys and a boolean value indicating if the metadata/task file is + included in that sequence number. + + Idea: + - The deployment for a single source PR could be split into multiple staging PRs each is assigned a unique + sequence number. + - For a given source PR (identified by the repo name and the PR number), a staging PR using a branch named + `REPO/PR_NUM/SEQ_NUM` is created. + - In the staging repo we create a corresponding directory `REPO/PR_NUM/SEQ_NUM`. + - If a metadata/task file is handled by the staging PR with sequence number, it is included in that directory. + - We iterate over all directories under `REPO/PR_NUM`: + - If the metadata/task file is available in the directory, we add the sequence number to the list. + + Note: this is a placeholder for now, as we do not know yet if we need to use a sequence number. + """ + sequence_numbers = {} + repo_pr_dir = f"{repo}/{pr}" + # iterate over all directories under repo_pr_dir + try: + directories = self._list_directory_contents(repo_pr_dir) + for dir in directories: + # check if the directory is a number + if dir.name.isdigit(): + # determine if a state file with prefix exists in the sequence number directory + # we need to use the basename of the remote file path + remote_file_path_basename = os.path.basename(self.description.task_object.remote_file_path) + state_file_name_prefix = f"{repo_pr_dir}/{dir.name}/{remote_file_path_basename}" + if self._state_file_with_prefix_exists_in_repo_branch(state_file_name_prefix): + sequence_numbers[int(dir.name)] = True + else: + sequence_numbers[int(dir.name)] = False + else: + # directory is not a number, so we skip it + continue + except FileNotFoundError: + # repo_pr_dir does not exist, so we return an empty dictionary + return {} + except GithubException as err: + if err.status != 404: # 404 is catched by FileNotFoundError + # some other error than the directory not existing + return {} + return sequence_numbers + + @log_function_entry_exit() + def _find_highest_number(self, str_list: List[str]) -> int: + """ + Find the highest number in a list of strings. + """ + # Convert all strings to integers + int_list = [int(num) for num in str_list] + return max(int_list) + + @log_function_entry_exit() + def _get_sequence_number_for_task_file(self) -> int: + """ + Get the sequence number this task is assigned to at the moment. + NOTE, should only be called if the task is actually assigned to a sequence number. 
+ """ + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + sequence_numbers = self._determine_sequence_numbers_including_task_file(repo_name, pr_number) + if len(sequence_numbers) == 0: + raise ValueError("Found no sequence numbers at all") + else: + # get all entries with value True, there should be only one, so we return the first one + sequence_numbers_true = [key for key, value in sequence_numbers.items() if value is True] + if len(sequence_numbers_true) == 0: + raise ValueError("Found no sequence numbers that include the task file for task %s", + self.description) + else: + return sequence_numbers_true[0] + + @log_function_entry_exit() + def _get_current_sequence_number(self, sequence_numbers: Dict[int, bool] = None) -> int: + """ + Get the current sequence number based on the sequence numbers. + If sequence_numbers is not provided, we determine the sequence numbers from the task description. + """ + if sequence_numbers is None: + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + sequence_numbers = self._determine_sequence_numbers_including_task_file(repo_name, pr_number) + if len(sequence_numbers) == 0: + return 0 + return self._find_highest_number(sequence_numbers.keys()) + + @log_function_entry_exit() + def _get_fixed_sequence_number(self) -> int: + """ + Get a fixed sequence number. + """ + return 11 + + @log_function_entry_exit() + def _find_staging_pr(self) -> Tuple[Optional[PullRequest], Optional[str], Optional[int]]: + """ + Find the staging PR for the task. + TODO: arg sequence number --> make function simpler + """ + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + try: + sequence_number = self._get_sequence_number_for_task_file() + except ValueError: + # no sequence number found, so we return None + log_message(LoggingScope.ERROR, "ERROR", "no sequence number found for task '%s'", self.description) + return None, None, None + except Exception as err: + # some other error + log_message(LoggingScope.ERROR, "ERROR", "error finding staging PR for task '%s': '%s'", + self.description, err) + return None, None, None + branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" + if branch_name in [branch.name for branch in self.git_repo.get_branches()]: + find_pr = [pr for pr in self.git_repo.get_pulls(head=branch_name, state="all")] + if find_pr: + pr = find_pr.pop(0) + return pr, branch_name, sequence_number + else: + return None, branch_name, sequence_number + else: + return None, None, None + + @log_function_entry_exit() + def _create_staging_pr(self, sequence_number: int) -> Tuple[PullRequest, str]: + """ + Create a staging PR for the task. + NOTE, SHALL only be called if no staging PR for the task exists yet. + """ + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" + default_branch_name = self.git_repo.default_branch + pr = self.git_repo.create_pull(title=f"Add task for {repo_name} PR {pr_number} seq {sequence_number}", + body=f"Add task for {repo_name} PR {pr_number} seq {sequence_number}", + head=branch_name, base=default_branch_name) + return pr, branch_name + + @log_function_entry_exit() + def _find_state(self) -> EESSITaskState: + """ + Determine the state of the task based on the task description metadata. + + Returns: + The state of the task. 
+ """ + # obtain repo and pr from metadata + log_message(LoggingScope.TASK_OPS, "INFO", "finding state of task '%s'", self.description.task_object) + repo = self.description.get_repo_name() + pr = self.description.get_pr_number() + log_message(LoggingScope.TASK_OPS, "INFO", "repo: '%s', pr: '%s'", repo, pr) + + # obtain all sequence numbers in repo/pr dir which include a state file for this task + sequence_numbers = self._determine_sequence_numbers_including_task_file(repo, pr) + if len(sequence_numbers) == 0: + # no sequence numbers found, so we return NEW_TASK + log_message(LoggingScope.TASK_OPS, "INFO", "no sequence numbers found, state: NEW_TASK") + return EESSITaskState.NEW_TASK + # we got at least one sequence number + # if one value for a sequence number is True, we can determine the state from the file in the directory + sequence_including_task = [key for key, value in sequence_numbers.items() if value is True] + if len(sequence_including_task) == 0: + # no sequence number includes the task file, so we return NEW_TASK + log_message(LoggingScope.TASK_OPS, "INFO", "no sequence number includes the task file, state: NEW_TASK") + return EESSITaskState.NEW_TASK + # we got at least one sequence number which includes the task file + # we can determine the state from the filename in the directory + # NOTE, we use the first element in sequence_including_task (there should be only one) + # we ignore other elements in sequence_including_task + sequence_number = sequence_including_task[0] + task_file_name = self.description.get_task_file_name() + metadata_file_state_path_prefix = f"{repo}/{pr}/{sequence_number}/{task_file_name}." + state = self._get_state_for_metadata_file_prefix(metadata_file_state_path_prefix, sequence_number) + log_message(LoggingScope.TASK_OPS, "INFO", "state: '%s'", state) + return state + + @log_function_entry_exit() + def _get_state_for_metadata_file_prefix(self, metadata_file_state_path_prefix: str, + sequence_number: int) -> EESSITaskState: + """ + Get the state from the file in the metadata_file_state_path_prefix. 
+ """ + # depending on the state of the deployment (NEW_TASK, PAYLOAD_STAGED, PULL_REQUEST, APPROVED, REJECTED, + # INGESTED, DONE) + # we need to check the task file in the default branch or in the branch corresponding to the sequence number + directory_part = os.path.dirname(metadata_file_state_path_prefix) + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + default_branch_name = self.git_repo.default_branch + branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" + all_branch_names = [branch.name for branch in self.git_repo.get_branches()] + states = [] + for branch in [default_branch_name, branch_name]: + if branch in all_branch_names: + # first get all files in directory part of metadata_file_state_path_prefix + files = self._list_directory_contents(directory_part, branch) + # check if any of the files has metadata_file_state_path_prefix as prefix + for file in files: + if file.path.startswith(metadata_file_state_path_prefix): + # get state from file name taking only the suffix + state = EESSITaskState.from_string(file.name.split(".")[-1]) + log_message(LoggingScope.TASK_OPS, "INFO", "state: '%s'", state) + states.append(state) + if len(states) == 0: + # did not find any file with metadata_file_state_path_prefix as prefix + log_message(LoggingScope.TASK_OPS, "INFO", "did not find any file with prefix '%s'", + metadata_file_state_path_prefix) + return EESSITaskState.NEW_TASK + # sort the states and return the last one + states.sort() + state = states[-1] + log_message(LoggingScope.TASK_OPS, "INFO", "state: '%s'", state) + return state + + @log_function_entry_exit() + def _list_directory_contents(self, directory_path: str, branch_name: str = None) -> List[Any]: + """ + List the contents of a directory in a branch. + """ + try: + # Get contents of the directory + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + log_message(LoggingScope.TASK_OPS, "INFO", "listing contents of '%s' in branch '%s'", + directory_path, branch_name) + contents = self.git_repo.get_contents(directory_path, ref=branch_name) + + # If contents is a list, it means we successfully got directory contents + if isinstance(contents, list): + return contents + else: + # If it's not a list, it means the path is not a directory + raise ValueError(f"'{directory_path}' is not a directory") + except GithubException as err: + if err.status == 404: + raise FileNotFoundError(f"Directory not found: '{directory_path}'") + raise err + + @log_function_entry_exit() + def _next_state(self, state: EESSITaskState = None) -> EESSITaskState: + """ + Determine the next state based on the current state using the valid_transitions dictionary. + + NOTE, it assumes that function is only called for non-terminal states and that the next state is the first + element of the list returned by the valid_transitions dictionary. + """ + the_state = state if state is not None else self.determine_state() + return self.valid_transitions[the_state][0] + + @log_function_entry_exit() + def _path_exists_in_branch(self, path: str, branch_name: str = None) -> bool: + """ + Check if a path exists in a branch. 
+ """ + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + try: + self.git_repo.get_contents(path, ref=branch_name) + return True + except GithubException as err: + if err.status == 404: + return False + else: + raise err + + @log_function_entry_exit() + def _read_dict_from_string(self, content: str) -> dict: + """ + Read the dictionary from the string. + """ + config_dict = {} + for line in content.strip().split("\n"): + if "=" in line and not line.strip().startswith("#"): # Skip comments + key, value = line.split("=", 1) # Split only on first '=' + config_dict[key.strip()] = value.strip() + return config_dict + + @log_function_entry_exit() + def _read_pull_request_dir_from_file(self, task_pointer_file: str = None, branch_name: str = None) -> str: + """ + Read the pull request directory from the file in the given branch. + """ + # set default values for task pointer file and branch name + if task_pointer_file is None: + task_pointer_file = self.description.task_object.remote_file_path + if branch_name is None: + branch_name = self.git_repo.default_branch + log_message(LoggingScope.TASK_OPS, "INFO", "reading pull request directory from file '%s' in branch '%s'", + task_pointer_file, branch_name) + + # read the pull request directory from the file in the given branch + content = self.git_repo.get_contents(task_pointer_file, ref=branch_name) + + # Decode the content from base64 + content_str = content.decoded_content.decode("utf-8") + + # Parse into dictionary + config_dict = self._read_dict_from_string(content_str) + + target_dir = config_dict.get("target_dir", None) + return config_dict.get("pull_request_dir", target_dir) + + @log_function_entry_exit() + def _determine_pull_request_dir(self, task_pointer_file: str = None, branch_name: str = None) -> str: + """Determine the pull request directory via the task pointer file""" + return self._read_pull_request_dir_from_file(task_pointer_file=task_pointer_file, branch_name=branch_name) + + @log_function_entry_exit() + def _get_branch_from_name(self, branch_name: str = None) -> Optional[Branch]: + """ + Get a branch object from its name. + """ + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + + try: + branch = self.git_repo.get_branch(branch_name) + log_message(LoggingScope.TASK_OPS, "INFO", "branch '%s' exists: '%s'", branch_name, branch) + return branch + except Exception as err: + log_message(LoggingScope.TASK_OPS, "ERROR", "error checking if branch '%s' exists: '%s'", + branch_name, err) + return None + + @log_function_entry_exit() + def _read_task_state_from_file(self, path: str, branch_name: str = None) -> EESSITaskState: + """ + Read the task state from the file in the given branch. + """ + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + content = self.git_repo.get_contents(path, ref=branch_name) + + # Decode the content from base64 + content_str = content.decoded_content.decode("utf-8").strip() + log_message(LoggingScope.TASK_OPS, "INFO", "content in TaskState file: '%s'", content_str) + + task_state = EESSITaskState.from_string(content_str) + log_message(LoggingScope.TASK_OPS, "INFO", "task state: '%s'", task_state) + + return task_state + + @log_function_entry_exit() + def determine_state(self, branch: str = None) -> EESSITaskState: + """ + Determine the state of the task based on the state of the staging repository. 
+ """ + # check if path representing the task file exists in the default branch or the "feature" branch + task_pointer_file = self.description.task_object.remote_file_path + branch_to_use = self.git_repo.default_branch if branch is None else branch + + if self._path_exists_in_branch(task_pointer_file, branch_name=branch_to_use): + log_message(LoggingScope.TASK_OPS, "INFO", "path '%s' exists in branch '%s'", + task_pointer_file, branch_to_use) + + # get state from task file in branch to use + # - read the EESSITaskState file in pull request directory + pull_request_dir = self._determine_pull_request_dir(branch_name=branch_to_use) + log_message(LoggingScope.TASK_OPS, "INFO", "pull request directory: '%s'", pull_request_dir) + task_state_file_path = f"{pull_request_dir}/TaskState" + log_message(LoggingScope.TASK_OPS, "INFO", "task state file path: '%s'", task_state_file_path) + task_state = self._read_task_state_from_file(task_state_file_path, branch_to_use) + + log_message(LoggingScope.TASK_OPS, "INFO", "task state in branch '%s': %s", + branch_to_use, task_state) + return task_state + else: + log_message(LoggingScope.TASK_OPS, "INFO", "path '%s' does not exist in branch '%s'", + task_pointer_file, branch_to_use) + return EESSITaskState.UNDETERMINED + + @log_function_entry_exit() + def handle(self): + """ + Dynamically find and execute the appropriate handler based on action and state. + """ + state_before_handle = self.determine_state() + + # Construct handler method name + handler_name = f"_handle_{self.action}_{str(state_before_handle).lower()}" + + # Check if the handler exists + handler = getattr(self, handler_name, None) + + if handler and callable(handler): + # Execute the handler if it exists + return handler() + else: + # Default behavior for missing handlers + log_message(LoggingScope.TASK_OPS, "ERROR", + "No handler for action '%s' and state '%s' implemented; nothing to be done", + self.action, state_before_handle) + return state_before_handle + + # Implement handlers for ADD action + @log_function_entry_exit() + def _safe_create_file(self, path: str, message: str, content: str, branch_name: str = None): + """Create a file in the given branch.""" + try: + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + existing_file = self.git_repo.get_contents(path, ref=branch_name) + log_message(LoggingScope.TASK_OPS, "INFO", "File '%s' already exists", path) + return existing_file + except GithubException as err: + if err.status == 404: # File doesn't exist + # Safe to create + return self.git_repo.create_file(path, message, content, branch=branch_name) + else: + raise err # Some other error + + @log_function_entry_exit() + def _create_multi_file_commit(self, files_data, commit_message, branch_name: str = None): + """ + Create a commit with multiple file changes + + files_data: dict with structure: + { + "path/to/file1.txt": { + "content": "file content", + "mode": "100644" # optional, defaults to 100644 + }, + "path/to/file2.py": { + "content": "print('hello')", + "mode": "100644" + } + } + """ + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + ref = self.git_repo.get_git_ref(f"heads/{branch_name}") + current_commit = self.git_repo.get_git_commit(ref.object.sha) + base_tree = current_commit.tree + + # Create tree elements + tree_elements = [] + for file_path, file_info in files_data.items(): + content = file_info["content"] + if isinstance(content, str): + content = content.encode("utf-8") + + blob = 
self.git_repo.create_git_blob( + base64.b64encode(content).decode("utf-8"), + "base64" + ) + tree_elements.append(InputGitTreeElement( + path=file_path, + mode=file_info.get("mode", "100644"), + type="blob", + sha=blob.sha + )) + + # Create new tree + new_tree = self.git_repo.create_git_tree(tree_elements, base_tree) + + # Create commit + new_commit = self.git_repo.create_git_commit( + commit_message, + new_tree, + [current_commit] + ) + + # Update branch reference + ref.edit(new_commit.sha) + + return new_commit + + @log_function_entry_exit() + def _update_file( + self, file_path: str, new_content: str, commit_message: str, branch_name: str = None + ) -> Optional[Dict]: + try: + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + + # get the current file + file = self.git_repo.get_contents(file_path, ref=branch_name) + + # update the file + result = self.git_repo.update_file( + path=file_path, + message=commit_message, + content=new_content, + sha=file.sha, + branch=branch_name + ) + + log_message(LoggingScope.TASK_OPS, "INFO", + "File updated successfully. Commit SHA: '%s'", result["commit"].sha) + return result + + except Exception as err: + log_message(LoggingScope.TASK_OPS, "ERROR", "Error updating file: '%s'", err) + return None + + @log_function_entry_exit() + def _sorted_list_of_sequence_numbers(self) -> List[int]: + """Create a sorted list of sequence numbers from the pull requests directory""" + # a pull request's directory is of the form REPO/PR/SEQ + # hence, we can get all sequence numbers from the pull requests directory REPO/PR + sequence_numbers = [] + repo_pr_dir = f"{self.description.get_repo_name()}/{self.description.get_pr_number()}" + + # iterate over all directories under repo_pr_dir + try: + directories = self._list_directory_contents(repo_pr_dir) + for dir in directories: + # check if the directory is a number + if dir.name.isdigit(): + sequence_numbers.append(int(dir.name)) + else: + # directory is not a number, so we skip it + continue + except FileNotFoundError: + # repo_pr_dir does not exist, so we return an empty dictionary + log_message(LoggingScope.TASK_OPS, "ERROR", "Pull requests directory '%s' does not exist", repo_pr_dir) + except GithubException as err: + if err.status != 404: # 404 is catched by FileNotFoundError + # some other error than the directory not existing + log_message(LoggingScope.TASK_OPS, "ERROR", + "Some other error than the directory not existing: '%s'", err) + except Exception as err: + log_message(LoggingScope.TASK_OPS, "ERROR", "Unexpected error: '%s'", err) + + return sorted(sequence_numbers) + + @log_function_entry_exit() + def _determine_sequence_number(self) -> int: + """Determine the sequence number for the task""" + + sequence_numbers = self._sorted_list_of_sequence_numbers() + log_message(LoggingScope.TASK_OPS, "INFO", "number of sequence numbers: %d", len(sequence_numbers)) + if len(sequence_numbers) == 0: + return 0 + + log_message(LoggingScope.TASK_OPS, "INFO", "sequence numbers: [%s]", ", ".join(map(str, sequence_numbers))) + + # get the highest sequence number + highest_sequence_number = sequence_numbers[-1] + log_message(LoggingScope.TASK_OPS, "INFO", "highest sequence number: %d", highest_sequence_number) + + pull_request = self._find_pr_for_sequence_number(highest_sequence_number) + log_message(LoggingScope.TASK_OPS, "INFO", "pull request: '%s'", pull_request) + + if pull_request is None: + log_message(LoggingScope.TASK_OPS, "INFO", "Did not find pull request for sequence number %d", + 
highest_sequence_number) + # the directory for the sequence number exists but no PR yet + return highest_sequence_number + else: + log_message(LoggingScope.TASK_OPS, "INFO", "pull request found: '%s'", pull_request) + log_message(LoggingScope.TASK_OPS, "INFO", "pull request state/merged: '%s/%s'", + pull_request.state, str(pull_request.is_merged())) + if pull_request.is_merged(): + # the PR is merged, so we use the next sequence number + return highest_sequence_number + 1 + else: + # the PR is not merged, so we can use the current sequence number + return highest_sequence_number + + @log_function_entry_exit() + def _handle_add_undetermined(self): + """Handler for ADD action in UNDETERMINED state""" + log_message(LoggingScope.TASK_OPS, "INFO", "Handling ADD action in UNDETERMINED state: '%s'", + self.description.get_task_file_name()) + # task is in state UNDETERMINED if there is no pull request directory for the task yet + # + # create pull request directory (REPO/PR/SEQ/TASK_FILE_NAME/) + # create task file in pull request directory (PULL_REQUEST_DIR/TaskDescription) + # create task status file in pull request directory (PULL_REQUEST_DIR/TaskState) + # create pointer file from task file path to pull request directory (remote_file_path -> PULL_REQUEST_DIR) + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + sequence_number = self._determine_sequence_number() # corresponds to an open or yet to be created PR + task_file_name = self.description.get_task_file_name() + # we cannot use self._determine_pull_request_dir() here because it requires a task pointer file + # and we don't have one yet + pull_request_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" + task_description_file_path = f"{pull_request_dir}/TaskDescription" + task_state_file_path = f"{pull_request_dir}/TaskState" + remote_file_path = self.description.task_object.remote_file_path + + files_to_commit = { + task_description_file_path: { + "content": self.description.get_contents(), + "mode": "100644" + }, + task_state_file_path: { + "content": f"{EESSITaskState.NEW_TASK.name}\n", + "mode": "100644" + }, + remote_file_path: { + "content": f"remote_file_path = {remote_file_path}\npull_request_dir = {pull_request_dir}", + "mode": "100644" + } + } + + branch_name = self.git_repo.default_branch + try: + commit = self._create_multi_file_commit( + files_to_commit, + f"new task for {repo_name} PR {pr_number} seq {sequence_number}", + branch_name=branch_name + ) + log_message(LoggingScope.TASK_OPS, "INFO", "commit created: '%s'", commit) + except Exception as err: + log_message(LoggingScope.TASK_OPS, "ERROR", "Error creating commit: '%s'", err) + # TODO: rollback previous changes (task description file, task state file) + return EESSITaskState.UNDETERMINED + + # TODO: verify that the sequence number is still valid (PR corresponding to the sequence number + # is still open or yet to be created); if it is not valid, perform corrective actions + return EESSITaskState.NEW_TASK + + @log_function_entry_exit() + def _update_task_state_file(self, next_state: EESSITaskState, branch_name: str = None) -> Optional[Dict]: + """Update the TaskState file content in default or given branch""" + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + + task_pointer_file = self.description.task_object.remote_file_path + pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, branch_name) + task_state_file_path = f"{pull_request_dir}/TaskState" + 
arch = self.description.get_metadata_filename_components()[3] + commit_message = f"change task state to {next_state} in {branch_name} for {arch}" + result = self._update_file(task_state_file_path, + f"{next_state.name}\n", + commit_message, + branch_name=branch_name) + return result + + @log_function_entry_exit() + def _init_payload_object(self): + """Initialize the payload object""" + if self.payload is not None: + log_message(LoggingScope.TASK_OPS, "INFO", "payload object already initialized") + return + + # get name of of payload from metadata + payload_name = self.description.metadata["payload"]["filename"] + log_message(LoggingScope.TASK_OPS, "INFO", "payload_name: '%s'", payload_name) + + # get config and remote_client from self.description.task_object + config = self.description.task_object.config + remote_client = self.description.task_object.remote_client + + # determine remote_file_path by replacing basename of remote_file_path in self.description.task_object + # with payload_name + description_remote_file_path = self.description.task_object.remote_file_path + payload_remote_file_path = os.path.join(os.path.dirname(description_remote_file_path), payload_name) + log_message(LoggingScope.TASK_OPS, "INFO", "payload_remote_file_path: '%s'", payload_remote_file_path) + + # initialize payload object + payload_object = EESSIDataAndSignatureObject(config, payload_remote_file_path, remote_client) + self.payload = EESSITaskPayload(payload_object) + log_message(LoggingScope.TASK_OPS, "INFO", "payload: '%s'", self.payload) + + @log_function_entry_exit() + def _handle_add_new_task(self): + """Handler for ADD action in NEW_TASK state""" + log_message(LoggingScope.TASK_OPS, "INFO", "Handling ADD action in NEW_TASK state: '%s'", + self.description.get_task_file_name()) + # determine next state + next_state = self._next_state(EESSITaskState.NEW_TASK) + log_message(LoggingScope.TASK_OPS, "INFO", "next_state: '%s'", next_state) + + # initialize payload object + self._init_payload_object() + + # update TaskState file content + self._update_task_state_file(next_state) + + # TODO: verify that the sequence number is still valid (PR corresponding to the sequence number + # is still open or yet to be created); if it is not valid, perform corrective actions + return next_state + + @log_function_entry_exit() + def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: + """ + Find the single PR for the given branch in any state. 
+ + Args: + repo: GitHub repository + branch_name: Name of the branch + + Returns: + PullRequest object if found, None otherwise + """ + try: + prs = [pr for pr in list(self.git_repo.get_pulls(state="all")) + if pr.head.ref == branch_name] + log_message(LoggingScope.TASK_OPS, "INFO", "number of PRs found: %d", len(prs)) + if len(prs): + log_message(LoggingScope.TASK_OPS, "INFO", "1st PR found: %d, '%s'", prs[0].number, prs[0].head.ref) + return prs[0] if prs else None + except Exception as err: + log_message(LoggingScope.TASK_OPS, "ERROR", "Error finding PR for branch '%s': '%s'", branch_name, err) + return None + + @log_function_entry_exit() + def _find_pr_for_sequence_number(self, sequence_number: int) -> Optional[PullRequest]: + """Find the PR for the given sequence number""" + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + feature_branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" + + # list all PRs with head_ref starting with the feature branch name without the sequence number + last_dash = feature_branch_name.rfind("-") + if last_dash != -1: + head_ref_wout_seq_num = feature_branch_name[:last_dash + 1] # +1 to include the separator + else: + head_ref_wout_seq_num = feature_branch_name + + log_message(LoggingScope.TASK_OPS, "INFO", + "searching for PRs whose head_ref starts with: '%s'", head_ref_wout_seq_num) + + all_prs = [pr for pr in list(self.git_repo.get_pulls(state="all")) + if pr.head.ref.startswith(head_ref_wout_seq_num)] + log_message(LoggingScope.TASK_OPS, "INFO", " number of PRs found: %d", len(all_prs)) + for pr in all_prs: + log_message(LoggingScope.TASK_OPS, "INFO", " PR #%d: '%s'", pr.number, pr.head.ref) + + # now, find the PR for the feature branch name (if any) + log_message(LoggingScope.TASK_OPS, "INFO", + "searching PR for feature branch name: '%s'", feature_branch_name) + pull_request = self._find_pr_for_branch(feature_branch_name) + log_message(LoggingScope.TASK_OPS, "INFO", "pull request for branch '%s': '%s'", + feature_branch_name, pull_request) + return pull_request + + @log_function_entry_exit() + def _determine_sequence_number_from_pull_request_directory(self) -> int: + """Determine the sequence number from the pull request directory name""" + task_pointer_file = self.description.task_object.remote_file_path + pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, self.git_repo.default_branch) + # pull_request_dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ (REPO contains a '/' separating the org and repo) + _, _, _, seq, _ = pull_request_dir.split("/") + return int(seq) + + @log_function_entry_exit() + def _determine_feature_branch_name(self) -> str: + """Determine the feature branch name from the pull request directory name""" + task_pointer_file = self.description.task_object.remote_file_path + pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, self.git_repo.default_branch) + # pull_request_dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ (REPO contains a '/' separating the org and repo) + org, repo, pr, seq, _ = pull_request_dir.split("/") + return f"{org}-{repo}-PR-{pr}-SEQ-{seq}" + + @log_function_entry_exit() + def _sync_task_state_file(self, source_branch: str, target_branch: str): + """Update task state file from source to target branch""" + task_pointer_file = self.description.task_object.remote_file_path + pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, self.git_repo.default_branch) + 
task_state_file_path = f"{pull_request_dir}/TaskState" + + try: + # get content from source branch + source_content = self.git_repo.get_contents(task_state_file_path, ref=source_branch) + + # get current file in target branch + target_file = self.git_repo.get_contents(task_state_file_path, ref=target_branch) + + # update if content is different + if source_content.sha != target_file.sha: + result = self.git_repo.update_file( + path=task_state_file_path, + message=f"Sync {task_state_file_path} from {source_branch} to {target_branch}", + content=source_content.decoded_content, + sha=target_file.sha, + branch=target_branch + ) + log_message(LoggingScope.TASK_OPS, "INFO", "Updated '%s'", task_state_file_path) + return result + else: + log_message(LoggingScope.TASK_OPS, "INFO", "No changes needed for '%s'", task_state_file_path) + return None + + except Exception as err: + log_message(LoggingScope.TASK_OPS, "ERROR", "Error syncing task state file: '%s'", err) + return None + + @log_function_entry_exit() + def _update_task_states(self, next_state: EESSITaskState, default_branch_name: str, + approved_state: EESSITaskState, feature_branch_name: str): + """ + Update task states in default and feature branches + + States have to be updated in a specific order and in particular the default branch has to be + merged into the feature branch before the feature branch can be updated to avoid a merge conflict. + + Args: + next_state: next state to be applied to the default branch + default_branch_name: name of the default branch + approved_state: state to be applied to the feature branch + feature_branch_name: name of the feature branch + """ + # TODO: add failure handling (capture failures and return them somehow) + + # update TaskState file content + # - next_state in default branch (interpreted as current state) + # - approved_state in feature branch (interpreted as future state, ie, after + # the PR corresponding to the feature branch will be merged) + + # first, update the task state file in the default branch + self._update_task_state_file(next_state, branch_name=default_branch_name) + + # second, merge default branch into feature branch (to avoid a merge conflict) + # TODO: store arch info (CPU+ACCEL) in task/metdata file and then access that rather + # than using a part of the file name + arch = self.description.get_metadata_filename_components()[3] + commit_message = f"merge {default_branch_name} into {feature_branch_name} for {arch}" + self.git_repo.merge( + head=default_branch_name, + base=feature_branch_name, + commit_message=commit_message + ) + + # last, update task state file in feature branch + self._update_task_state_file(approved_state, branch_name=feature_branch_name) + log_message(LoggingScope.TASK_OPS, "INFO", + "TaskState file updated to '%s' in default branch '%s' and to '%s' in feature branch '%s'", + next_state, default_branch_name, approved_state, feature_branch_name) + + @log_function_entry_exit() + def _create_task_summary(self) -> str: + """Analyse contents of current task and create a file for it in the REPO-PR-SEQ directory.""" + + # determine task summary file path in feature branch on GitHub + feature_branch_name = self._determine_feature_branch_name() + pull_request_dir = self._determine_pull_request_dir(branch_name=feature_branch_name) + task_summary_file_path = f"{pull_request_dir}/TaskSummary.html" + + # check if task summary file already exists in repo on GitHub + if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): + 
log_message(LoggingScope.TASK_OPS, "INFO", "task summary file already exists: '%s'", task_summary_file_path) + task_summary = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) + # return task_summary.decoded_content + return task_summary + + # create task summary + payload_name = self.description.metadata["payload"]["filename"] + payload_summary = self.payload.analyse_contents(self.config) + metadata_contents = self.description.get_contents() + + task_summary = self.config["github"]["task_summary_payload_template"].format( + payload_name=payload_name, + metadata_contents=metadata_contents, + payload_overview=payload_summary + ) + + # create HTML file with task summary in REPO-PR-SEQ directory + # TODO: add failure handling (capture result and act on it) + task_file_name = self.description.get_task_file_name() + commit_message = f"create summary for {task_file_name} in {feature_branch_name}" + self._safe_create_file(task_summary_file_path, commit_message, task_summary, + branch_name=feature_branch_name) + log_message(LoggingScope.TASK_OPS, "INFO", "task summary file created: '%s'", task_summary_file_path) + + # return task summary + return task_summary + + @log_function_entry_exit() + def _create_pr_contents_overview(self) -> str: + """Create a contents overview for the pull request""" + # TODO: implement + feature_branch_name = self._determine_feature_branch_name() + task_pointer_file = self.description.task_object.remote_file_path + pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, feature_branch_name) + pr_dir = os.path.dirname(pull_request_dir) + directories = self._list_directory_contents(pr_dir, feature_branch_name) + contents_overview = "" + if directories: + contents_overview += "\n" + for directory in directories: + task_summary_file_path = f"{pr_dir}/{directory.name}/TaskSummary.html" + if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): + file_contents = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) + task_summary = base64.b64decode(file_contents.content).decode("utf-8") + contents_overview += f"{task_summary}\n" + else: + contents_overview += f"Task summary file not found: {task_summary_file_path}\n" + contents_overview += "\n" + else: + contents_overview += "No tasks found in this PR\n" + + print(f"contents_overview: {contents_overview}") + return contents_overview + + @log_function_entry_exit() + def _create_pull_request(self, feature_branch_name: str, default_branch_name: str): + """ + Create a PR from the feature branch to the default branch + + Args: + feature_branch_name: name of the feature branch + default_branch_name: name of the default branch + """ + pr_title_format = self.config["github"]["grouped_pr_title"] + pr_body_format = self.config["github"]["grouped_pr_body"] + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + pr_url = f"https://github.com/{repo_name}/pull/{pr_number}" + seq_num = self._determine_sequence_number_from_pull_request_directory() + pr_title = pr_title_format.format( + cvmfs_repo=self.cvmfs_repo, + pr=pr_number, + repo=repo_name, + seq_num=seq_num, + ) + self._create_task_summary() + contents_overview = self._create_pr_contents_overview() + pr_body = pr_body_format.format( + cvmfs_repo=self.cvmfs_repo, + pr=pr_number, + pr_url=pr_url, + repo=repo_name, + seq_num=seq_num, + contents=contents_overview, + analysis="
TO BE DONE
", + action="
TO BE DONE
", + ) + pr = self.git_repo.create_pull( + title=pr_title, + body=pr_body, + head=feature_branch_name, + base=default_branch_name + ) + log_message(LoggingScope.TASK_OPS, "INFO", "PR created: '%s'", pr) + + @log_function_entry_exit() + def _update_pull_request(self, pull_request: PullRequest): + """ + Update the pull request + + Args: + pull_request: instance of the pull request + """ + # TODO: update sections (contents analysis, action) + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + pr_url = f"https://github.com/{repo_name}/pull/{pr_number}" + seq_num = self._determine_sequence_number_from_pull_request_directory() + + self._create_task_summary() + contents_overview = self._create_pr_contents_overview() + pr_body_format = self.config["github"]["grouped_pr_body"] + pr_body = pr_body_format.format( + cvmfs_repo=self.cvmfs_repo, + pr=pr_number, + pr_url=pr_url, + repo=repo_name, + seq_num=seq_num, + contents=contents_overview, + analysis="
TO BE DONE
", + action="
TO BE DONE
", + ) + pull_request.edit(body=pr_body) + + log_message(LoggingScope.TASK_OPS, "INFO", "PR updated: '%s'", pull_request) + + @log_function_entry_exit() + def _handle_add_payload_staged(self): + """Handler for ADD action in PAYLOAD_STAGED state""" + log_message(LoggingScope.TASK_OPS, "INFO", "Handling ADD action in PAYLOAD_STAGED state: '%s'", + self.description.get_task_file_name()) + next_state = self._next_state(EESSITaskState.PAYLOAD_STAGED) + approved_state = EESSITaskState.APPROVED + log_message(LoggingScope.TASK_OPS, "INFO", "next_state: '%s', approved_state: '%s'", next_state, approved_state) + + default_branch_name = self.git_repo.default_branch + default_branch = self._get_branch_from_name(default_branch_name) + default_sha = default_branch.commit.sha + feature_branch_name = self._determine_feature_branch_name() + feature_branch = self._get_branch_from_name(feature_branch_name) + if not feature_branch: + # feature branch does not exist + # TODO: could have been merged already --> check if PR corresponding to the feature branch exists + # ASSUME: it has not existed before --> create it + log_message(LoggingScope.TASK_OPS, "INFO", + "branch '%s' does not exist, creating it", feature_branch_name) + + feature_branch = self.git_repo.create_git_ref(f"refs/heads/{feature_branch_name}", default_sha) + log_message(LoggingScope.TASK_OPS, "INFO", + "branch '%s' created: '%s'", feature_branch_name, feature_branch) + else: + log_message(LoggingScope.TASK_OPS, "INFO", + "found existing branch for '%s': '%s'", feature_branch_name, feature_branch) + + pull_request = self._find_pr_for_branch(feature_branch_name) + if not pull_request: + log_message(LoggingScope.TASK_OPS, "INFO", + "no PR found for branch '%s'", feature_branch_name) + + # TODO: add failure handling (capture result and act on it) + self._update_task_states(next_state, default_branch_name, approved_state, feature_branch_name) + + # TODO: add failure handling (capture result and act on it) + self._create_pull_request(feature_branch_name, default_branch_name) + + return EESSITaskState.PULL_REQUEST + else: + log_message(LoggingScope.TASK_OPS, "INFO", + "found existing PR for branch '%s': '%s'", feature_branch_name, pull_request) + # TODO: check if PR is open or closed + if pull_request.state == "closed": + log_message(LoggingScope.TASK_OPS, "INFO", + "PR '%s' is closed, creating issue", pull_request) + # TODO: create issue + return EESSITaskState.PAYLOAD_STAGED + else: + log_message(LoggingScope.TASK_OPS, "INFO", + "PR '%s' is open, updating task states", pull_request) + # TODO: add failure handling (capture result and act on it) + # THINK about what a failure would mean and what to do about it. 
+ self._update_task_states(next_state, default_branch_name, approved_state, feature_branch_name) + + # TODO: add failure handling (capture result and act on it) + self._update_pull_request(pull_request) + + return EESSITaskState.PULL_REQUEST + + @log_function_entry_exit() + def _handle_add_pull_request(self): + """Handler for ADD action in PULL_REQUEST state""" + log_message(LoggingScope.TASK_OPS, "INFO", "Handling ADD action in PULL_REQUEST state: '%s'", + self.description.get_task_file_name()) + # Implementation for adding in PULL_REQUEST state + # we got here because the state of the task is PULL_REQUEST in the default branch + # determine branch and PR and state of PR + # PR is open --> just return EESSITaskState.PULL_REQUEST + # PR is closed & merged --> deployment is approved + # PR is closed & not merged --> deployment is rejected + feature_branch_name = self._determine_feature_branch_name() + # TODO: check if feature branch exists, for now ASSUME it does + pull_request = self._find_pr_for_branch(feature_branch_name) + if pull_request: + log_message(LoggingScope.TASK_OPS, "INFO", + "found PR for branch '%s': '%s'", feature_branch_name, pull_request) + if pull_request.state == "closed": + if pull_request.merged: + log_message(LoggingScope.TASK_OPS, "INFO", + "PR '%s' is closed and merged, returning APPROVED state", pull_request) + # TODO: How could we ended up here? state in default branch is PULL_REQUEST but + # PR is merged, hence it should have been in the APPROVED state + # ==> for now, just return EESSITaskState.PULL_REQUEST + # + # there is the possibility that the PR was updated just before the + # PR was merged + # WHY is it a problem? because a task may have been accepted that wouldn't + # have been accepted or worse shouldn't been accepted + # WHAT to do? ACCEPT/IGNORE THE ISSUE FOR NOw + # HOWEVER, the contents of the PR directory may be inconsistent with + # respect to the TaskState file and missing TaskSummary.html file + # WE could create an issue and only return EESSITaskState.APPROVED if the + # issue is closed + # WE could also defer all handling of this to the handler for the + # APPROVED state + # NOPE, we have to do some handling here, at least for the tasks where their + # state file did + # --> check if we could have ended up here? If so, create an issue. + # Do we need a state ISSUE_OPENED to avoid processing the task again? + return EESSITaskState.PULL_REQUEST + else: + log_message(LoggingScope.TASK_OPS, "INFO", + "PR '%s' is closed and not merged, returning REJECTED state", pull_request) + # TODO: there is the possibility that the PR was updated just before the + # PR was closed + # WHY is it a problem? because a task may have been rejected that wouldn't + # have been rejected or worse shouldn't been rejected + # WHAT to do? 
ACCEPT/IGNORE THE ISSUE FOR NOw + # HOWEVER, the contents of the PR directory may be inconsistent with + # respect to the TaskState file and missing TaskSummary.html file + # WE could create an issue and only return EESSITaskState.REJECTED if the + # issue is closed + # WE could also defer all handling of this to the handler for the + # REJECTED state + # FOR NOW, we assume that the task was rejected on purpose + # we need to change the state of the task in the default branch to REJECTED + self._update_task_state_file(EESSITaskState.REJECTED) + return EESSITaskState.REJECTED + else: + log_message(LoggingScope.TASK_OPS, "INFO", + "PR '%s' is open, returning PULL_REQUEST state", pull_request) + return EESSITaskState.PULL_REQUEST + else: + log_message(LoggingScope.TASK_OPS, "INFO", + "no PR found for branch '%s'", feature_branch_name) + # the method was called because the state of the task is PULL_REQUEST in the default branch + # however, it's weird that the PR was not found for the feature branch + # TODO: may create or update an issue for the task or deployment + return EESSITaskState.PULL_REQUEST + + return EESSITaskState.PULL_REQUEST + + @log_function_entry_exit() + def _perform_task_action(self) -> bool: + """Perform the task action""" + # TODO: support other actions than ADD + if self.action == EESSITaskAction.ADD: + return self._perform_task_add() + else: + raise ValueError(f"Task action '{self.action}' not supported (yet)") + + @log_function_entry_exit() + def _issue_exists(self, title: str, state: str = "open") -> bool: + """ + Check if an issue with the given title and state already exists. + """ + issues = self.git_repo.get_issues(state=state) + for issue in issues: + if issue.title == title and issue.state == state: + return True + else: + return False + + @log_function_entry_exit() + def _perform_task_add(self) -> bool: + """Perform the ADD task action""" + # TODO: verify checksum here or before? 
+ script = self.config["paths"]["ingestion_script"] + sudo = ["sudo"] if self.config["cvmfs"].getboolean("ingest_as_root", True) else [] + log_message(LoggingScope.STATE_OPS, "INFO", + "Running the ingestion script for '%s'...\n with script: '%s'\n with sudo: '%s'", + self.description.get_task_file_name(), + script, "no" if sudo == [] else "yes") + ingest_cmd = subprocess.run( + sudo + [script, self.cvmfs_repo, str(self.payload.payload_object.local_file_path)], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + log_message(LoggingScope.STATE_OPS, "INFO", + "Ingestion script returned code '%s'", ingest_cmd.returncode) + log_message(LoggingScope.STATE_OPS, "INFO", + "Ingestion script stdout: '%s'", ingest_cmd.stdout.decode("UTF-8")) + log_message(LoggingScope.STATE_OPS, "INFO", + "Ingestion script stderr: '%s'", ingest_cmd.stderr.decode("UTF-8")) + if ingest_cmd.returncode == 0: + next_state = self._next_state(EESSITaskState.APPROVED) + self._update_task_state_file(next_state) + if self.config.has_section("slack") and self.config["slack"].getboolean("ingestion_notification", False): + send_slack_message( + self.config["secrets"]["slack_webhook"], + self.config["slack"]["ingestion_message"].format( + tarball=os.path.basename(self.payload.payload_object.local_file_path), + cvmfs_repo=self.cvmfs_repo) + ) + return True + else: + tarball = os.path.basename(self.payload.payload_object.local_file_path) + log_message(LoggingScope.STATE_OPS, "ERROR", + "Failed to add '%s', return code '%s'", + tarball, + ingest_cmd.returncode) + + issue_title = f"Failed to add '{tarball}'" + log_message(LoggingScope.STATE_OPS, "INFO", + "Creating issue for failed ingestion: title: '%s'", + issue_title) + + command = " ".join(ingest_cmd.args) + failed_ingestion_issue_body = self.config["github"]["failed_ingestion_issue_body"] + issue_body = failed_ingestion_issue_body.format( + command=command, + tarball=tarball, + return_code=ingest_cmd.returncode, + stdout=ingest_cmd.stdout.decode("UTF-8"), + stderr=ingest_cmd.stderr.decode("UTF-8") + ) + log_message(LoggingScope.STATE_OPS, "INFO", + "Creating issue for failed ingestion: body: '%s'", + issue_body) + + if self._issue_exists(issue_title, state="open"): + log_message(LoggingScope.STATE_OPS, "INFO", + "Failed to add '%s', but an open issue already exists, skipping...", + os.path.basename(self.payload.payload_object.local_file_path)) + else: + log_message(LoggingScope.STATE_OPS, "INFO", + "Failed to add '%s', but an open issue does not exist, creating one...", + os.path.basename(self.payload.payload_object.local_file_path)) + self.git_repo.create_issue(title=issue_title, body=issue_body) + return False + + @log_function_entry_exit() + def _handle_add_approved(self): + """Handler for ADD action in APPROVED state""" + log_message(LoggingScope.TASK_OPS, "INFO", "Handling ADD action in APPROVED state: '%s'", + self.description.get_task_file_name()) + # Implementation for adding in APPROVED state + # If successful, _perform_task_action() will change the state + # to INGESTED on GitHub + try: + if self._perform_task_action(): + return EESSITaskState.INGESTED + else: + return EESSITaskState.APPROVED + except Exception as err: + log_message(LoggingScope.TASK_OPS, "ERROR", + "Error performing task action: '%s'\nTraceback:\n%s", err, traceback.format_exc()) + return EESSITaskState.APPROVED + + @log_function_entry_exit() + def _handle_add_ingested(self): + """Handler for ADD action in INGESTED state""" + log_message(LoggingScope.TASK_OPS, "INFO", "Handling ADD action in 
INGESTED state: '%s'", + self.description.get_task_file_name()) + # Implementation for adding in INGESTED state + # DONT change state on GitHub, because the result + # (INGESTED/REJECTED) would be overwritten + return EESSITaskState.DONE + + @log_function_entry_exit() + def _handle_add_rejected(self): + """Handler for ADD action in REJECTED state""" + log_message(LoggingScope.TASK_OPS, "INFO", "Handling ADD action in REJECTED state: '%s'", + self.description.get_task_file_name()) + # Implementation for adding in REJECTED state + # DONT change state on GitHub, because the result + # (INGESTED/REJECTED) would be overwritten + return EESSITaskState.DONE + + @log_function_entry_exit() + def __str__(self): + return f"EESSITask(description={self.description}, action={self.action}, state={self.determine_state()})" diff --git a/scripts/automated_ingestion/ingest_bundles.py b/scripts/automated_ingestion/ingest_bundles.py index 0662fc83..15fb416c 100644 --- a/scripts/automated_ingestion/ingest_bundles.py +++ b/scripts/automated_ingestion/ingest_bundles.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 from eessi_data_object import EESSIDataAndSignatureObject -# from eessi_task import EESSITask, TaskState +from eessi_task import EESSITask from eessi_task_description import EESSITaskDescription from eessi_s3_bucket import EESSIS3Bucket from eessi_logging import error, log_function_entry_exit, log_message, LoggingScope, LOG_LEVELS, set_logging_scopes @@ -10,10 +10,9 @@ import argparse import configparser -# import github +import github import json import logging -# import os import sys from pathlib import Path from typing import List @@ -152,8 +151,8 @@ def main(): # TODO: check configuration: secrets, paths, permissions on dirs, etc extensions = args.extensions.split(",") - # gh_pat = config["secrets"]["github_pat"] - # gh_staging_repo = github.Github(gh_pat).get_repo(config["github"]["staging_repo"]) + gh_pat = config["secrets"]["github_pat"] + gh_staging_repo = github.Github(gh_pat).get_repo(config["github"]["staging_repo"]) buckets = json.loads(config["aws"]["staging_buckets"]) for bucket, cvmfs_repo in buckets.items(): @@ -171,21 +170,20 @@ def main(): log_message(LoggingScope.GROUP_OPS, "INFO", "Processing task: '%s'", task_path) try: - _ = EESSITaskDescription(EESSIDataAndSignatureObject(config, task_path, s3_bucket)) -# # create EESSITask for the task file -# try: -# task = EESSITask( -# EESSITaskDescription(EESSIDataAndSignatureObject(config, task_path, s3_bucket)), -# config, cvmfs_repo, gh_staging_repo -# ) -# -# except Exception as err: -# log_message(LoggingScope.ERROR, "ERROR", "Failed to create EESSITask for task %s: %s", -# task_path, str(err)) -# continue -# -# log_message(LoggingScope.GROUP_OPS, "INFO", "Task: %s", task) -# + # create EESSITask for the task file + try: + task = EESSITask( + EESSITaskDescription(EESSIDataAndSignatureObject(config, task_path, s3_bucket)), + config, cvmfs_repo, gh_staging_repo + ) + + except Exception as err: + log_message(LoggingScope.ERROR, "ERROR", "Failed to create EESSITask for task '%s': '%s'", + task_path, str(err)) + continue + + log_message(LoggingScope.GROUP_OPS, "INFO", "Task: %s", task) + # previous_state = None # current_state = task.determine_state() # log_message(LoggingScope.GROUP_OPS, "INFO", "Task '%s' is in state '%s'", From 530ac6dd769d121b070ca25127141dfcabc3614f Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 20:33:08 +0200 Subject: [PATCH 19/26] add TASK_OPS_DETAILS log scope and reduce log noise --- 
scripts/automated_ingestion/eessi_logging.py | 21 +-- scripts/automated_ingestion/eessi_task.py | 120 +++++++++--------- scripts/automated_ingestion/ingest_bundles.py | 2 +- 3 files changed, 75 insertions(+), 68 deletions(-) diff --git a/scripts/automated_ingestion/eessi_logging.py b/scripts/automated_ingestion/eessi_logging.py index 857d92ca..f94c8495 100644 --- a/scripts/automated_ingestion/eessi_logging.py +++ b/scripts/automated_ingestion/eessi_logging.py @@ -21,17 +21,18 @@ class LoggingScope(IntFlag): """Enumeration of different logging scopes.""" NONE = 0 - FUNC_ENTRY_EXIT = auto() # Function entry/exit logging - DOWNLOAD = auto() # Logging related to file downloads - VERIFICATION = auto() # Logging related to signature and checksum verification - STATE_OPS = auto() # Logging related to tarball state operations - GITHUB_OPS = auto() # Logging related to GitHub operations (PRs, issues, etc.) - GROUP_OPS = auto() # Logging related to tarball group operations - TASK_OPS = auto() # Logging related to task operations - ERROR = auto() # Error logging (separate from other scopes for easier filtering) - DEBUG = auto() # Debug-level logging (separate from other scopes for easier filtering) + FUNC_ENTRY_EXIT = auto() # Function entry/exit logging + DOWNLOAD = auto() # Logging related to file downloads + VERIFICATION = auto() # Logging related to signature and checksum verification + STATE_OPS = auto() # Logging related to tarball state operations + GITHUB_OPS = auto() # Logging related to GitHub operations (PRs, issues, etc.) + GROUP_OPS = auto() # Logging related to tarball group operations + TASK_OPS = auto() # Logging related to task operations + TASK_OPS_DETAILS = auto() # Logging related to task operations (detailed) + ERROR = auto() # Error logging (separate from other scopes for easier filtering) + DEBUG = auto() # Debug-level logging (separate from other scopes for easier filtering) ALL = (FUNC_ENTRY_EXIT | DOWNLOAD | VERIFICATION | STATE_OPS | - GITHUB_OPS | GROUP_OPS | TASK_OPS | ERROR | DEBUG) + GITHUB_OPS | GROUP_OPS | TASK_OPS | TASK_OPS_DETAILS | ERROR | DEBUG) # Global setting for logging scopes diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index f2369db6..da7e49e6 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -34,15 +34,15 @@ class EESSITaskState(Enum): def from_string( cls, name: str, default: Optional["EESSITaskState"] = None, case_sensitive: bool = False ) -> "EESSITaskState": - log_message(LoggingScope.TASK_OPS, "INFO", "from_string: '%s'", name) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "from_string: '%s'", name) if case_sensitive: to_return = cls.__members__.get(name, default) - log_message(LoggingScope.TASK_OPS, "INFO", "from_string will return: '%s'", to_return) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "from_string will return: '%s'", to_return) return to_return try: to_return = cls[name.upper()] - log_message(LoggingScope.TASK_OPS, "INFO", "from_string will return: '%s'", to_return) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "from_string will return: '%s'", to_return) return to_return except KeyError: return default @@ -134,32 +134,32 @@ def _state_file_with_prefix_exists_in_repo_branch(self, file_path_prefix: str, b # get all files in directory part of file_path_prefix directory_part = os.path.dirname(file_path_prefix) files = self.git_repo.get_contents(directory_part, ref=branch_name) - log_msg = "Found files %s in 
directory %s in branch %s" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, files, directory_part, branch_name) + log_msg = "Found files '%s' in directory '%s' in branch '%s'" + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", log_msg, files, directory_part, branch_name) # check if any of the files has file_path_prefix as prefix for file in files: if file.path.startswith(file_path_prefix): - log_msg = "Found file %s in directory %s in branch %s" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file.path, directory_part, branch_name) + log_msg = "Found file '%s' in directory '%s' in branch '%s'" + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", log_msg, file.path, directory_part, branch_name) return True - log_msg = "No file with prefix %s found in directory %s in branch %s" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file_path_prefix, directory_part, branch_name) + log_msg = "No file with prefix '%s' found in directory '%s' in branch '%s'" + log_message(LoggingScope.TASK_OPS, "INFO", log_msg, file_path_prefix, directory_part, branch_name) return False except UnknownObjectException: # file_path does not exist in branch - log_msg = "Directory %s or file with prefix %s does not exist in branch %s" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, directory_part, file_path_prefix, branch_name) + log_msg = "Directory '%s' or file with prefix '%s' does not exist in branch '%s'" + log_message(LoggingScope.TASK_OPS, "INFO", log_msg, directory_part, file_path_prefix, branch_name) return False except GithubException as err: if err.status == 404: # file_path does not exist in branch - log_msg = "Directory %s or file with prefix %s does not exist in branch %s" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, directory_part, file_path_prefix, branch_name) + log_msg = "Directory '%s' or file with prefix '%s' does not exist in branch '%s'" + log_message(LoggingScope.TASK_OPS, "INFO", log_msg, directory_part, file_path_prefix, branch_name) return False else: # if there was some other (e.g. connection) issue, log message and return False - log_msg = 'Unable to determine the state of %s, the GitHub API returned status %s!' - log_message(LoggingScope.ERROR, 'WARNING', log_msg, self.object, err.status) + log_msg = "Unable to determine the state of '%s', the GitHub API returned status '%s'!" + log_message(LoggingScope.ERROR, "WARNING", log_msg, self.object, err.status) return False return False @@ -323,23 +323,24 @@ def _find_state(self) -> EESSITaskState: The state of the task. 
""" # obtain repo and pr from metadata - log_message(LoggingScope.TASK_OPS, "INFO", "finding state of task '%s'", self.description.task_object) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "finding state of task '%s'", self.description.task_object) repo = self.description.get_repo_name() pr = self.description.get_pr_number() - log_message(LoggingScope.TASK_OPS, "INFO", "repo: '%s', pr: '%s'", repo, pr) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "repo: '%s', pr: '%s'", repo, pr) # obtain all sequence numbers in repo/pr dir which include a state file for this task sequence_numbers = self._determine_sequence_numbers_including_task_file(repo, pr) if len(sequence_numbers) == 0: # no sequence numbers found, so we return NEW_TASK - log_message(LoggingScope.TASK_OPS, "INFO", "no sequence numbers found, state: NEW_TASK") + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "no sequence numbers found, state: NEW_TASK") return EESSITaskState.NEW_TASK # we got at least one sequence number # if one value for a sequence number is True, we can determine the state from the file in the directory sequence_including_task = [key for key, value in sequence_numbers.items() if value is True] if len(sequence_including_task) == 0: # no sequence number includes the task file, so we return NEW_TASK - log_message(LoggingScope.TASK_OPS, "INFO", "no sequence number includes the task file, state: NEW_TASK") + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", + "no sequence number includes the task file, state: NEW_TASK") return EESSITaskState.NEW_TASK # we got at least one sequence number which includes the task file # we can determine the state from the filename in the directory @@ -349,7 +350,7 @@ def _find_state(self) -> EESSITaskState: task_file_name = self.description.get_task_file_name() metadata_file_state_path_prefix = f"{repo}/{pr}/{sequence_number}/{task_file_name}." 
state = self._get_state_for_metadata_file_prefix(metadata_file_state_path_prefix, sequence_number) - log_message(LoggingScope.TASK_OPS, "INFO", "state: '%s'", state) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "state: '%s'", state) return state @log_function_entry_exit() @@ -377,7 +378,7 @@ def _get_state_for_metadata_file_prefix(self, metadata_file_state_path_prefix: s if file.path.startswith(metadata_file_state_path_prefix): # get state from file name taking only the suffix state = EESSITaskState.from_string(file.name.split(".")[-1]) - log_message(LoggingScope.TASK_OPS, "INFO", "state: '%s'", state) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "state: '%s'", state) states.append(state) if len(states) == 0: # did not find any file with metadata_file_state_path_prefix as prefix @@ -387,7 +388,7 @@ def _get_state_for_metadata_file_prefix(self, metadata_file_state_path_prefix: s # sort the states and return the last one states.sort() state = states[-1] - log_message(LoggingScope.TASK_OPS, "INFO", "state: '%s'", state) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "state: '%s'", state) return state @log_function_entry_exit() @@ -398,7 +399,7 @@ def _list_directory_contents(self, directory_path: str, branch_name: str = None) try: # Get contents of the directory branch_name = self.git_repo.default_branch if branch_name is None else branch_name - log_message(LoggingScope.TASK_OPS, "INFO", "listing contents of '%s' in branch '%s'", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "listing contents of '%s' in branch '%s'", directory_path, branch_name) contents = self.git_repo.get_contents(directory_path, ref=branch_name) @@ -524,15 +525,15 @@ def determine_state(self, branch: str = None) -> EESSITaskState: branch_to_use = self.git_repo.default_branch if branch is None else branch if self._path_exists_in_branch(task_pointer_file, branch_name=branch_to_use): - log_message(LoggingScope.TASK_OPS, "INFO", "path '%s' exists in branch '%s'", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "path '%s' exists in branch '%s'", task_pointer_file, branch_to_use) # get state from task file in branch to use # - read the EESSITaskState file in pull request directory pull_request_dir = self._determine_pull_request_dir(branch_name=branch_to_use) - log_message(LoggingScope.TASK_OPS, "INFO", "pull request directory: '%s'", pull_request_dir) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "pull request directory: '%s'", pull_request_dir) task_state_file_path = f"{pull_request_dir}/TaskState" - log_message(LoggingScope.TASK_OPS, "INFO", "task state file path: '%s'", task_state_file_path) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "task state file path: '%s'", task_state_file_path) task_state = self._read_task_state_from_file(task_state_file_path, branch_to_use) log_message(LoggingScope.TASK_OPS, "INFO", "task state in branch '%s': %s", @@ -573,7 +574,7 @@ def _safe_create_file(self, path: str, message: str, content: str, branch_name: try: branch_name = self.git_repo.default_branch if branch_name is None else branch_name existing_file = self.git_repo.get_contents(path, ref=branch_name) - log_message(LoggingScope.TASK_OPS, "INFO", "File '%s' already exists", path) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "File '%s' already exists", path) return existing_file except GithubException as err: if err.status == 404: # File doesn't exist @@ -656,7 +657,7 @@ def _update_file( branch=branch_name ) - log_message(LoggingScope.TASK_OPS, "INFO", + 
log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "File updated successfully. Commit SHA: '%s'", result["commit"].sha) return result @@ -700,18 +701,20 @@ def _determine_sequence_number(self) -> int: """Determine the sequence number for the task""" sequence_numbers = self._sorted_list_of_sequence_numbers() - log_message(LoggingScope.TASK_OPS, "INFO", "number of sequence numbers: %d", len(sequence_numbers)) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "number of sequence numbers: %d", len(sequence_numbers)) if len(sequence_numbers) == 0: + log_message(LoggingScope.TASK_OPS, "INFO", "no sequence numbers found, returning 0") return 0 - log_message(LoggingScope.TASK_OPS, "INFO", "sequence numbers: [%s]", ", ".join(map(str, sequence_numbers))) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", + "sequence numbers: [%s]", ", ".join(map(str, sequence_numbers))) # get the highest sequence number highest_sequence_number = sequence_numbers[-1] - log_message(LoggingScope.TASK_OPS, "INFO", "highest sequence number: %d", highest_sequence_number) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "highest sequence number: %d", highest_sequence_number) pull_request = self._find_pr_for_sequence_number(highest_sequence_number) - log_message(LoggingScope.TASK_OPS, "INFO", "pull request: '%s'", pull_request) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "pull request: '%s'", pull_request) if pull_request is None: log_message(LoggingScope.TASK_OPS, "INFO", "Did not find pull request for sequence number %d", @@ -773,7 +776,7 @@ def _handle_add_undetermined(self): f"new task for {repo_name} PR {pr_number} seq {sequence_number}", branch_name=branch_name ) - log_message(LoggingScope.TASK_OPS, "INFO", "commit created: '%s'", commit) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "commit created: '%s'", commit) except Exception as err: log_message(LoggingScope.TASK_OPS, "ERROR", "Error creating commit: '%s'", err) # TODO: rollback previous changes (task description file, task state file) @@ -803,12 +806,12 @@ def _update_task_state_file(self, next_state: EESSITaskState, branch_name: str = def _init_payload_object(self): """Initialize the payload object""" if self.payload is not None: - log_message(LoggingScope.TASK_OPS, "INFO", "payload object already initialized") + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "payload object already initialized") return # get name of of payload from metadata payload_name = self.description.metadata["payload"]["filename"] - log_message(LoggingScope.TASK_OPS, "INFO", "payload_name: '%s'", payload_name) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "payload_name: '%s'", payload_name) # get config and remote_client from self.description.task_object config = self.description.task_object.config @@ -818,7 +821,7 @@ def _init_payload_object(self): # with payload_name description_remote_file_path = self.description.task_object.remote_file_path payload_remote_file_path = os.path.join(os.path.dirname(description_remote_file_path), payload_name) - log_message(LoggingScope.TASK_OPS, "INFO", "payload_remote_file_path: '%s'", payload_remote_file_path) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "payload_remote_file_path: '%s'", payload_remote_file_path) # initialize payload object payload_object = EESSIDataAndSignatureObject(config, payload_remote_file_path, remote_client) @@ -832,7 +835,7 @@ def _handle_add_new_task(self): self.description.get_task_file_name()) # determine next state next_state = self._next_state(EESSITaskState.NEW_TASK) - 
log_message(LoggingScope.TASK_OPS, "INFO", "next_state: '%s'", next_state) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "next_state: '%s'", next_state) # initialize payload object self._init_payload_object() @@ -859,9 +862,10 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: try: prs = [pr for pr in list(self.git_repo.get_pulls(state="all")) if pr.head.ref == branch_name] - log_message(LoggingScope.TASK_OPS, "INFO", "number of PRs found: %d", len(prs)) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "number of PRs found: %d", len(prs)) if len(prs): - log_message(LoggingScope.TASK_OPS, "INFO", "1st PR found: %d, '%s'", prs[0].number, prs[0].head.ref) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", + "1st PR found: %d, '%s'", prs[0].number, prs[0].head.ref) return prs[0] if prs else None except Exception as err: log_message(LoggingScope.TASK_OPS, "ERROR", "Error finding PR for branch '%s': '%s'", branch_name, err) @@ -881,20 +885,20 @@ def _find_pr_for_sequence_number(self, sequence_number: int) -> Optional[PullReq else: head_ref_wout_seq_num = feature_branch_name - log_message(LoggingScope.TASK_OPS, "INFO", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "searching for PRs whose head_ref starts with: '%s'", head_ref_wout_seq_num) all_prs = [pr for pr in list(self.git_repo.get_pulls(state="all")) if pr.head.ref.startswith(head_ref_wout_seq_num)] - log_message(LoggingScope.TASK_OPS, "INFO", " number of PRs found: %d", len(all_prs)) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", " number of PRs found: %d", len(all_prs)) for pr in all_prs: - log_message(LoggingScope.TASK_OPS, "INFO", " PR #%d: '%s'", pr.number, pr.head.ref) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", " PR #%d: '%s'", pr.number, pr.head.ref) # now, find the PR for the feature branch name (if any) - log_message(LoggingScope.TASK_OPS, "INFO", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "searching PR for feature branch name: '%s'", feature_branch_name) pull_request = self._find_pr_for_branch(feature_branch_name) - log_message(LoggingScope.TASK_OPS, "INFO", "pull request for branch '%s': '%s'", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "pull request for branch '%s': '%s'", feature_branch_name, pull_request) return pull_request @@ -1002,7 +1006,8 @@ def _create_task_summary(self) -> str: # check if task summary file already exists in repo on GitHub if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): - log_message(LoggingScope.TASK_OPS, "INFO", "task summary file already exists: '%s'", task_summary_file_path) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", + "task summary file already exists: '%s'", task_summary_file_path) task_summary = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) # return task_summary.decoded_content return task_summary @@ -1024,7 +1029,7 @@ def _create_task_summary(self) -> str: commit_message = f"create summary for {task_file_name} in {feature_branch_name}" self._safe_create_file(task_summary_file_path, commit_message, task_summary, branch_name=feature_branch_name) - log_message(LoggingScope.TASK_OPS, "INFO", "task summary file created: '%s'", task_summary_file_path) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "task summary file created: '%s'", task_summary_file_path) # return task summary return task_summary @@ -1135,7 +1140,8 @@ def _handle_add_payload_staged(self): self.description.get_task_file_name()) next_state = 
self._next_state(EESSITaskState.PAYLOAD_STAGED) approved_state = EESSITaskState.APPROVED - log_message(LoggingScope.TASK_OPS, "INFO", "next_state: '%s', approved_state: '%s'", next_state, approved_state) + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", + "next_state: '%s', approved_state: '%s'", next_state, approved_state) default_branch_name = self.git_repo.default_branch default_branch = self._get_branch_from_name(default_branch_name) @@ -1146,19 +1152,19 @@ def _handle_add_payload_staged(self): # feature branch does not exist # TODO: could have been merged already --> check if PR corresponding to the feature branch exists # ASSUME: it has not existed before --> create it - log_message(LoggingScope.TASK_OPS, "INFO", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "branch '%s' does not exist, creating it", feature_branch_name) feature_branch = self.git_repo.create_git_ref(f"refs/heads/{feature_branch_name}", default_sha) - log_message(LoggingScope.TASK_OPS, "INFO", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "branch '%s' created: '%s'", feature_branch_name, feature_branch) else: - log_message(LoggingScope.TASK_OPS, "INFO", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "found existing branch for '%s': '%s'", feature_branch_name, feature_branch) pull_request = self._find_pr_for_branch(feature_branch_name) if not pull_request: - log_message(LoggingScope.TASK_OPS, "INFO", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "no PR found for branch '%s'", feature_branch_name) # TODO: add failure handling (capture result and act on it) @@ -1169,7 +1175,7 @@ def _handle_add_payload_staged(self): return EESSITaskState.PULL_REQUEST else: - log_message(LoggingScope.TASK_OPS, "INFO", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "found existing PR for branch '%s': '%s'", feature_branch_name, pull_request) # TODO: check if PR is open or closed if pull_request.state == "closed": @@ -1178,7 +1184,7 @@ def _handle_add_payload_staged(self): # TODO: create issue return EESSITaskState.PAYLOAD_STAGED else: - log_message(LoggingScope.TASK_OPS, "INFO", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "PR '%s' is open, updating task states", pull_request) # TODO: add failure handling (capture result and act on it) # THINK about what a failure would mean and what to do about it. @@ -1204,12 +1210,12 @@ def _handle_add_pull_request(self): # TODO: check if feature branch exists, for now ASSUME it does pull_request = self._find_pr_for_branch(feature_branch_name) if pull_request: - log_message(LoggingScope.TASK_OPS, "INFO", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "found PR for branch '%s': '%s'", feature_branch_name, pull_request) if pull_request.state == "closed": if pull_request.merged: log_message(LoggingScope.TASK_OPS, "INFO", - "PR '%s' is closed and merged, returning APPROVED state", pull_request) + "PR '%s' is closed and merged (strange that state is PULL_REQUEST)", pull_request) # TODO: How could we ended up here? 
state in default branch is PULL_REQUEST but # PR is merged, hence it should have been in the APPROVED state # ==> for now, just return EESSITaskState.PULL_REQUEST @@ -1249,11 +1255,11 @@ def _handle_add_pull_request(self): self._update_task_state_file(EESSITaskState.REJECTED) return EESSITaskState.REJECTED else: - log_message(LoggingScope.TASK_OPS, "INFO", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "PR '%s' is open, returning PULL_REQUEST state", pull_request) return EESSITaskState.PULL_REQUEST else: - log_message(LoggingScope.TASK_OPS, "INFO", + log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "no PR found for branch '%s'", feature_branch_name) # the method was called because the state of the task is PULL_REQUEST in the default branch # however, it's weird that the PR was not found for the feature branch diff --git a/scripts/automated_ingestion/ingest_bundles.py b/scripts/automated_ingestion/ingest_bundles.py index 15fb416c..b4df5f2f 100644 --- a/scripts/automated_ingestion/ingest_bundles.py +++ b/scripts/automated_ingestion/ingest_bundles.py @@ -182,7 +182,7 @@ def main(): task_path, str(err)) continue - log_message(LoggingScope.GROUP_OPS, "INFO", "Task: %s", task) + log_message(LoggingScope.GROUP_OPS, "INFO", "Created EESSITask: '%s'", task) # previous_state = None # current_state = task.determine_state() From a945ece21e6c6e8ec40a150d2f5a15700e072b00 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 2025 20:55:32 +0200 Subject: [PATCH 20/26] remove unused functions --- scripts/automated_ingestion/eessi_task.py | 205 ---------------------- 1 file changed, 205 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index da7e49e6..850c426a 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -219,178 +219,6 @@ def _determine_sequence_numbers_including_task_file(self, repo: str, pr: str) -> return {} return sequence_numbers - @log_function_entry_exit() - def _find_highest_number(self, str_list: List[str]) -> int: - """ - Find the highest number in a list of strings. - """ - # Convert all strings to integers - int_list = [int(num) for num in str_list] - return max(int_list) - - @log_function_entry_exit() - def _get_sequence_number_for_task_file(self) -> int: - """ - Get the sequence number this task is assigned to at the moment. - NOTE, should only be called if the task is actually assigned to a sequence number. - """ - repo_name = self.description.get_repo_name() - pr_number = self.description.get_pr_number() - sequence_numbers = self._determine_sequence_numbers_including_task_file(repo_name, pr_number) - if len(sequence_numbers) == 0: - raise ValueError("Found no sequence numbers at all") - else: - # get all entries with value True, there should be only one, so we return the first one - sequence_numbers_true = [key for key, value in sequence_numbers.items() if value is True] - if len(sequence_numbers_true) == 0: - raise ValueError("Found no sequence numbers that include the task file for task %s", - self.description) - else: - return sequence_numbers_true[0] - - @log_function_entry_exit() - def _get_current_sequence_number(self, sequence_numbers: Dict[int, bool] = None) -> int: - """ - Get the current sequence number based on the sequence numbers. - If sequence_numbers is not provided, we determine the sequence numbers from the task description. 
- """ - if sequence_numbers is None: - repo_name = self.description.get_repo_name() - pr_number = self.description.get_pr_number() - sequence_numbers = self._determine_sequence_numbers_including_task_file(repo_name, pr_number) - if len(sequence_numbers) == 0: - return 0 - return self._find_highest_number(sequence_numbers.keys()) - - @log_function_entry_exit() - def _get_fixed_sequence_number(self) -> int: - """ - Get a fixed sequence number. - """ - return 11 - - @log_function_entry_exit() - def _find_staging_pr(self) -> Tuple[Optional[PullRequest], Optional[str], Optional[int]]: - """ - Find the staging PR for the task. - TODO: arg sequence number --> make function simpler - """ - repo_name = self.description.get_repo_name() - pr_number = self.description.get_pr_number() - try: - sequence_number = self._get_sequence_number_for_task_file() - except ValueError: - # no sequence number found, so we return None - log_message(LoggingScope.ERROR, "ERROR", "no sequence number found for task '%s'", self.description) - return None, None, None - except Exception as err: - # some other error - log_message(LoggingScope.ERROR, "ERROR", "error finding staging PR for task '%s': '%s'", - self.description, err) - return None, None, None - branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" - if branch_name in [branch.name for branch in self.git_repo.get_branches()]: - find_pr = [pr for pr in self.git_repo.get_pulls(head=branch_name, state="all")] - if find_pr: - pr = find_pr.pop(0) - return pr, branch_name, sequence_number - else: - return None, branch_name, sequence_number - else: - return None, None, None - - @log_function_entry_exit() - def _create_staging_pr(self, sequence_number: int) -> Tuple[PullRequest, str]: - """ - Create a staging PR for the task. - NOTE, SHALL only be called if no staging PR for the task exists yet. - """ - repo_name = self.description.get_repo_name() - pr_number = self.description.get_pr_number() - branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" - default_branch_name = self.git_repo.default_branch - pr = self.git_repo.create_pull(title=f"Add task for {repo_name} PR {pr_number} seq {sequence_number}", - body=f"Add task for {repo_name} PR {pr_number} seq {sequence_number}", - head=branch_name, base=default_branch_name) - return pr, branch_name - - @log_function_entry_exit() - def _find_state(self) -> EESSITaskState: - """ - Determine the state of the task based on the task description metadata. - - Returns: - The state of the task. 
- """ - # obtain repo and pr from metadata - log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "finding state of task '%s'", self.description.task_object) - repo = self.description.get_repo_name() - pr = self.description.get_pr_number() - log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "repo: '%s', pr: '%s'", repo, pr) - - # obtain all sequence numbers in repo/pr dir which include a state file for this task - sequence_numbers = self._determine_sequence_numbers_including_task_file(repo, pr) - if len(sequence_numbers) == 0: - # no sequence numbers found, so we return NEW_TASK - log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "no sequence numbers found, state: NEW_TASK") - return EESSITaskState.NEW_TASK - # we got at least one sequence number - # if one value for a sequence number is True, we can determine the state from the file in the directory - sequence_including_task = [key for key, value in sequence_numbers.items() if value is True] - if len(sequence_including_task) == 0: - # no sequence number includes the task file, so we return NEW_TASK - log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", - "no sequence number includes the task file, state: NEW_TASK") - return EESSITaskState.NEW_TASK - # we got at least one sequence number which includes the task file - # we can determine the state from the filename in the directory - # NOTE, we use the first element in sequence_including_task (there should be only one) - # we ignore other elements in sequence_including_task - sequence_number = sequence_including_task[0] - task_file_name = self.description.get_task_file_name() - metadata_file_state_path_prefix = f"{repo}/{pr}/{sequence_number}/{task_file_name}." - state = self._get_state_for_metadata_file_prefix(metadata_file_state_path_prefix, sequence_number) - log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "state: '%s'", state) - return state - - @log_function_entry_exit() - def _get_state_for_metadata_file_prefix(self, metadata_file_state_path_prefix: str, - sequence_number: int) -> EESSITaskState: - """ - Get the state from the file in the metadata_file_state_path_prefix. 
- """ - # depending on the state of the deployment (NEW_TASK, PAYLOAD_STAGED, PULL_REQUEST, APPROVED, REJECTED, - # INGESTED, DONE) - # we need to check the task file in the default branch or in the branch corresponding to the sequence number - directory_part = os.path.dirname(metadata_file_state_path_prefix) - repo_name = self.description.get_repo_name() - pr_number = self.description.get_pr_number() - default_branch_name = self.git_repo.default_branch - branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" - all_branch_names = [branch.name for branch in self.git_repo.get_branches()] - states = [] - for branch in [default_branch_name, branch_name]: - if branch in all_branch_names: - # first get all files in directory part of metadata_file_state_path_prefix - files = self._list_directory_contents(directory_part, branch) - # check if any of the files has metadata_file_state_path_prefix as prefix - for file in files: - if file.path.startswith(metadata_file_state_path_prefix): - # get state from file name taking only the suffix - state = EESSITaskState.from_string(file.name.split(".")[-1]) - log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "state: '%s'", state) - states.append(state) - if len(states) == 0: - # did not find any file with metadata_file_state_path_prefix as prefix - log_message(LoggingScope.TASK_OPS, "INFO", "did not find any file with prefix '%s'", - metadata_file_state_path_prefix) - return EESSITaskState.NEW_TASK - # sort the states and return the last one - states.sort() - state = states[-1] - log_message(LoggingScope.TASK_OPS_DETAILS, "INFO", "state: '%s'", state) - return state - @log_function_entry_exit() def _list_directory_contents(self, directory_path: str, branch_name: str = None) -> List[Any]: """ @@ -920,39 +748,6 @@ def _determine_feature_branch_name(self) -> str: org, repo, pr, seq, _ = pull_request_dir.split("/") return f"{org}-{repo}-PR-{pr}-SEQ-{seq}" - @log_function_entry_exit() - def _sync_task_state_file(self, source_branch: str, target_branch: str): - """Update task state file from source to target branch""" - task_pointer_file = self.description.task_object.remote_file_path - pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, self.git_repo.default_branch) - task_state_file_path = f"{pull_request_dir}/TaskState" - - try: - # get content from source branch - source_content = self.git_repo.get_contents(task_state_file_path, ref=source_branch) - - # get current file in target branch - target_file = self.git_repo.get_contents(task_state_file_path, ref=target_branch) - - # update if content is different - if source_content.sha != target_file.sha: - result = self.git_repo.update_file( - path=task_state_file_path, - message=f"Sync {task_state_file_path} from {source_branch} to {target_branch}", - content=source_content.decoded_content, - sha=target_file.sha, - branch=target_branch - ) - log_message(LoggingScope.TASK_OPS, "INFO", "Updated '%s'", task_state_file_path) - return result - else: - log_message(LoggingScope.TASK_OPS, "INFO", "No changes needed for '%s'", task_state_file_path) - return None - - except Exception as err: - log_message(LoggingScope.TASK_OPS, "ERROR", "Error syncing task state file: '%s'", err) - return None - @log_function_entry_exit() def _update_task_states(self, next_state: EESSITaskState, default_branch_name: str, approved_state: EESSITaskState, feature_branch_name: str): From ea89ee0e9e01b86eb0a9c94eb58bab4a52c081ef Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 29 Jun 
2025 21:01:22 +0200 Subject: [PATCH 21/26] enable task handling --- scripts/automated_ingestion/ingest_bundles.py | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/scripts/automated_ingestion/ingest_bundles.py b/scripts/automated_ingestion/ingest_bundles.py index b4df5f2f..363d7060 100644 --- a/scripts/automated_ingestion/ingest_bundles.py +++ b/scripts/automated_ingestion/ingest_bundles.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 from eessi_data_object import EESSIDataAndSignatureObject -from eessi_task import EESSITask +from eessi_task import EESSITask, EESSITaskState from eessi_task_description import EESSITaskDescription from eessi_s3_bucket import EESSIS3Bucket from eessi_logging import error, log_function_entry_exit, log_message, LoggingScope, LOG_LEVELS, set_logging_scopes @@ -184,22 +184,22 @@ def main(): log_message(LoggingScope.GROUP_OPS, "INFO", "Created EESSITask: '%s'", task) -# previous_state = None -# current_state = task.determine_state() -# log_message(LoggingScope.GROUP_OPS, "INFO", "Task '%s' is in state '%s'", -# task_path, current_state.name) -# while (current_state is not None and -# current_state != TaskState.DONE and -# previous_state != current_state): -# previous_state = current_state -# log_message(LoggingScope.GROUP_OPS, "INFO", -# "Task '%s': BEFORE handle(): previous state = '%s', current state = '%s'", -# task_path, previous_state.name, current_state.name) -# current_state = task.handle() -# log_message(LoggingScope.GROUP_OPS, "INFO", -# "Task '%s': AFTER handle(): previous state = '%s', current state = '%s'", -# task_path, previous_state.name, current_state.name) -# + previous_state = None + current_state = task.determine_state() + log_message(LoggingScope.GROUP_OPS, "INFO", "Task '%s' is in state '%s'", + task_path, current_state.name) + while (current_state is not None and + current_state != EESSITaskState.DONE and + previous_state != current_state): + previous_state = current_state + log_message(LoggingScope.GROUP_OPS, "INFO", + "Task '%s': BEFORE handle(): previous state = '%s', current state = '%s'", + task_path, previous_state.name, current_state.name) + current_state = task.handle() + log_message(LoggingScope.GROUP_OPS, "INFO", + "Task '%s': AFTER handle(): previous state = '%s', current state = '%s'", + task_path, previous_state.name, current_state.name) + except Exception as err: log_message(LoggingScope.ERROR, "ERROR", "Failed to process task '%s': '%s'", task_path, str(err)) continue From 0bb3fc1a1473d22c59c688cc872fd2f8c412d606 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Fri, 4 Jul 2025 20:40:32 +0200 Subject: [PATCH 22/26] fix determining next sequence number when last PR was closed --- scripts/automated_ingestion/eessi_task.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 850c426a..34ed2b24 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -557,8 +557,13 @@ def _determine_sequence_number(self) -> int: # the PR is merged, so we use the next sequence number return highest_sequence_number + 1 else: - # the PR is not merged, so we can use the current sequence number - return highest_sequence_number + # the PR is not merged, it may be closed though + if pull_request.state == 'closed': + # PR has been closed, so we return the next sequence number + return highest_sequence_number + 1 + else: + # PR is not closed, so we return the 
current highest sequence number + return highest_sequence_number @log_function_entry_exit() def _handle_add_undetermined(self): From b1663f1c245285e3e1252eebec8f37069d8f45ab Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 6 Aug 2025 13:54:14 +0200 Subject: [PATCH 23/26] return action ADD if metadata file doesn't define it --- scripts/automated_ingestion/eessi_task.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 34ed2b24..bd2b2320 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -114,7 +114,9 @@ def _determine_task_action(self) -> EESSITaskAction: return EESSITaskAction.ADD elif action_str == "update": return EESSITaskAction.UPDATE - return EESSITaskAction.UNKNOWN + # temporarily return EESSITaskAction.ADD as default because the metadata + # file does not yet have an action defined yet + return EESSITaskAction.ADD @log_function_entry_exit() def _state_file_with_prefix_exists_in_repo_branch(self, file_path_prefix: str, branch_name: str = None) -> bool: From d7952d8dc006bc6ace0a3d3da85a0cff652e2ce2 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 6 Aug 2025 14:01:57 +0200 Subject: [PATCH 24/26] process reprod dirs when generating tarball overview --- .../automated_ingestion/eessi_task_payload.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py index fe0db162..26d56ef9 100644 --- a/scripts/automated_ingestion/eessi_task_payload.py +++ b/scripts/automated_ingestion/eessi_task_payload.py @@ -69,25 +69,31 @@ def analyse_contents(self, config: Dict) -> str: swdirs = [ # all directory names with the pattern: /software// member.path for member in members - if member.isdir() and PurePosixPath(member.path).match(os.path.join(prefix, "software", "*", "*")) + if member.isdir() and PurePosixPath(member.path).match(os.path.join(prefix, 'software', '*', '*')) ] modfiles = [ # all filenames with the pattern: /modules///*.lua member.path for member in members if member.isfile() - and PurePosixPath(member.path).match(os.path.join(prefix, "modules", "*", "*", "*.lua")) + and PurePosixPath(member.path).match(os.path.join(prefix, 'modules', '*', '*', '*.lua')) ] - other = [ # anything that is not in /software nor /modules + reprod_dirs = [ + member.path + for member in members + if member.isdir() and PurePosixPath(member.path).match(os.path.join(prefix, 'reprod', '*', '*', '*')) + ] + other = [ # anything that is not in /software nor /modules nor /reprod member.path for member in members if ( - not PurePosixPath(prefix).joinpath("software") in PurePosixPath(member.path).parents - and not PurePosixPath(prefix).joinpath("modules") in PurePosixPath(member.path).parents + not PurePosixPath(prefix).joinpath('software') in PurePosixPath(member.path).parents + and not PurePosixPath(prefix).joinpath('modules') in PurePosixPath(member.path).parents + and not PurePosixPath(prefix).joinpath('reprod') in PurePosixPath(member.path).parents ) # if not fnmatch.fnmatch(m.path, os.path.join(prefix, 'software', '*')) # and not fnmatch.fnmatch(m.path, os.path.join(prefix, 'modules', '*')) ] - members_list = sorted(swdirs + modfiles + other) + members_list = sorted(swdirs + modfiles + reprod_dirs + other) # construct the overview overview = config["github"]["task_summary_payload_overview_template"].format( 
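Illustrative note on the reprod matching added above (not part of the patches): PurePosixPath.match() with a relative pattern anchors on the right, so a tarball member only ends up in reprod_dirs when it sits exactly three levels below <prefix>/reprod. A small sketch with a made-up prefix:

    import os
    from pathlib import PurePosixPath

    prefix = "2023.06/software/linux/x86_64/generic"  # hypothetical prefix, for illustration only
    pattern = os.path.join(prefix, "reprod", "*", "*", "*")

    # three components below <prefix>/reprod -> matches
    PurePosixPath(prefix + "/reprod/GROMACS/2024.1/easybuild").match(pattern)   # True

    # too shallow -> does not match, so it would not be listed as a reprod dir
    PurePosixPath(prefix + "/reprod/GROMACS").match(pattern)                    # False
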
From 80897fc86a231103d4caa448c9c96bc46a4ab7ea Mon Sep 17 00:00:00 2001
From: Thomas Roeblitz
Date: Wed, 6 Aug 2025 14:05:31 +0200
Subject: [PATCH 25/26] remove unused type

---
 scripts/automated_ingestion/eessi_task.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index bd2b2320..86fbd8df 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -1,6 +1,6 @@
 from enum import Enum, auto
 from functools import total_ordering
-from typing import Dict, List, Tuple, Optional, Any
+from typing import Dict, List, Optional, Any
 import base64
 import os

From 76b3d3434cacce46336bc02df91874bd6aecf88d Mon Sep 17 00:00:00 2001
From: Thomas Roeblitz
Date: Fri, 8 Aug 2025 10:12:20 +0200
Subject: [PATCH 26/26] reduce limit to less than 3

---
 scripts/automated_ingestion/eessi_task_payload.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py
index 26d56ef9..112fcfd1 100644
--- a/scripts/automated_ingestion/eessi_task_payload.py
+++ b/scripts/automated_ingestion/eessi_task_payload.py
@@ -48,7 +48,11 @@ def analyse_contents(self, config: Dict) -> str:
         tar_num_members = len(members)
         paths = sorted([m.path for m in members])

-        if tar_num_members < 100:
+        # reduce the limit for a full listing from 100 to 3: the PR description can
+        # include tens of tarballs, so even 100 entries per tarball may be too many;
+        # a very small limit is still useful when a tarball contains only a handful
+        # of files, say an architecture-specific configuration file
+        if tar_num_members < 3:
             tar_members_desc = "Full listing of the contents of the tarball:"
             members_list = paths