Flash loader updated #118

Merged on Jun 16, 2023
Commits (38)
5a32f5b
flash module
zain-sohail Apr 26, 2023
4d31a0f
add test data
zain-sohail Apr 26, 2023
441ad14
fix pylint exception warning
zain-sohail Apr 26, 2023
4e0eeea
fix pylint exception warning
zain-sohail Apr 26, 2023
a5d86ed
some more error fixes
zain-sohail Apr 26, 2023
6ac4fa8
loader.py
zain-sohail May 1, 2023
be6a3cc
Create __init__.py
zain-sohail May 1, 2023
408308c
conflict resolved
zain-sohail May 23, 2023
8f14198
fix linting errors
zain-sohail May 4, 2023
59004ea
further linting errors
zain-sohail May 4, 2023
285d878
fix linting issues
rettigl May 5, 2023
67d1147
add config for loader tests
rettigl May 5, 2023
7c8ce1f
fix paths to work with github actions
rettigl May 5, 2023
4532380
- fixed the recursive parse file error
zain-sohail May 29, 2023
2e805c2
- add missing utils file
zain-sohail May 29, 2023
72728a1
parse_h5_keys moved to loader utils
zain-sohail Jun 1, 2023
40696b7
test fix
zain-sohail Jun 1, 2023
35a60f2
Metadata class and necessary user details now from config file
zain-sohail Jun 1, 2023
c9a8d49
docstrings and seperating paths in json
zain-sohail Jun 3, 2023
172f7a2
add runs as an identifier
zain-sohail Jun 5, 2023
e3ec795
remove casts
zain-sohail Jun 5, 2023
cdb686f
updated docstrings
zain-sohail Jun 5, 2023
97b8358
fix small bug with typehints
zain-sohail Jun 7, 2023
b1d53b2
simplify getting channels by format
zain-sohail Jun 7, 2023
7e221ff
metadata small fix
zain-sohail Jun 7, 2023
67de4dc
modified base loader and processor to support runs as loader option, …
rettigl Jun 9, 2023
5d694b4
working flash loader
rettigl Jun 9, 2023
a4c4091
linting and bugfixes
rettigl Jun 9, 2023
7ec4390
add dummy function to mpes and generic loaders
rettigl Jun 10, 2023
f9eb60e
add option for multiple folders to general loader infrastructure
rettigl Jun 12, 2023
75b7d81
Bugfix to allow again single run w/o list, and prevent provided folde…
rettigl Jun 13, 2023
7d5301d
add parametrized tests for loaders covering files, folders and runs
rettigl Jun 13, 2023
c1b1af1
add stripped down test data for FLASH reader
rettigl Jun 15, 2023
fcf2c92
add option for runs as int on runtime, and debug flash tests in guthu…
rettigl Jun 15, 2023
edef597
debug github actions
rettigl Jun 15, 2023
be19cbc
fix bug for single provided file, and remove debug info
rettigl Jun 15, 2023
43a8782
Merge pull request #127 from OpenCOMPES/runs_loader_option
rettigl Jun 15, 2023
e7d1cd2
Merge branch 'main' into flash-new
zain-sohail Jun 16, 2023
3 changes: 2 additions & 1 deletion .github/workflows/pylint.yml
@@ -20,7 +20,8 @@ jobs:
run: |
git lfs pull
python -m pip install --upgrade pip
python -m pip install pycodestyle pylint mypy types-PyYAML pytest coverage coveralls
pip install pycodestyle pylint mypy pytest types-PyYAML types-requests coverage coveralls
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Install package
run: |
python -m pip install .
226 changes: 113 additions & 113 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions pyproject.toml
@@ -28,13 +28,15 @@ threadpoolctl = "^3.1.0"
tifffile = ">=2022.2.9, <2023.0.0"
tqdm = "^4.62.3"
xarray = "^0.20.2"
joblib = "^1.2.0"
jupyter = {version = "^1.0.0", extras = ["notebook"], optional = true}
ipykernel = {version = "^6.9.1", extras = ["notebook"], optional = true}
sphinx = {version = ">4.4.0", extras = ["docs"], optional = true}
sphinx-rtd-theme = {version = ">1.0.0", extras = ["docs"], optional = true}
tomlkit = {version = ">0.10.0", extras = ["docs"], optional = true}
sphinx-autodoc-typehints = {version = ">1.17.0", extras = ["docs"], optional = true}


[tool.poetry.extras]
notebook = ["jupyter", "ipykernel"]
docs = ["Sphinx", "sphinx-rtd-theme", "tomlkit", "sphinx-autodoc-typehints"]
51 changes: 41 additions & 10 deletions sed/core/processor.py
@@ -60,11 +60,12 @@ def __init__(
dataframe: Union[pd.DataFrame, ddf.DataFrame] = None,
files: List[str] = None,
folder: str = None,
runs: Sequence[str] = None,
collect_metadata: bool = False,
**kwds,
):
"""Processor class of sed. Contains wrapper functions defining a work flow
for data correction, calibration and binning.
for data correction, calibration, and binning.

Args:
metadata (dict, optional): Dict of external Metadata. Defaults to None.
@@ -76,6 +77,8 @@
the config. Defaults to None.
folder (str, optional): Folder containing files to pass to the loader
defined in the config. Defaults to None.
runs (Sequence[str], optional): List of run identifiers to pass to the loader
defined in the config. Defaults to None.
collect_metadata (bool): Option to collect metadata from files.
Defaults to False.
**kwds: Keyword arguments passed to the reader.
@@ -131,12 +134,18 @@ def __init__(
self.use_copy_tool = False

# Load data if provided:
if dataframe is not None or files is not None or folder is not None:
if (
dataframe is not None
or files is not None
or folder is not None
or runs is not None
):
self.load(
dataframe=dataframe,
metadata=metadata,
files=files,
folder=folder,
runs=runs,
collect_metadata=collect_metadata,
**kwds,
)
@@ -257,6 +266,7 @@ def load(
metadata: dict = None,
files: List[str] = None,
folder: str = None,
runs: Sequence[str] = None,
collect_metadata: bool = False,
**kwds,
):
@@ -269,10 +279,10 @@
metadata (dict, optional): Dict of external Metadata. Defaults to None.
files (List[str], optional): List of file paths to pass to the loader.
Defaults to None.
runs (Sequence[str], optional): List of run identifiers to pass to the
loader. Defaults to None.
folder (str, optional): Folder path to pass to the loader.
Defaults to None.
collect_metadata (bool): Option to collect metadata from files.
Defaults to False.

Raises:
ValueError: Raised if no valid input is provided.
@@ -281,29 +291,50 @@
metadata = {}
if dataframe is not None:
self._dataframe = dataframe
elif runs is not None:
# If runs are provided, the copy tool is used only if a folder is also provided.
# In that case, the whole provided base folder tree is copied, and the copied
# version is passed to the loader as the base folder in which to look for the runs.
if folder is not None:
dataframe, metadata = self.loader.read_dataframe(
folders=cast(str, self.cpy(folder)),
runs=runs,
metadata=metadata,
collect_metadata=collect_metadata,
**kwds,
)
else:
dataframe, metadata = self.loader.read_dataframe(
runs=runs,
metadata=metadata,
collect_metadata=collect_metadata,
**kwds,
)

elif folder is not None:
dataframe, metadata = self.loader.read_dataframe(
folder=cast(str, self.cpy(folder)),
folders=cast(str, self.cpy(folder)),
metadata=metadata,
collect_metadata=collect_metadata,
**kwds,
)
self._dataframe = dataframe
self._files = self.loader.files

elif files is not None:
dataframe, metadata = self.loader.read_dataframe(
files=cast(List[str], self.cpy(files)),
metadata=metadata,
collect_metadata=collect_metadata,
**kwds,
)
self._dataframe = dataframe
self._files = self.loader.files

else:
raise ValueError(
"Either 'dataframe', 'files' or 'folder' needs to be privided!",
"Either 'dataframe', 'files', 'folder', or 'runs' needs to be provided!",
)

self._dataframe = dataframe
self._files = self.loader.files

for key in metadata:
self._attributes.add(
entry=metadata[key],
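The hunks above wire the new ``runs`` argument through the processor's constructor and ``load`` method. A minimal usage sketch of the resulting API, assuming the package's ``SedProcessor`` entry point and using placeholder values for the config path, run numbers, and folder (none of these values appear in the diff):

```python
# Hypothetical usage of the new runs-based loading path.
from sed import SedProcessor

sp = SedProcessor(
    config="flash_config.yaml",  # placeholder config selecting the loader
    runs=["44762", "44763"],     # run identifiers resolved to files by the loader
    folder="/path/to/raw",       # optional base folder; also enables the copy tool
    collect_metadata=False,
)
```

When both ``runs`` and ``folder`` are given, the folder tree is first copied via the copy tool, and the copied location is handed to the loader as the base folder in which to look for the runs.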
91 changes: 68 additions & 23 deletions sed/loader/base/loader.py
@@ -8,6 +8,7 @@
from typing import List
from typing import Sequence
from typing import Tuple
from typing import Union

import dask.dataframe as ddf
import numpy as np
@@ -40,34 +41,40 @@ def __init__(
self._config = config if config is not None else {}

self.files: List[str] = []
self.runs: List[str] = []
self.metadata: Dict[Any, Any] = {}

@abstractmethod
def read_dataframe(
self,
files: Sequence[str] = None,
folder: str = None,
files: Union[str, Sequence[str]] = None,
folders: Union[str, Sequence[str]] = None,
runs: Union[str, Sequence[str]] = None,
ftype: str = None,
metadata: dict = None,
collect_metadata: bool = False,
**kwds,
) -> Tuple[ddf.DataFrame, dict]:
"""Reads data from given files or folder and returns a dask dataframe
"""Reads data from given files, folder, or runs and returns a dask dataframe
and corresponding metadata.

Args:
files (Sequence[str], optional): List of file paths. Defaults to None.
folder (str, optional): Path to folder where files are stored. Path has
the priority such that if it's specified, the specified files will
be ignored. Defaults to None.
files (Union[str, Sequence[str]], optional): File path(s) to process.
Defaults to None.
folders (Union[str, Sequence[str]], optional): Path to folder(s) where files
are stored. Path has priority such that if it's specified, the specified
files will be ignored. Defaults to None.
runs (Union[str, Sequence[str]], optional): Run identifier(s). The corresponding
files will be located in the folder(s) provided by ``folders``. Takes
precedence over ``files`` and ``folders``. Defaults to None.
ftype (str, optional): File type to read ('parquet', 'json', 'csv', etc).
If a folder path is given, all files with the specified extension are
read into the dataframe in the reading order. Defaults to None.
metadata (dict, optional): Manual meta data dictionary. Auto-generated
meta data are added to it. Defaults to None.
metadata (dict, optional): Manual metadata dictionary. Auto-generated
metadata will be added to it. Defaults to None.
collect_metadata (bool): Option to collect metadata from files. Requires
a valid config dict. Defaults to False.
**kwds: keyword arguments. Se describtion in respective loader.
**kwds: keyword arguments. See description in respective loader.

Returns:
Tuple[ddf.DataFrame, dict]: Dask dataframe and metadata read from
@@ -77,31 +84,69 @@ def read_dataframe(
if metadata is None:
metadata = {}

if folder is not None:
folder = os.path.realpath(folder)
files = gather_files(
folder=folder,
extension=ftype,
file_sorting=True,
**kwds,
)
if runs is not None:
if isinstance(runs, (str, int)):
runs = [runs]
self.runs = list(runs)
files = []
for run in runs:
files.extend(self.get_files_from_run_id(run, folders, **kwds))

elif folders is not None:
if isinstance(folders, str):
folders = [folders]
files = []
for folder in folders:
folder = os.path.realpath(folder)
files.extend(
gather_files(
folder=folder,
extension=ftype,
file_sorting=True,
**kwds,
),
)

elif files is None:
raise ValueError(
"Either the folder or file path should be provided!",
"Either folder, file paths, or runs should be provided!",
)
else:
files = [os.path.realpath(file) for file in files]

self.files = files
if files is not None:
if isinstance(files, str):
files = [files]
files = [os.path.realpath(file) for file in files]
self.files = files

self.metadata = deepcopy(metadata)

if not files:
raise FileNotFoundError("No valid files found!")
raise FileNotFoundError("No valid files or runs found!")

return None, None

@abstractmethod
def get_files_from_run_id(
self,
run_id: str,
folders: Union[str, Sequence[str]] = None,
extension: str = None,
**kwds,
) -> List[str]:
"""Locate the files for a given run identifier.

Args:
run_id (str): The run identifier to locate.
folders (Union[str, Sequence[str]], optional): The directory(ies) where the raw
data is located. Defaults to None.
extension (str, optional): The file extension. Defaults to None.
**kwds: Keyword arguments.

Returns:
List[str]: List of files for the given run.
"""
raise NotImplementedError

@abstractmethod
def get_count_rate(
self,
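The base class now also declares ``get_files_from_run_id``, which concrete loaders must implement to map a run identifier to raw files. A minimal sketch of one possible implementation, assuming a naming scheme in which the run number is embedded in the file name and an assumed ``core/paths/data_raw_dir`` config entry (neither is specified by this diff):

```python
import glob
import os
from typing import List, Sequence, Union

def get_files_from_run_id(
    self,
    run_id: str,
    folders: Union[str, Sequence[str]] = None,
    extension: str = "h5",
    **kwds,
) -> List[str]:
    """Illustrative resolver mapping one run id to its raw files."""
    # Fall back to an assumed configured raw-data directory if no folder is given.
    if folders is None:
        folders = self._config.get("core", {}).get("paths", {}).get("data_raw_dir", ".")
    if isinstance(folders, str):
        folders = [folders]
    files: List[str] = []
    for folder in folders:
        # Assumed naming scheme: the run number appears in the file name.
        pattern = os.path.join(folder, f"*run{run_id}*.{extension}")
        files.extend(sorted(glob.glob(pattern)))
    if not files:
        raise FileNotFoundError(f"No files found for run {run_id}")
    return files
```

Because ``read_dataframe`` gives ``runs`` precedence over ``files`` and ``folders``, a loader providing this hook can be driven purely by run numbers.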
Empty file added sed/loader/flash/__init__.py