Flash loader updated #118

Merged on Jun 16, 2023
Commits (38)
5a32f5b
flash module
zain-sohail Apr 26, 2023
4d31a0f
add test data
zain-sohail Apr 26, 2023
441ad14
fix pylint exception warning
zain-sohail Apr 26, 2023
4e0eeea
fix pylint exception warning
zain-sohail Apr 26, 2023
a5d86ed
some more error fixes
zain-sohail Apr 26, 2023
6ac4fa8
loader.py
zain-sohail May 1, 2023
be6a3cc
Create __init__.py
zain-sohail May 1, 2023
408308c
conflict resolved
zain-sohail May 23, 2023
8f14198
fix linting errors
zain-sohail May 4, 2023
59004ea
further linting errors
zain-sohail May 4, 2023
285d878
fix linting issues
rettigl May 5, 2023
67d1147
add config for loader tests
rettigl May 5, 2023
7c8ce1f
fix paths to work with github actions
rettigl May 5, 2023
4532380
- fixed the recursive parse file error
zain-sohail May 29, 2023
2e805c2
- add missing utils file
zain-sohail May 29, 2023
72728a1
parse_h5_keys moved to loader utils
zain-sohail Jun 1, 2023
40696b7
test fix
zain-sohail Jun 1, 2023
35a60f2
Metadata class and necessary user details now from config file
zain-sohail Jun 1, 2023
c9a8d49
docstrings and seperating paths in json
zain-sohail Jun 3, 2023
172f7a2
add runs as an identifier
zain-sohail Jun 5, 2023
e3ec795
remove casts
zain-sohail Jun 5, 2023
cdb686f
updated docstrings
zain-sohail Jun 5, 2023
97b8358
fix small bug with typehints
zain-sohail Jun 7, 2023
b1d53b2
simplify getting channels by format
zain-sohail Jun 7, 2023
7e221ff
metadata small fix
zain-sohail Jun 7, 2023
67de4dc
modified base loader and processor to support runs as loader option, …
rettigl Jun 9, 2023
5d694b4
working flash loader
rettigl Jun 9, 2023
a4c4091
linting and bugfixes
rettigl Jun 9, 2023
7ec4390
add dummy function to mpes and generic loaders
rettigl Jun 10, 2023
f9eb60e
add option for multiple folders to general loader infrastructure
rettigl Jun 12, 2023
75b7d81
Bugfix to allow again single run w/o list, and prevent provided folde…
rettigl Jun 13, 2023
7d5301d
add parametrized tests for loaders covering files, folders and runs
rettigl Jun 13, 2023
c1b1af1
add stripped down test data for FLASH reader
rettigl Jun 15, 2023
fcf2c92
add option for runs as int on runtime, and debug flash tests in guthu…
rettigl Jun 15, 2023
edef597
debug github actions
rettigl Jun 15, 2023
be19cbc
fix bug for single provided file, and remove debug info
rettigl Jun 15, 2023
43a8782
Merge pull request #127 from OpenCOMPES/runs_loader_option
rettigl Jun 15, 2023
e7d1cd2
Merge branch 'main' into flash-new
zain-sohail Jun 16, 2023
3 changes: 2 additions & 1 deletion .github/workflows/pylint.yml
@@ -20,7 +20,8 @@ jobs:
run: |
git lfs pull
python -m pip install --upgrade pip
python -m pip install pycodestyle pylint mypy types-PyYAML pytest coverage coveralls
pip install pycodestyle pylint mypy pytest types-PyYAML types-requests coverage coveralls
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Install package
run: |
python -m pip install .
226 changes: 113 additions & 113 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions pyproject.toml
@@ -28,13 +28,15 @@ threadpoolctl = "^3.1.0"
tifffile = ">=2022.2.9, <2023.0.0"
tqdm = "^4.62.3"
xarray = "^0.20.2"
joblib = "^1.2.0"
jupyter = {version = "^1.0.0", extras = ["notebook"], optional = true}
ipykernel = {version = "^6.9.1", extras = ["notebook"], optional = true}
sphinx = {version = ">4.4.0", extras = ["docs"], optional = true}
sphinx-rtd-theme = {version = ">1.0.0", extras = ["docs"], optional = true}
tomlkit = {version = ">0.10.0", extras = ["docs"], optional = true}
sphinx-autodoc-typehints = {version = ">1.17.0", extras = ["docs"], optional = true}


[tool.poetry.extras]
notebook = ["jupyter", "ipykernel"]
docs = ["Sphinx", "sphinx-rtd-theme", "tomlkit", "sphinx-autodoc-typehints"]
51 changes: 41 additions & 10 deletions sed/core/processor.py
@@ -60,11 +60,12 @@ def __init__(
dataframe: Union[pd.DataFrame, ddf.DataFrame] = None,
files: List[str] = None,
folder: str = None,
runs: Sequence[str] = None,
collect_metadata: bool = False,
**kwds,
):
"""Processor class of sed. Contains wrapper functions defining a work flow
for data correction, calibration and binning.
for data correction, calibration, and binning.

Args:
metadata (dict, optional): Dict of external Metadata. Defaults to None.
@@ -76,6 +77,8 @@
the config. Defaults to None.
folder (str, optional): Folder containing files to pass to the loader
defined in the config. Defaults to None.
runs (Sequence[str], optional): List of run identifiers to pass to the loader
defined in the config. Defaults to None.
collect_metadata (bool): Option to collect metadata from files.
Defaults to False.
**kwds: Keyword arguments passed to the reader.
@@ -131,12 +134,18 @@ def __init__(
self.use_copy_tool = False

# Load data if provided:
if dataframe is not None or files is not None or folder is not None:
if (
dataframe is not None
or files is not None
or folder is not None
or runs is not None
):
self.load(
dataframe=dataframe,
metadata=metadata,
files=files,
folder=folder,
runs=runs,
collect_metadata=collect_metadata,
**kwds,
)
@@ -257,6 +266,7 @@ def load(
metadata: dict = None,
files: List[str] = None,
folder: str = None,
runs: Sequence[str] = None,
collect_metadata: bool = False,
**kwds,
):
@@ -269,10 +279,10 @@
metadata (dict, optional): Dict of external Metadata. Defaults to None.
files (List[str], optional): List of file paths to pass to the loader.
Defaults to None.
runs (Sequence[str], optional): List of run identifiers to pass to the
loader. Defaults to None.
folder (str, optional): Folder path to pass to the loader.
Defaults to None.
collect_metadata (bool): Option to collect metadata from files.
Defaults to False.

Raises:
ValueError: Raised if no valid input is provided.
@@ -281,29 +291,50 @@
metadata = {}
if dataframe is not None:
self._dataframe = dataframe
elif runs is not None:
# If runs are provided, the copy tool is used only if a folder is also provided.
# In that case, the whole provided base folder tree is copied, and the copied
# version is passed to the loader as the base folder in which to look for the runs.
if folder is not None:
dataframe, metadata = self.loader.read_dataframe(
folders=cast(str, self.cpy(folder)),
runs=runs,
metadata=metadata,
collect_metadata=collect_metadata,
**kwds,
)
else:
dataframe, metadata = self.loader.read_dataframe(
runs=runs,
metadata=metadata,
collect_metadata=collect_metadata,
**kwds,
)

elif folder is not None:
dataframe, metadata = self.loader.read_dataframe(
folder=cast(str, self.cpy(folder)),
folders=cast(str, self.cpy(folder)),
metadata=metadata,
collect_metadata=collect_metadata,
**kwds,
)
self._dataframe = dataframe
self._files = self.loader.files

elif files is not None:
dataframe, metadata = self.loader.read_dataframe(
files=cast(List[str], self.cpy(files)),
metadata=metadata,
collect_metadata=collect_metadata,
**kwds,
)
self._dataframe = dataframe
self._files = self.loader.files

else:
raise ValueError(
"Either 'dataframe', 'files' or 'folder' needs to be privided!",
"Either 'dataframe', 'files', 'folder', or 'runs' needs to be provided!",
)

self._dataframe = dataframe
self._files = self.loader.files

for key in metadata:
self._attributes.add(
entry=metadata[key],
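The hunks above wire the new ``runs`` argument through the processor's constructor and ``load`` method. A minimal usage sketch of the resulting API, assuming the package's ``SedProcessor`` entry point and using placeholder values for the config path, run numbers, and folder (none of these values appear in the diff):

```python
# Hypothetical usage of the new runs-based loading path.
from sed import SedProcessor

sp = SedProcessor(
    config="flash_config.yaml",  # placeholder config selecting the loader
    runs=["44762", "44763"],     # run identifiers resolved to files by the loader
    folder="/path/to/raw",       # optional base folder; also enables the copy tool
    collect_metadata=False,
)
```

When both ``runs`` and ``folder`` are given, the folder tree is first copied via the copy tool, and the copied location is handed to the loader as the base folder in which to look for the runs.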
91 changes: 68 additions & 23 deletions sed/loader/base/loader.py
@@ -8,6 +8,7 @@
from typing import List
from typing import Sequence
from typing import Tuple
from typing import Union

import dask.dataframe as ddf
import numpy as np
@@ -40,34 +41,40 @@ def __init__(
self._config = config if config is not None else {}

self.files: List[str] = []
self.runs: List[str] = []
self.metadata: Dict[Any, Any] = {}

@abstractmethod
def read_dataframe(
self,
files: Sequence[str] = None,
folder: str = None,
files: Union[str, Sequence[str]] = None,
folders: Union[str, Sequence[str]] = None,
runs: Union[str, Sequence[str]] = None,
ftype: str = None,
metadata: dict = None,
collect_metadata: bool = False,
**kwds,
) -> Tuple[ddf.DataFrame, dict]:
"""Reads data from given files or folder and returns a dask dataframe
"""Reads data from given files, folder, or runs and returns a dask dataframe
and corresponding metadata.

Args:
files (Sequence[str], optional): List of file paths. Defaults to None.
folder (str, optional): Path to folder where files are stored. Path has
the priority such that if it's specified, the specified files will
be ignored. Defaults to None.
files (Union[str, Sequence[str]], optional): File path(s) to process.
Defaults to None.
folders (Union[str, Sequence[str]], optional): Path to folder(s) where files
are stored. Path has priority such that if it's specified, the specified
files will be ignored. Defaults to None.
runs (Union[str, Sequence[str]], optional): Run identifier(s). The corresponding
files will be located in the folder(s) provided by ``folders``. Takes
precedence over ``files`` and ``folders``. Defaults to None.
ftype (str, optional): File type to read ('parquet', 'json', 'csv', etc).
If a folder path is given, all files with the specified extension are
read into the dataframe in the reading order. Defaults to None.
metadata (dict, optional): Manual meta data dictionary. Auto-generated
meta data are added to it. Defaults to None.
metadata (dict, optional): Manual metadata dictionary. Auto-generated
metadata will be added to it. Defaults to None.
collect_metadata (bool): Option to collect metadata from files. Requires
a valid config dict. Defaults to False.
**kwds: keyword arguments. Se describtion in respective loader.
**kwds: keyword arguments. See description in respective loader.

Returns:
Tuple[ddf.DataFrame, dict]: Dask dataframe and metadata read from
@@ -77,31 +84,69 @@ def read_dataframe(
if metadata is None:
metadata = {}

if folder is not None:
folder = os.path.realpath(folder)
files = gather_files(
folder=folder,
extension=ftype,
file_sorting=True,
**kwds,
)
if runs is not None:
if isinstance(runs, (str, int)):
runs = [runs]
self.runs = list(runs)
files = []
for run in runs:
files.extend(self.get_files_from_run_id(run, folders, **kwds))

elif folders is not None:
if isinstance(folders, str):
folders = [folders]
files = []
for folder in folders:
folder = os.path.realpath(folder)
files.extend(
gather_files(
folder=folder,
extension=ftype,
file_sorting=True,
**kwds,
),
)

elif files is None:
raise ValueError(
"Either the folder or file path should be provided!",
"Either folder, file paths, or runs should be provided!",
)
else:
files = [os.path.realpath(file) for file in files]

self.files = files
if files is not None:
if isinstance(files, str):
files = [files]
files = [os.path.realpath(file) for file in files]
self.files = files

self.metadata = deepcopy(metadata)

if not files:
raise FileNotFoundError("No valid files found!")
raise FileNotFoundError("No valid files or runs found!")

return None, None

@abstractmethod
def get_files_from_run_id(
self,
run_id: str,
folders: Union[str, Sequence[str]] = None,
extension: str = None,
**kwds,
) -> List[str]:
"""Locate the files for a given run identifier.

Args:
run_id (str): The run identifier to locate.
folders (Union[str, Sequence[str]], optional): The directory(ies) where the raw
data is located. Defaults to None.
extension (str, optional): The file extension. Defaults to None.
**kwds: Keyword arguments.

Returns:
List[str]: List of files for the given run.
"""
raise NotImplementedError

@abstractmethod
def get_count_rate(
self,
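The base class now also declares ``get_files_from_run_id``, which concrete loaders must implement to map a run identifier to raw files. A minimal sketch of one possible implementation, assuming a naming scheme in which the run number is embedded in the file name and an assumed ``core/paths/data_raw_dir`` config entry (neither is specified by this diff):

```python
import glob
import os
from typing import List, Sequence, Union

def get_files_from_run_id(
    self,
    run_id: str,
    folders: Union[str, Sequence[str]] = None,
    extension: str = "h5",
    **kwds,
) -> List[str]:
    """Illustrative resolver mapping one run id to its raw files."""
    # Fall back to an assumed configured raw-data directory if no folder is given.
    if folders is None:
        folders = self._config.get("core", {}).get("paths", {}).get("data_raw_dir", ".")
    if isinstance(folders, str):
        folders = [folders]
    files: List[str] = []
    for folder in folders:
        # Assumed naming scheme: the run number appears in the file name.
        pattern = os.path.join(folder, f"*run{run_id}*.{extension}")
        files.extend(sorted(glob.glob(pattern)))
    if not files:
        raise FileNotFoundError(f"No files found for run {run_id}")
    return files
```

Because ``read_dataframe`` gives ``runs`` precedence over ``files`` and ``folders``, a loader providing this hook can be driven purely by run numbers.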
Empty file added sed/loader/flash/__init__.py