Consolidate loaders #305

Open · wants to merge 5 commits into base: main

17 changes: 17 additions & 0 deletions linkml_runtime/loaders/__init__.py
@@ -4,10 +4,27 @@
from linkml_runtime.loaders.tsv_loader import TSVLoader
from linkml_runtime.loaders.yaml_loader import YAMLLoader
from linkml_runtime.loaders.csv_loader import CSVLoader
from linkml_runtime.loaders.passthrough_loader import PassthroughLoader

json_loader = JSONLoader()
rdf_loader = RDFLoader()
rdflib_loader = RDFLibLoader()
yaml_loader = YAMLLoader()
csv_loader = CSVLoader()
tsv_loader = TSVLoader()

__all__ = [
"JSONLoader",
"RDFLoader",
"RDFLibLoader",
"TSVLoader",
"YAMLLoader",
"CSVLoader",
"PassthroughLoader",
"json_loader",
"rdf_loader",
"rdflib_loader",
"yaml_loader",
"csv_loader",
"tsv_loader"
]
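
Since this diff adds module-level singleton loaders alongside the classes, a quick sketch of how they might be used (the data file here is hypothetical):

```python
from linkml_runtime.loaders import JSONLoader, json_loader

# The module-level singleton is convenient for one-off loads.
data = json_loader.load_as_dict("data.json")  # hypothetical file

# Instantiating the class directly works just as well.
data = JSONLoader().load_as_dict("data.json")
```
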
41 changes: 39 additions & 2 deletions linkml_runtime/loaders/delimited_file_loader.py
@@ -1,7 +1,9 @@
import csv
import re
from abc import ABC, abstractmethod
from json_flattener import unflatten_from_csv, KeyConfig, GlobalConfig, Serializer
import json
from typing import Type, Union, List
from typing import Iterator, Optional, Type, Union, List, TextIO
from linkml_runtime.utils.yamlutils import YAMLRoot
from pydantic import BaseModel

@@ -14,6 +16,15 @@

class DelimitedFileLoader(Loader, ABC):

def __init__(self,
source: Union[str, dict, TextIO] = None,
skip_empty_rows: bool = False,
index_slot_name: Optional[str] = None):
super().__init__(source)
self.skip_empty_rows = skip_empty_rows
self.index_slot_name = index_slot_name


@property
@abstractmethod
def delimiter(self):
@@ -60,4 +71,30 @@ def _get_json_str_to_load(self,
configmap = get_configmap(schemaview, index_slot)
config = GlobalConfig(key_configs=configmap, csv_delimiter=self.delimiter)
objs = unflatten_from_csv(input, config=config, **kwargs)
return json.dumps({index_slot: objs})

def _rows(self) -> Iterator[dict]:
with open(self.source) as file:
reader: csv.DictReader = csv.DictReader(file, delimiter=self.delimiter, skipinitialspace=True)
for row in reader:
if self.skip_empty_rows and not any(row.values()):
continue
yield {k: _parse_numeric(v) for k, v in row.items() if k is not None and v != ""}

def iter_instances(self) -> Iterator[dict]:
if self.index_slot_name is not None:
yield {self.index_slot_name: list(self._rows())}
else:
yield from self._rows()

def _parse_numeric(value: str):
if not isinstance(value, str) or not re.search(r"[0-9]", value):
return value
try:
return int(value)
except (TypeError, ValueError):
pass
try:
return float(value)
except (TypeError, ValueError, OverflowError):
return value
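
Taken together, a minimal sketch of the new constructor arguments and `iter_instances` in action, assuming `CSVLoader` inherits this constructor (the file and its contents are hypothetical):

```python
from linkml_runtime.loaders import CSVLoader

# people.csv (hypothetical):
#   name,age
#   alice,33
#   bob,
loader = CSVLoader(source="people.csv", skip_empty_rows=True, index_slot_name="persons")

# With index_slot_name set, all rows are wrapped in one dict keyed by that slot;
# empty cells are dropped and numeric-looking strings are coerced by _parse_numeric.
for instance in loader.iter_instances():
    print(instance)  # {'persons': [{'name': 'alice', 'age': 33}, {'name': 'bob'}]}
```
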
17 changes: 16 additions & 1 deletion linkml_runtime/loaders/json_loader.py
@@ -1,6 +1,6 @@
import json
import logging
from typing import Union, TextIO, Optional, Dict, Type, List
from typing import Any, Iterator, Union, TextIO, Optional, Dict, Type, List

from hbreader import FileInfo

@@ -34,3 +34,18 @@ def load_any(self,
logging.warning(f"Warning: input type mismatch. Expected: {target_class.__name__}, Actual: {typ}")

return self._construct_target_class(data_as_dict, target_class)

def iter_instances(self) -> Iterator[Any]:
"""Lazily yield instance from JSON source.

If the root of the JSON is an array, yield each element of the array. Otherwise,
yield the root element itself.

:return: Iterator over data instances
:rtype: Iterator[Any]
"""
data = self.load_as_dict(self.source)
if isinstance(data, list):
yield from data
else:
yield data
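
A short sketch of the two cases the docstring describes, assuming `JSONLoader` picks up the new `Loader.__init__(source=...)` (the file is hypothetical):

```python
from linkml_runtime.loaders import JSONLoader

loader = JSONLoader(source="data.json")

# If data.json holds [{"id": "X1"}, {"id": "X2"}], each element is yielded;
# if it holds a single object, that object is yielded once.
for instance in loader.iter_instances():
    print(instance)
```
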
18 changes: 16 additions & 2 deletions linkml_runtime/loaders/loader_root.py
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from typing import TextIO, Union, Optional, Callable, Dict, Type, Any, List
from typing import Iterator, TextIO, Union, Optional, Callable, Dict, Type, Any, List
from logging import getLogger

from pydantic import BaseModel
@@ -15,6 +15,10 @@

class Loader(ABC):

def __init__(self, source: Union[str, dict, TextIO] = None):
self.source = source
super().__init__()

@staticmethod
def json_clean(inp: Any) -> Any:
"""
@@ -119,7 +123,17 @@ def loads(self, source: str, target_class: Type[Union[BaseModel, YAMLRoot]], *,
"""
return self.load(source, target_class, metadata=metadata)

def _construct_target_class(self,
@abstractmethod
def iter_instances(self) -> Iterator[Any]:
"""Lazily load data instances from the source

Member:

[minor: can be iterated on in new PR]

Can we clarify what a data instance would be?

It seems that this is canonically a dict, never an instance of a class (whether dataclass or pydantic)? Or would it be (e.g. pkl serialization)? Would rdflib_loader eventually implement this with a Triple/Quad object, or a 3-or-4-tuple?

I'm tending towards a more predictable signature (iterates over dicts) with some guarantees.

Contributor Author:

Totally. The type annotation here would be refined in the child loaders; keeping it `Any` here just says "there will be some iterator". It could actually be plain `Iterator`, and then we would do `Iterator[dict[str, JsonObj | dict | list]]` or whatever in the child classes. We could make this type a union of all the child types, but it wouldn't really give us much, because the child implementations should override it.


:return: Iterator over data instances
:rtype: Iterator[Any]
"""
pass


def _construct_target_class(self,
data_as_dict: Union[dict, List[dict]],
target_class: Union[Type[YAMLRoot], Type[BaseModel]]) -> Optional[Union[BaseModel, YAMLRoot, List[BaseModel], List[YAMLRoot]]]:
if data_as_dict:
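As the review thread above suggests, child loaders are expected to narrow the abstract `Iterator[Any]` signature. A minimal sketch of what that could look like (the subclass is hypothetical, not part of this PR):

```python
from typing import Iterator

from linkml_runtime.loaders.loader_root import Loader


class DictLoader(Loader):
    """Hypothetical child loader that narrows iter_instances to dicts."""

    def iter_instances(self) -> Iterator[dict]:  # narrowed from Iterator[Any]
        yield {"id": "X1"}
        yield {"id": "X2"}

    def load_any(self, *args, **kwargs):
        raise NotImplementedError  # remaining Loader members omitted for brevity
```
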
24 changes: 24 additions & 0 deletions linkml_runtime/loaders/passthrough_loader.py
@@ -0,0 +1,24 @@
from typing import Any, Iterator

from linkml_runtime.loaders.loader_root import Loader


class PassthroughLoader(Loader):
"""A loader that passes through from an existing Iterator

:param source: An Iterator
"""

def __init__(self, source: Iterator) -> None:
super().__init__(source)

def iter_instances(self) -> Iterator[Any]:
"""Pass through instances from an Iterator

:return: Iterator over data instances
:rtype: Iterator[Any]
"""
yield from self.source

def load_any(self, *args, **kwargs):
raise NotImplementedError("Passthrough loader doesn't actually load anything")
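
A quick usage sketch: wrapping an in-memory generator so it can be handed to anything that expects a loader:

```python
from linkml_runtime.loaders import PassthroughLoader

records = ({"id": i} for i in range(3))
loader = PassthroughLoader(records)

for instance in loader.iter_instances():
    print(instance)  # {'id': 0}, {'id': 1}, {'id': 2}
```
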
5 changes: 4 additions & 1 deletion linkml_runtime/loaders/rdf_loader.py
@@ -1,4 +1,4 @@
from typing import Union, TextIO, Optional, Type, List
from typing import Any, Union, TextIO, Optional, Type, List, Iterator

from hbreader import FileInfo

@@ -90,3 +90,6 @@ def loader(data: Union[str, dict], _: FileInfo) -> Optional[dict]:
# TODO: Make the SSL option a settable parameter in the package itself
with no_ssl_verification():
return self.load_source(source, loader, target_class, accept_header=RDF_MIME_TYPES, metadata=metadata)

def iter_instances(self) -> Iterator[Any]:
raise NotImplementedError("RDF Loader doesn't have instance iterator yet!")
5 changes: 3 additions & 2 deletions linkml_runtime/loaders/rdflib_loader.py
@@ -2,7 +2,7 @@
import urllib
from copy import copy
from dataclasses import dataclass
from typing import Optional, Any, Dict, Type, Union, TextIO, List, Tuple, Set
from typing import Optional, Any, Dict, Type, Union, TextIO, List, Tuple, Set, Iterator

from curies import Converter
from hbreader import FileInfo
@@ -276,4 +276,5 @@ def loads(self, source: str, **kwargs) -> Union[BaseModel, YAMLRoot]:
def load_any(self, source: str, **kwargs) -> Union[BaseModel, YAMLRoot, List[BaseModel], List[YAMLRoot]]:
return self.load(source, **kwargs)


def iter_instances(self) -> Iterator[Any]:
raise NotImplementedError("RDF Loader doesn't have instance iterator yet!")
18 changes: 17 additions & 1 deletion linkml_runtime/loaders/yaml_loader.py
@@ -1,6 +1,6 @@
import os
from io import StringIO
from typing import Union, TextIO, Optional, Dict, Type, List
from typing import Union, TextIO, Optional, Dict, Type, List, Iterator, Any

import yaml
from hbreader import FileInfo
@@ -51,3 +51,19 @@ def loads_any(self, source: str, target_class: Type[Union[BaseModel, YAMLRoot]],
@return: instance of target_class
"""
return self.load_any(source, target_class, metadata=metadata)

def iter_instances(self) -> Iterator[Any]:
"""Lazily yield instances from YAML source.

If the root of the document is an array, yield each element of the array. Otherwise,
yield the root element itself. Repeat for each document in the YAML file.

:return: Iterator over data instances
:rtype: Iterator[Any]
"""
with open(self.source) as source_file:
for document in yaml.safe_load_all(source_file):
if isinstance(document, list):
yield from document
else:
yield document
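
A sketch of the multi-document behavior described in the docstring, assuming `YAMLLoader` inherits the new `Loader.__init__(source=...)` (the file and its contents are hypothetical):

```python
from linkml_runtime.loaders import YAMLLoader

# data.yaml (hypothetical), two documents:
#   - {id: X1}
#   - {id: X2}
#   ---
#   id: X3
loader = YAMLLoader(source="data.yaml")

# The first document's list is unrolled; the second is yielded whole.
for instance in loader.iter_instances():
    print(instance["id"])  # X1, X2, X3
```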