Consolidate loaders #305

Open · wants to merge 5 commits into base: main

17 changes: 17 additions & 0 deletions linkml_runtime/loaders/__init__.py
@@ -4,10 +4,27 @@
from linkml_runtime.loaders.tsv_loader import TSVLoader
from linkml_runtime.loaders.yaml_loader import YAMLLoader
from linkml_runtime.loaders.csv_loader import CSVLoader
from linkml_runtime.loaders.passthrough_loader import PassthroughLoader

json_loader = JSONLoader()
rdf_loader = RDFLoader()
rdflib_loader = RDFLibLoader()
yaml_loader = YAMLLoader()
csv_loader = CSVLoader()
tsv_loader = TSVLoader()

__all__ = [
"JSONLoader",
"RDFLoader",
"RDFLibLoader",
"TSVLoader",
"YAMLLoader",
"CSVLoader",
"PassthroughLoader",
"json_loader",
"rdf_loader",
"rdflib_loader",
"yaml_loader",
"csv_loader",
"tsv_loader"
]
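
Since this diff adds module-level singleton loaders alongside the classes, a quick sketch of how they might be used (the data file here is hypothetical):

```python
from linkml_runtime.loaders import JSONLoader, json_loader

# The module-level singleton is convenient for one-off loads.
data = json_loader.load_as_dict("data.json")  # hypothetical file

# Instantiating the class directly works just as well.
data = JSONLoader().load_as_dict("data.json")
```
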
41 changes: 39 additions & 2 deletions linkml_runtime/loaders/delimited_file_loader.py
@@ -1,7 +1,9 @@
import csv
import re
from abc import ABC, abstractmethod
from json_flattener import unflatten_from_csv, KeyConfig, GlobalConfig, Serializer
import json
from typing import Type, Union, List
from typing import Iterator, Optional, Type, Union, List, TextIO
from linkml_runtime.utils.yamlutils import YAMLRoot
from pydantic import BaseModel

@@ -14,6 +16,15 @@

class DelimitedFileLoader(Loader, ABC):

def __init__(self,
source: Union[str, dict, TextIO] = None,
skip_empty_rows: bool = False,
index_slot_name: Optional[str] = None):
super().__init__(source)
self.skip_empty_rows = skip_empty_rows
self.index_slot_name = index_slot_name


@property
@abstractmethod
def delimiter(self):
@@ -60,4 +71,30 @@ def _get_json_str_to_load(self,
configmap = get_configmap(schemaview, index_slot)
config = GlobalConfig(key_configs=configmap, csv_delimiter=self.delimiter)
objs = unflatten_from_csv(input, config=config, **kwargs)
return json.dumps({index_slot: objs})

def _rows(self) -> Iterator[dict]:
with open(self.source) as file:
reader: csv.DictReader = csv.DictReader(file, delimiter=self.delimiter, skipinitialspace=True)
for row in reader:
if self.skip_empty_rows and not any(row.values()):
continue
yield {k: _parse_numeric(v) for k, v in row.items() if k is not None and v != ""}

def iter_instances(self) -> Iterator[dict]:
if self.index_slot_name is not None:
yield {self.index_slot_name: list(self._rows())}
else:
yield from self._rows()

def _parse_numeric(value: str):
if not isinstance(value, str) or not re.search(r"[0-9]", value):
return value
try:
return int(value)
except (TypeError, ValueError):
pass
try:
return float(value)
except (TypeError, ValueError, OverflowError):
return value
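
Taken together, a minimal sketch of the new constructor arguments and `iter_instances` in action, assuming `CSVLoader` inherits this constructor (the file and its contents are hypothetical):

```python
from linkml_runtime.loaders import CSVLoader

# people.csv (hypothetical):
#   name,age
#   alice,33
#   bob,
loader = CSVLoader(source="people.csv", skip_empty_rows=True, index_slot_name="persons")

# With index_slot_name set, all rows are wrapped in one dict keyed by that slot;
# empty cells are dropped and numeric-looking strings are coerced by _parse_numeric.
for instance in loader.iter_instances():
    print(instance)  # {'persons': [{'name': 'alice', 'age': 33}, {'name': 'bob'}]}
```
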
17 changes: 16 additions & 1 deletion linkml_runtime/loaders/json_loader.py
@@ -1,6 +1,6 @@
import json
import logging
from typing import Union, TextIO, Optional, Dict, Type, List
from typing import Any, Iterator, Union, TextIO, Optional, Dict, Type, List

from hbreader import FileInfo

@@ -34,3 +34,18 @@ def load_any(self,
logging.warning(f"Warning: input type mismatch. Expected: {target_class.__name__}, Actual: {typ}")

return self._construct_target_class(data_as_dict, target_class)

def iter_instances(self) -> Iterator[Any]:
"""Lazily yield instance from JSON source.

If the root of the JSON is an array, yield each element of the array. Otherwise,
yield the root element itself.

:return: Iterator over data instances
:rtype: Iterator[Any]
"""
data = self.load_as_dict(self.source)
if isinstance(data, list):
yield from data
else:
yield data
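
A short sketch of the two cases the docstring describes, assuming `JSONLoader` picks up the new `Loader.__init__(source=...)` (the file is hypothetical):

```python
from linkml_runtime.loaders import JSONLoader

loader = JSONLoader(source="data.json")

# If data.json holds [{"id": "X1"}, {"id": "X2"}], each element is yielded;
# if it holds a single object, that object is yielded once.
for instance in loader.iter_instances():
    print(instance)
```
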
18 changes: 16 additions & 2 deletions linkml_runtime/loaders/loader_root.py
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from typing import TextIO, Union, Optional, Callable, Dict, Type, Any, List
from typing import Iterator, TextIO, Union, Optional, Callable, Dict, Type, Any, List
from logging import getLogger

from pydantic import BaseModel
@@ -15,6 +15,10 @@

class Loader(ABC):

def __init__(self, source: Union[str, dict, TextIO] = None):
self.source = source
super().__init__()

@staticmethod
def json_clean(inp: Any) -> Any:
"""
@@ -119,7 +123,17 @@ def loads(self, source: str, target_class: Type[Union[BaseModel, YAMLRoot]], *,
"""
return self.load(source, target_class, metadata=metadata)

def _construct_target_class(self,
@abstractmethod
def iter_instances(self) -> Iterator[Any]:
"""Lazily load data instances from the source

Member:

[minor: can be iterated on in new PR]

Can we clarify what a data instance would be?

It seems that this is canonically a dict, never an instance of a class (whether dataclass or pydantic)? Or would it be (e.g. pkl serialization)? Would rdflib_loader eventually implement this with a Triple/Quad object, or a 3-or-4-tuple?

I'm tending towards a more predictable signature (iterates over dicts) with some guarantees.

Contributor Author:

Totally. The type annotation here would be refined in the child loaders; keeping it `Any` here just says "there will be some iterator". It could actually be plain `Iterator`, and then we would do `Iterator[dict[str, JsonObj | dict | list]]` or whatever in the child classes. We could make this type a union of all the child types, but it wouldn't really give us much, because the child implementations should override it.


:return: Iterator over data instances
:rtype: Iterator[Any]
"""
pass


def _construct_target_class(self,
data_as_dict: Union[dict, List[dict]],
target_class: Union[Type[YAMLRoot], Type[BaseModel]]) -> Optional[Union[BaseModel, YAMLRoot, List[BaseModel], List[YAMLRoot]]]:
if data_as_dict:
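As the review thread above suggests, child loaders are expected to narrow the abstract `Iterator[Any]` signature. A minimal sketch of what that could look like (the subclass is hypothetical, not part of this PR):

```python
from typing import Iterator

from linkml_runtime.loaders.loader_root import Loader


class DictLoader(Loader):
    """Hypothetical child loader that narrows iter_instances to dicts."""

    def iter_instances(self) -> Iterator[dict]:  # narrowed from Iterator[Any]
        yield {"id": "X1"}
        yield {"id": "X2"}

    def load_any(self, *args, **kwargs):
        raise NotImplementedError  # remaining Loader members omitted for brevity
```
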
24 changes: 24 additions & 0 deletions linkml_runtime/loaders/passthrough_loader.py
@@ -0,0 +1,24 @@
from typing import Any, Iterator

from linkml_runtime.loaders.loader_root import Loader


class PassthroughLoader(Loader):
"""A loader that passes through from an existing Iterator

:param source: An Iterator
"""

def __init__(self, source: Iterator) -> None:
super().__init__(source)

def iter_instances(self) -> Iterator[Any]:
"""Pass through instances from an Iterator

:return: Iterator over data instances
:rtype: Iterator[Any]
"""
yield from self.source

def load_any(self, *args, **kwargs):
raise NotImplementedError("Passthrough loader doesn't actually load anything")
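
A quick usage sketch: wrapping an in-memory generator so it can be handed to anything that expects a loader:

```python
from linkml_runtime.loaders import PassthroughLoader

records = ({"id": i} for i in range(3))
loader = PassthroughLoader(records)

for instance in loader.iter_instances():
    print(instance)  # {'id': 0}, {'id': 1}, {'id': 2}
```
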
5 changes: 4 additions & 1 deletion linkml_runtime/loaders/rdf_loader.py
@@ -1,4 +1,4 @@
from typing import Union, TextIO, Optional, Type, List
from typing import Any, Union, TextIO, Optional, Type, List, Iterator

from hbreader import FileInfo

@@ -90,3 +90,6 @@ def loader(data: Union[str, dict], _: FileInfo) -> Optional[dict]:
# TODO: Make the SSL option a settable parameter in the package itself
with no_ssl_verification():
return self.load_source(source, loader, target_class, accept_header=RDF_MIME_TYPES, metadata=metadata)

def iter_instances(self) -> Iterator[Any]:
raise NotImplementedError("RDF Loader doesn't have instance iterator yet!")
5 changes: 3 additions & 2 deletions linkml_runtime/loaders/rdflib_loader.py
@@ -2,7 +2,7 @@
import urllib
from copy import copy
from dataclasses import dataclass
from typing import Optional, Any, Dict, Type, Union, TextIO, List, Tuple, Set
from typing import Optional, Any, Dict, Type, Union, TextIO, List, Tuple, Set, Iterator

from curies import Converter
from hbreader import FileInfo
@@ -276,4 +276,5 @@ def loads(self, source: str, **kwargs) -> Union[BaseModel, YAMLRoot]:
def load_any(self, source: str, **kwargs) -> Union[BaseModel, YAMLRoot, List[BaseModel], List[YAMLRoot]]:
return self.load(source, **kwargs)


def iter_instances(self) -> Iterator[Any]:
raise NotImplementedError("RDF Loader doesn't have instance iterator yet!")
18 changes: 17 additions & 1 deletion linkml_runtime/loaders/yaml_loader.py
@@ -1,6 +1,6 @@
import os
from io import StringIO
from typing import Union, TextIO, Optional, Dict, Type, List
from typing import Union, TextIO, Optional, Dict, Type, List, Iterator, Any

import yaml
from hbreader import FileInfo
@@ -51,3 +51,19 @@ def loads_any(self, source: str, target_class: Type[Union[BaseModel, YAMLRoot]],
@return: instance of target_class
"""
return self.load_any(source, target_class, metadata=metadata)

def iter_instances(self) -> Iterator[Any]:
"""Lazily yield instances from YAML source.

If the root of the document is an array, yield each element of the array. Otherwise,
yield the root element itself. Repeat for each document in the YAML file.

:return: Iterator over data instances
:rtype: Iterator[Any]
"""
with open(self.source) as source_file:
for document in yaml.safe_load_all(source_file):
if isinstance(document, list):
yield from document
else:
yield document
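
A sketch of the multi-document behavior described in the docstring, assuming `YAMLLoader` inherits the new `Loader.__init__(source=...)` (the file and its contents are hypothetical):

```python
from linkml_runtime.loaders import YAMLLoader

# data.yaml (hypothetical), two documents:
#   - {id: X1}
#   - {id: X2}
#   ---
#   id: X3
loader = YAMLLoader(source="data.yaml")

# The first document's list is unrolled; the second is yielded whole.
for instance in loader.iter_instances():
    print(instance["id"])  # X1, X2, X3
```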