diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a3d36fa40..f4d6751fe9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Notes on development dependencies in `setup.py` files to codebase notes. - Test for `cached_download` ### Changed +- `repo`/`Repo` to `record`/`Record` - Definitions with a `spec` can use the `subspec` parameter to declare that they are a list or a dict where the values are of the `spec` type. Rather than the list or dict itself being of the `spec` type. diff --git a/dffml/cli/cli.py b/dffml/cli/cli.py index fbbf81b68e..a48affcdfc 100644 --- a/dffml/cli/cli.py +++ b/dffml/cli/cli.py @@ -7,7 +7,7 @@ import pkg_resources from ..version import VERSION -from ..repo import Repo +from ..record import Record from ..source.source import BaseSource from ..util.packaging import is_develop from ..util.cli.arg import Arg @@ -37,28 +37,30 @@ async def run(self): class Edit(SourcesCMD, KeysCMD): """ - Edit each specified repo + Edit each specified record """ async def run(self): async with self.sources as sources: async with sources() as sctx: for key in self.keys: - repo = await sctx.repo(key) + record = await sctx.record(key) pdb.set_trace() - await sctx.update(repo) + await sctx.update(record) class Merge(CMD): """ - Merge repo data between sources + Merge record data between sources """ arg_dest = Arg( - "dest", help="Sources merge repos into", type=BaseSource.load_labeled + "dest", help="Sources merge records into", type=BaseSource.load_labeled ) arg_src = Arg( - "src", help="Sources to pull repos from", type=BaseSource.load_labeled + "src", + help="Sources to pull records from", + type=BaseSource.load_labeled, ) async def run(self): @@ -66,11 +68,11 @@ async def run(self): self.extra_config ) as src, self.dest.withconfig(self.extra_config) as dest: async with src() as sctx, dest() as dctx: - async for src in sctx.repos(): - repo = Repo(src.key) - repo.merge(src) - repo.merge(await dctx.repo(repo.key)) - await dctx.update(repo) + async for src in sctx.records(): + record = Record(src.key) + record.merge(src) + record.merge(await dctx.record(record.key)) + await dctx.update(record) class ImportExportCMD(PortCMD, SourcesCMD): @@ -80,7 +82,7 @@ class ImportExportCMD(PortCMD, SourcesCMD): class Import(ImportExportCMD): - """Imports repos""" + """Imports records""" async def run(self): async with self.sources as sources: @@ -89,7 +91,7 @@ async def run(self): class Export(ImportExportCMD): - """Exports repos""" + """Exports records""" async def run(self): async with self.sources as sources: diff --git a/dffml/cli/dataflow.py b/dffml/cli/dataflow.py index 5971a1eecf..3683fd813b 100644 --- a/dffml/cli/dataflow.py +++ b/dffml/cli/dataflow.py @@ -97,14 +97,14 @@ class RunCMD(SourcesCMD): arg_sources = SourcesCMD.arg_sources.modify(required=False) arg_caching = Arg( "-caching", - help="Skip running DataFlow if a repo already contains these features", + help="Skip running DataFlow if a record already contains these features", nargs="+", required=False, default=[], ) arg_no_update = Arg( "-no-update", - help="Update repo with sources", + help="Update record with sources", required=False, default=False, action="store_true", @@ -134,16 +134,16 @@ class RunCMD(SourcesCMD): nargs="+", action=ParseInputsAction, default=[], - help="Other inputs to add under each ctx (repo's key will " + help="Other inputs to add under each ctx (record's key will " + "be used as the context)", ) - arg_repo_def = Arg( - 
"-repo-def", + arg_record_def = Arg( + "-record-def", default=False, type=str, - help="Definition to be used for repo.key." - + "If set, repo.key will be added to the set of inputs " - + "under each context (which is also the repo's key)", + help="Definition to be used for record.key." + + "If set, record.key will be added to the set of inputs " + + "under each context (which is also the record's key)", ) def __init__(self, *args, **kwargs): @@ -151,25 +151,25 @@ def __init__(self, *args, **kwargs): self.orchestrator = self.orchestrator.withconfig(self.extra_config) -class RunAllRepos(RunCMD): - """Run dataflow for all repos in sources""" +class RunAllRecords(RunCMD): + """Run dataflow for all records in sources""" - async def repos(self, sctx): + async def records(self, sctx): """ - This method exists so that it can be overriden by RunRepoSet + This method exists so that it can be overriden by RunRecordSet """ - async for repo in sctx.repos(): - yield repo + async for record in sctx.records(): + yield record async def run_dataflow(self, orchestrator, sources, dataflow): # Orchestrate the running of these operations async with orchestrator(dataflow) as octx, sources() as sctx: # Add our inputs to the input network with the context being the - # repo key + # record key inputs = [] - async for repo in self.repos(sctx): - # Skip running DataFlow if repo already has features - existing_features = repo.features() + async for record in self.records(sctx): + # Skip running DataFlow if record already has features + existing_features = record.features() if self.caching and all( map( lambda cached: cached in existing_features, @@ -178,19 +178,19 @@ async def run_dataflow(self, orchestrator, sources, dataflow): ): continue - repo_inputs = [] + record_inputs = [] for value, def_name in self.inputs: - repo_inputs.append( + record_inputs.append( Input( value=value, definition=dataflow.definitions[def_name], ) ) - if self.repo_def: - repo_inputs.append( + if self.record_def: + record_inputs.append( Input( - value=repo.key, - definition=dataflow.definitions[self.repo_def], + value=record.key, + definition=dataflow.definitions[self.record_def], ) ) @@ -199,8 +199,8 @@ async def run_dataflow(self, orchestrator, sources, dataflow): inputs.append( MemoryInputSet( MemoryInputSetConfig( - ctx=StringInputSetContext(repo.key), - inputs=repo_inputs, + ctx=StringInputSetContext(record.key), + inputs=record_inputs, ) ) ) @@ -212,14 +212,14 @@ async def run_dataflow(self, orchestrator, sources, dataflow): *inputs, strict=not self.no_strict ): ctx_str = (await ctx.handle()).as_string() - # TODO(p4) Make a RepoInputSetContext which would let us - # store the repo instead of recalling it by the URL - repo = await sctx.repo(ctx_str) + # TODO(p4) Make a RecordInputSetContext which would let us + # store the record instead of recalling it by the URL + record = await sctx.record(ctx_str) # Store the results - repo.evaluated(results) - yield repo + record.evaluated(results) + yield record if not self.no_update: - await sctx.update(repo) + await sctx.update(record) async def run(self): dataflow_path = pathlib.Path(self.dataflow) @@ -232,35 +232,35 @@ async def run(self): exported = await loader.loadb(dataflow_path.read_bytes()) dataflow = DataFlow._fromdict(**exported) async with self.orchestrator as orchestrator, self.sources as sources: - async for repo in self.run_dataflow( + async for record in self.run_dataflow( orchestrator, sources, dataflow ): - yield repo + yield record -class RunRepoSet(RunAllRepos, KeysCMD): - 
"""Run dataflow for single repo or set of repos""" +class RunRecordSet(RunAllRecords, KeysCMD): + """Run dataflow for single record or set of records""" - async def repos(self, sctx): + async def records(self, sctx): for key in self.keys: - yield await sctx.repo(key) + yield await sctx.record(key) def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.sources = SubsetSources(*self.sources, keys=self.keys) -class RunRepos(CMD): - """Run DataFlow and assign output to a repo""" +class RunRecords(CMD): + """Run DataFlow and assign output to a record""" - _set = RunRepoSet - _all = RunAllRepos + _set = RunRecordSet + _all = RunAllRecords class Run(CMD): """Run dataflow""" - repos = RunRepos + records = RunRecords class Diagram(CMD): diff --git a/dffml/cli/list.py b/dffml/cli/list.py index edbf4ae11f..9708b648b7 100644 --- a/dffml/cli/list.py +++ b/dffml/cli/list.py @@ -6,16 +6,16 @@ from ..util.cli.cmds import SourcesCMD, ListEntrypoint -class ListRepos(SourcesCMD): +class ListRecords(SourcesCMD): """ - List repos stored in sources + List records stored in sources """ async def run(self): async with self.sources as sources: async with sources() as sctx: - async for repo in sctx.repos(): - print(repo) + async for record in sctx.records(): + print(record) class ListServices(ListEntrypoint): @@ -48,10 +48,10 @@ class ListModels(ListEntrypoint): class List(CMD): """ - List repos and installed interfaces + List records and installed interfaces """ - repos = ListRepos + records = ListRecords sources = ListSources models = ListModels services = ListServices diff --git a/dffml/cli/ml.py b/dffml/cli/ml.py index 1c0079368b..947ba6bac1 100644 --- a/dffml/cli/ml.py +++ b/dffml/cli/ml.py @@ -38,21 +38,21 @@ class PredictAll(MLCMD): arg_update = Arg( "-update", - help="Update repo with sources", + help="Update record with sources", required=False, default=False, action="store_true", ) async def run(self): - async for repo in predict( - self.model, self.sources, update=self.update, keep_repo=True + async for record in predict( + self.model, self.sources, update=self.update, keep_record=True ): - yield repo + yield record -class PredictRepo(PredictAll, KeysCMD): - """Predictions for individual repos""" +class PredictRecord(PredictAll, KeysCMD): + """Predictions for individual records""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -60,7 +60,7 @@ def __init__(self, *args, **kwargs): class Predict(CMD): - """Evaluate features against repos and produce a prediction""" + """Evaluate features against records and produce a prediction""" - repo = PredictRepo + record = PredictRecord _all = PredictAll diff --git a/dffml/feature/feature.py b/dffml/feature/feature.py index 43976320ac..f9b4d23efc 100644 --- a/dffml/feature/feature.py +++ b/dffml/feature/feature.py @@ -63,7 +63,7 @@ async def inc(self, key, default=None, by=1): class Data(Task): """ - Passed to each feature during evaluation. Shared between all features a repo + Passed to each feature during evaluation. 
Shared between all features a record is being evaluated with """ diff --git a/dffml/high_level.py b/dffml/high_level.py index 579b358187..33020a20c3 100644 --- a/dffml/high_level.py +++ b/dffml/high_level.py @@ -5,16 +5,16 @@ import pathlib from typing import Union, Dict, Any -from .repo import Repo +from .record import Record from .source.source import Sources, BaseSource from .source.memory import MemorySource, MemorySourceConfig -def _repos_to_sources(*args): +def _records_to_sources(*args): """ - Create a memory source out of any repos passed as a variable length list. + Create a memory source out of any records passed as a variable length list. Add all sources found in the variable length list to a list of sources, and - the created source containing repos, and return that list of sources. + the created source containing records, and return that list of sources. """ # If the first arg is an instance of sources, append the rest to that. if args and isinstance(args[0], Sources): @@ -23,27 +23,27 @@ def _repos_to_sources(*args): sources = Sources( *[arg for arg in args if isinstance(arg, BaseSource)] ) - # Repos to add to memory source - repos = [] + # Records to add to memory source + records = [] # Make args mutable args = list(args) - # Convert dicts to repos + # Convert dicts to records for i, arg in enumerate(args): if isinstance(arg, dict): - arg = Repo(i, data={"features": arg}) - if isinstance(arg, Repo): - repos.append(arg) + arg = Record(i, data={"features": arg}) + if isinstance(arg, Record): + records.append(arg) if isinstance(arg, str) and "." in arg: filepath = pathlib.Path(arg) source = BaseSource.load(filepath.suffix.replace(".", "")) sources.append(source(filename=arg)) - # Create memory source if there are any repos - if repos: - sources.append(MemorySource(MemorySourceConfig(repos=repos))) + # Create memory source if there are any records + if records: + sources.append(MemorySource(MemorySourceConfig(records=records))) return sources -async def train(model, *args: Union[BaseSource, Repo, Dict[str, Any]]): +async def train(model, *args: Union[BaseSource, Record, Dict[str, Any]]): """ Train a machine learning model. @@ -56,18 +56,18 @@ async def train(model, *args: Union[BaseSource, Repo, Dict[str, Any]]): Machine Learning model to use. See :doc:`/plugins/dffml_model` for models options. *args : list - Input data for training. Could be a ``dict``, :py:class:`Repo`, + Input data for training. Could be a ``dict``, :py:class:`Record`, filename, one of the data :doc:`/plugins/dffml_source`, or a filename with the extension being one of the data sources. """ - sources = _repos_to_sources(*args) + sources = _records_to_sources(*args) async with sources as sources, model as model: async with sources() as sctx, model() as mctx: return await mctx.train(sctx) async def accuracy( - model, *args: Union[BaseSource, Repo, Dict[str, Any]] + model, *args: Union[BaseSource, Record, Dict[str, Any]] ) -> float: """ Assess the accuracy of a machine learning model. @@ -81,7 +81,7 @@ async def accuracy( Machine Learning model to use. See :doc:`/plugins/dffml_model` for models options. *args : list - Input data for training. Could be a ``dict``, :py:class:`Repo`, + Input data for training. Could be a ``dict``, :py:class:`Record`, filename, one of the data :doc:`/plugins/dffml_source`, or a filename with the extension being one of the data sources. @@ -92,7 +92,7 @@ async def accuracy( correct prediction. For some models this has another meaning. 
Please see the documentation for the model your using for further details. """ - sources = _repos_to_sources(*args) + sources = _records_to_sources(*args) async with sources as sources, model as model: async with sources() as sctx, model() as mctx: return float(await mctx.accuracy(sctx)) @@ -100,9 +100,9 @@ async def accuracy( async def predict( model, - *args: Union[BaseSource, Repo, Dict[str, Any]], + *args: Union[BaseSource, Record, Dict[str, Any]], update: bool = False, - keep_repo: bool = False, + keep_record: bool = False, ): """ Make a prediction using a machine learning model. @@ -115,29 +115,29 @@ async def predict( Machine Learning model to use. See :doc:`/plugins/dffml_model` for models options. *args : list - Input data for prediction. Could be a ``dict``, :py:class:`Repo`, + Input data for prediction. Could be a ``dict``, :py:class:`Record`, filename, or one of the data :doc:`/plugins/dffml_source`. update : boolean, optional If ``True`` prediction data within records will be written back to all sources given. Defaults to ``False``. - keep_repo : boolean, optional - If ``True`` the results will be kept as their ``Repo`` objects instead - of being converted to a ``(repo.key, features, predictions)`` tuple. + keep_record : boolean, optional + If ``True`` the results will be kept as their ``Record`` objects instead + of being converted to a ``(record.key, features, predictions)`` tuple. Defaults to ``False``. Returns ------- asynciterator - ``Repo`` objects or ``(repo.key, features, predictions)`` tuple. + ``Record`` objects or ``(record.key, features, predictions)`` tuple. """ - sources = _repos_to_sources(*args) + sources = _records_to_sources(*args) async with sources as sources, model as model: async with sources() as sctx, model() as mctx: - async for repo in mctx.predict(sctx.repos()): - yield repo if keep_repo else ( - repo.key, - repo.features(), - repo.predictions(), + async for record in mctx.predict(sctx.records()): + yield record if keep_record else ( + record.key, + record.features(), + record.predictions(), ) if update: - await sctx.update(repo) + await sctx.update(record) diff --git a/dffml/model/model.py b/dffml/model/model.py index ec0a4afc08..f61021a07a 100644 --- a/dffml/model/model.py +++ b/dffml/model/model.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: MIT # Copyright (c) 2019 Intel Corporation """ -Model subclasses are responsible for training themselves on repos, making -predictions about the value of a feature in the repo, and assessing thei +Model subclasses are responsible for training themselves on records, making +predictions about the value of a feature in the record, and assessing thei prediction accuracy. """ import os @@ -14,7 +14,7 @@ BaseDataFlowFacilitatorObjectContext, BaseDataFlowFacilitatorObject, ) -from ..repo import Repo +from ..record import Record from ..source.source import Sources from ..feature import Features from .accuracy import Accuracy @@ -43,25 +43,27 @@ def __init__(self, parent: "Model") -> None: @abc.abstractmethod async def train(self, sources: Sources): """ - Train using repos as the data to learn from. + Train using records as the data to learn from. """ raise NotImplementedError() @abc.abstractmethod async def accuracy(self, sources: Sources) -> Accuracy: """ - Evaluates the accuracy of our model after training using the input repos + Evaluates the accuracy of our model after training using the input records as test data. 
""" raise NotImplementedError() @abc.abstractmethod - async def predict(self, repos: AsyncIterator[Repo]) -> AsyncIterator[Repo]: + async def predict( + self, records: AsyncIterator[Record] + ) -> AsyncIterator[Record]: """ - Uses trained data to make a prediction about the quality of a repo. + Uses trained data to make a prediction about the quality of a record. """ raise NotImplementedError() - yield (Repo(""), "", 0.0) # skipcq: PYL-W0101 + yield (Record(""), "", 0.0) # skipcq: PYL-W0101 @base_entry_point("dffml.model", "model") diff --git a/dffml/operation/model.py b/dffml/operation/model.py index 13f5d47009..30a26f43e4 100644 --- a/dffml/operation/model.py +++ b/dffml/operation/model.py @@ -1,6 +1,6 @@ from typing import Dict, Any -from ..repo import Repo +from ..record import Record from ..base import config from ..model import Model from ..df.types import Definition @@ -22,7 +22,7 @@ def __post_init__(self): name="dffml.model.predict", inputs={ "features": Definition( - name="repo_features", primitive="Dict[str, Any]" + name="record_features", primitive="Dict[str, Any]" ) }, outputs={ @@ -35,8 +35,8 @@ def __post_init__(self): ctx_enter={"mctx": (lambda self: self.parent.model())}, ) async def model_predict(self, features: Dict[str, Any]) -> Dict[str, Any]: - async def repos(): - yield Repo("", data={"features": features}) + async def records(): + yield Record("", data={"features": features}) - async for repo in self.mctx.predict(repos()): - return {"prediction": repo.predictions()} + async for record in self.mctx.predict(records()): + return {"prediction": record.predictions()} diff --git a/dffml/port/json.py b/dffml/port/json.py index 927c95449a..56d38c55cf 100644 --- a/dffml/port/json.py +++ b/dffml/port/json.py @@ -1,23 +1,25 @@ # SPDX-License-Identifier: MIT # Copyright (c) 2019 Intel Corporation """ -Ports repos in JSON format +Ports records in JSON format """ import json from .port import Port -from ..repo import Repo +from ..record import Record from ..source.source import BaseSourceContext class JSON(Port): """ - Imports and exports repos in JSON format + Imports and exports records in JSON format """ async def export_fd(self, sctx: BaseSourceContext, fd): - json.dump({repo.key: repo.dict() async for repo in sctx.repos()}, fd) + json.dump( + {record.key: record.dict() async for record in sctx.record()}, fd + ) async def import_fd(self, sctx: BaseSourceContext, fd): for key, data in json.load(fd): - await sctx.update(Repo(key, data=data)) + await sctx.update(Record(key, data=data)) diff --git a/dffml/port/port.py b/dffml/port/port.py index a834c13e54..77e37c9fc1 100644 --- a/dffml/port/port.py +++ b/dffml/port/port.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: MIT # Copyright (c) 2019 Intel Corporation """ -Port subclasses import and export repos. +Port subclasses import and export records. 
""" import abc @@ -11,7 +11,7 @@ class Port(abc.ABC, Entrypoint): """ - Port repos into the format the porter understands + Port records into the format the porter understands """ ENTRYPOINT = "dffml.port" @@ -19,13 +19,13 @@ class Port(abc.ABC, Entrypoint): @abc.abstractmethod async def export_fd(self, source: BaseSource, fd): """ - Export repos + Export records """ @abc.abstractmethod async def import_fd(self, source: BaseSource, fd): """ - Import repos + Import records """ async def export_to_file(self, source: BaseSource, filename: str): diff --git a/dffml/repo.py b/dffml/record.py similarity index 87% rename from dffml/repo.py rename to dffml/record.py index 1236d96756..03dd1db299 100644 --- a/dffml/repo.py +++ b/dffml/record.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: MIT # Copyright (c) 2019 Intel Corporation """ -Information on the software to evaluate is stored in a Repo instance. +Information on the software to evaluate is stored in a Record instance. """ from datetime import datetime from typing import Optional, List, Dict, Any @@ -9,14 +9,14 @@ from .util.data import merge from .log import LOGGER -LOGGER = LOGGER.getChild("repo") +LOGGER = LOGGER.getChild("record") class NoSuchFeature(KeyError): pass # pragma: no cov -class RepoPrediction(dict): +class RecordPrediction(dict): EXPORTED = ["value", "confidence"] @@ -48,7 +48,7 @@ def __bool__(self): __nonzero__ = __bool__ -class RepoData(object): +class RecordData(object): DATE_FORMAT = "%Y-%m-%dT%H:%M:%SZ" EXPORTED = ["key", "features", "prediction"] @@ -61,7 +61,7 @@ def __init__( prediction: Optional[Dict[str, Any]] = None, last_updated: Optional[datetime] = None, ) -> None: - # If the repo is not evaluated or predicted then don't report out a new + # If the record is not evaluated or predicted then don't report out a new # value for last_updated self.last_updated_default = datetime.now() if key is None: @@ -75,7 +75,7 @@ def __init__( if isinstance(last_updated, str): last_updated = datetime.strptime(last_updated, self.DATE_FORMAT) for _key, _val in prediction.items(): - prediction[_key] = RepoPrediction(**_val) + prediction[_key] = RecordPrediction(**_val) self.key = key self.features = features self.prediction = prediction @@ -101,12 +101,12 @@ def __repr__(self): return str(self.dict()) -class Repo(object): +class Record(object): """ - Manages feature independent information and actions for a repo. + Manages feature independent information and actions for a record. """ - REPO_DATA = RepoData + RECORD_DATA = RecordData def __init__( self, @@ -125,7 +125,7 @@ def __init__( data["extra"].update(extra) extra = data["extra"] del data["extra"] - self.data = self.REPO_DATA(**data) + self.data = self.RECORD_DATA(**data) self.extra = extra def dict(self): @@ -169,11 +169,11 @@ def __str__(self): ) ).rstrip() - def merge(self, repo: "Repo"): + def merge(self, record: "Record"): data = self.data.dict() - merge(data, repo.data.dict()) - self.data = self.REPO_DATA(**data) - self.extra.update(repo.extra) # type: ignore + merge(data, record.data.dict()) + self.data = self.RECORD_DATA(**data) + self.extra.update(record.extra) # type: ignore @property def key(self) -> str: @@ -192,7 +192,7 @@ def evaluated(self, results: Dict[str, Any], overwrite=False): def features(self, subset: List[str] = []) -> Dict[str, Any]: """ - Returns all features for the repo or the subset specified. + Returns all features for the record or the subset specified. 
""" if not subset: return self.data.features @@ -206,7 +206,7 @@ def features(self, subset: List[str] = []) -> Dict[str, Any]: def feature(self, name: str) -> Any: """ - Returns a feature of the repo. + Returns a feature of the record. """ if name not in self.data.features: raise NoSuchFeature(name) @@ -214,16 +214,16 @@ def feature(self, name: str) -> Any: def predicted(self, target: str, value: Any, confidence: float): """ - Set the prediction for this repo + Set the prediction for this record """ - self.data.prediction[target] = RepoPrediction( + self.data.prediction[target] = RecordPrediction( value=value, confidence=float(confidence) ) self.data.last_updated = datetime.now() - def prediction(self, target: str) -> RepoPrediction: + def prediction(self, target: str) -> RecordPrediction: """ - Get the prediction for this repo + Get the prediction for this record """ return self.data.prediction[target] diff --git a/dffml/skel/model/REPLACE_IMPORT_PACKAGE_NAME/misc.py b/dffml/skel/model/REPLACE_IMPORT_PACKAGE_NAME/misc.py index 7027e3b8d7..07bfac3604 100644 --- a/dffml/skel/model/REPLACE_IMPORT_PACKAGE_NAME/misc.py +++ b/dffml/skel/model/REPLACE_IMPORT_PACKAGE_NAME/misc.py @@ -5,7 +5,7 @@ """ from typing import AsyncIterator, Tuple, Any, List -from dffml.repo import Repo +from dffml.record import Record from dffml.source.source import Sources from dffml.feature import Features from dffml.model.accuracy import Accuracy @@ -30,27 +30,27 @@ class MiscModelContext(ModelContext): async def train(self, sources: Sources): """ - Train using repos as the data to learn from. + Train using records as the data to learn from. """ pass async def accuracy(self, sources: Sources) -> Accuracy: """ - Evaluates the accuracy of our model after training using the input repos + Evaluates the accuracy of our model after training using the input records as test data. """ # Lies return 1.0 async def predict( - self, repos: AsyncIterator[Repo] - ) -> AsyncIterator[Tuple[Repo, Any, float]]: + self, records: AsyncIterator[Record] + ) -> AsyncIterator[Tuple[Record, Any, float]]: """ - Uses trained data to make a prediction about the quality of a repo. + Uses trained data to make a prediction about the quality of a record. 
""" - async for repo in repos: - yield repo, self.parent.config.classifications[ - repo.feature(self.parent.config.features.names()[0]) + async for record in records: + yield record, self.parent.config.classifications[ + record.feature(self.parent.config.features.names()[0]) ], 1.0 diff --git a/dffml/skel/model/tests/test_model.py b/dffml/skel/model/tests/test_model.py index 385ec7159e..f0ee9ddc0e 100644 --- a/dffml/skel/model/tests/test_model.py +++ b/dffml/skel/model/tests/test_model.py @@ -2,7 +2,7 @@ import tempfile from typing import Type -from dffml.repo import Repo, RepoData +from dffml.record import Record, RecordData from dffml.source.source import Sources from dffml.source.memory import MemorySource, MemorySourceConfig from dffml.feature import Data, Feature, Features @@ -38,22 +38,22 @@ def setUpClass(cls): features=cls.features, ) ) - cls.repos = [ - Repo( + cls.records = [ + Record( "a" + str(random.random()), data={"features": {cls.feature.NAME: 1, "string": "a"}}, ) for _ in range(0, 1000) ] - cls.repos += [ - Repo( + cls.records += [ + Record( "b" + str(random.random()), data={"features": {cls.feature.NAME: 0, "string": "not a"}}, ) for _ in range(0, 1000) ] cls.sources = Sources( - MemorySource(MemorySourceConfig(repos=cls.repos)) + MemorySource(MemorySourceConfig(records=cls.records)) ) @classmethod @@ -72,17 +72,17 @@ async def test_01_accuracy(self): self.assertGreater(res, 0.9) async def test_02_predict(self): - a = Repo("a", data={"features": {self.feature.NAME: 1}}) - b = Repo("not a", data={"features": {self.feature.NAME: 0}}) + a = Record("a", data={"features": {self.feature.NAME: 1}}) + b = Record("not a", data={"features": {self.feature.NAME: 0}}) async with Sources( - MemorySource(MemorySourceConfig(repos=[a, b])) + MemorySource(MemorySourceConfig(records=[a, b])) ) as sources, self.model as model: async with sources() as sctx, model() as mctx: num = 0 - async for repo, prediction, confidence in mctx.predict( - sctx.repos() + async for record, prediction, confidence in mctx.predict( + sctx.records() ): - with self.subTest(repo=repo): - self.assertEqual(prediction, repo.key) + with self.subTest(record=record): + self.assertEqual(prediction, record.key) num += 1 self.assertEqual(num, 2) diff --git a/dffml/skel/source/REPLACE_IMPORT_PACKAGE_NAME/misc.py b/dffml/skel/source/REPLACE_IMPORT_PACKAGE_NAME/misc.py index 604d544cc2..f5cc704eaa 100644 --- a/dffml/skel/source/REPLACE_IMPORT_PACKAGE_NAME/misc.py +++ b/dffml/skel/source/REPLACE_IMPORT_PACKAGE_NAME/misc.py @@ -1,7 +1,7 @@ from typing import AsyncIterator, Dict, List from dffml.base import BaseConfig -from dffml.repo import Repo +from dffml.record import Record from dffml.source.source import BaseSourceContext, BaseSource from dffml.util.cli.arg import Arg from dffml.util.entrypoint import entrypoint @@ -10,34 +10,34 @@ @config class MiscSourceConfig: - repos: List[Repo] + records: List[Record] class MiscSourceContext(BaseSourceContext): - async def update(self, repo): - self.parent.mem[repo.key] = repo + async def update(self, record): + self.parent.mem[record.key] = record - async def repos(self) -> AsyncIterator[Repo]: - for repo in self.parent.mem.values(): - yield repo + async def records(self) -> AsyncIterator[Record]: + for record in self.parent.mem.values(): + yield record - async def repo(self, key: str) -> Repo: - return self.parent.mem.get(key, Repo(key)) + async def record(self, key: str) -> Record: + return self.parent.mem.get(key, Record(key)) @entrypoint("misc") class MiscSource(BaseSource): """ 
- Stores repos ... somewhere! (skeleton template is in memory) + Stores records ... somewhere! (skeleton template is in memory) """ CONTEXT = MiscSourceContext def __init__(self, config: BaseConfig) -> None: super().__init__(config) - self.mem: Dict[str, Repo] = {} + self.mem: Dict[str, Record] = {} if isinstance(self.config, MiscSourceConfig): - self.mem = {repo.key: repo for repo in self.config.repos} + self.mem = {record.key: record for record in self.config.records} @classmethod def args(cls, args, *above) -> Dict[str, Arg]: @@ -49,5 +49,5 @@ def args(cls, args, *above) -> Dict[str, Arg]: @classmethod def config(cls, config, *above): return MiscSourceConfig( - repos=list(map(Repo, cls.config_get(config, above, "keys"))) + records=list(map(Record, cls.config_get(config, above, "keys"))) ) diff --git a/dffml/skel/source/tests/test_source.py b/dffml/skel/source/tests/test_source.py index c8de471a49..b712accc65 100644 --- a/dffml/skel/source/tests/test_source.py +++ b/dffml/skel/source/tests/test_source.py @@ -6,4 +6,4 @@ class TestMiscSource(SourceTest, AsyncTestCase): async def setUpSource(self): - return MiscSource(MiscSourceConfig(repos=[])) + return MiscSource(MiscSourceConfig(records=[])) diff --git a/dffml/source/csv.py b/dffml/source/csv.py index dd10f2bc05..c8558443f4 100644 --- a/dffml/source/csv.py +++ b/dffml/source/csv.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: MIT # Copyright (c) 2019 Intel Corporation """ -Loads repos from a csv file, using columns as features +Loads records from a csv file, using columns as features """ import csv import ast @@ -11,7 +11,7 @@ from dataclasses import dataclass from contextlib import asynccontextmanager -from ..repo import Repo +from ..record import Record from .memory import MemorySource from .file import FileSource, FileSourceConfig from ..base import config @@ -54,12 +54,12 @@ class CSVSourceConfig(FileSourceConfig): @entrypoint("csv") class CSVSource(FileSource, MemorySource): """ - Uses a CSV file as the source of repo feature data + Uses a CSV file as the source of record feature data """ CONFIG = CSVSourceConfig - # Headers we've added to track data other than feature data for a repo + # Headers we've added to track data other than feature data for a record CSV_HEADERS = ["prediction", "confidence"] OPEN_CSV_FILES: Dict[str, OpenCSVFile] = {} @@ -92,7 +92,7 @@ async def read_csv(self, fd, open_file): open_file.write_back_key = False if self.config.tagcol in dict_reader.fieldnames: open_file.write_back_tag = True - # Store all the repos by their tag in write_out + # Store all the records by their tag in write_out open_file.write_out = {} # If there is no key track row index to be used as key by tag index = {} @@ -108,9 +108,9 @@ async def read_csv(self, fd, open_file): del row[self.config.key] else: index[tag] += 1 - # Repo data we are going to parse from this row (must include + # Record data we are going to parse from this row (must include # features). 
- repo_data = {} + record_data = {} # Parse headers we as the CSV source added csv_meta = {} row_keys = [] @@ -139,7 +139,7 @@ async def read_csv(self, fd, open_file): except (SyntaxError, ValueError): features[_key] = _value if features: - repo_data["features"] = features + record_data["features"] = features # Getting all prediction target names target_keys = filter( @@ -156,17 +156,17 @@ async def read_csv(self, fd, open_file): } for target_name in target_keys } - repo_data.update({"prediction": predictions}) + record_data.update({"prediction": predictions}) # If there was no data in the row, skip it - if not repo_data and key == str(index[tag] - 1): + if not record_data and key == str(index[tag] - 1): continue - # Add the repo to our internal memory representation + # Add the record to our internal memory representation open_file.write_out.setdefault(tag, {}) - open_file.write_out[tag][key] = Repo(key, data=repo_data) + open_file.write_out[tag][key] = Record(key, data=record_data) async def load_fd(self, fd): """ - Parses a CSV stream into Repo instances + Parses a CSV stream into Record instances """ async with self._open_csv(fd) as open_file: self.mem = open_file.write_out.get(self.config.tag, {}) @@ -191,10 +191,10 @@ async def dump_fd(self, fd): # Get all the feature names feature_fieldnames = set() prediction_fieldnames = set() - for tag, repos in open_file.write_out.items(): - for repo in repos.values(): - feature_fieldnames |= set(repo.data.features.keys()) - prediction_fieldnames |= set(repo.data.prediction.keys()) + for tag, records in open_file.write_out.items(): + for record in records.values(): + feature_fieldnames |= set(record.data.features.keys()) + prediction_fieldnames |= set(record.data.prediction.keys()) fieldnames += list(feature_fieldnames) fieldnames += itertools.chain( *list( @@ -208,21 +208,21 @@ async def dump_fd(self, fd): # Write out the file writer = csv.DictWriter(fd, fieldnames=fieldnames) writer.writeheader() - for tag, repos in open_file.write_out.items(): - for repo in repos.values(): - repo_data = repo.dict() + for tag, records in open_file.write_out.items(): + for record in records.values(): + record_data = record.dict() row = {name: "" for name in fieldnames} # Always write the tag row[self.config.tagcol] = tag # Write the key if it existed if open_file.write_back_key: - row[self.config.key] = repo.key + row[self.config.key] = record.key # Write the features - for key, value in repo_data.get("features", {}).items(): + for key, value in record_data.get("features", {}).items(): row[key] = value # Write the prediction - if "prediction" in repo_data: - for key, value in repo_data["prediction"].items(): + if "prediction" in record_data: + for key, value in record_data["prediction"].items(): row["prediction_" + key] = value["value"] row["confidence_" + key] = value["confidence"] writer.writerow(row) diff --git a/dffml/source/idx1.py b/dffml/source/idx1.py index e5dc2d3024..8e2c417610 100644 --- a/dffml/source/idx1.py +++ b/dffml/source/idx1.py @@ -1,11 +1,11 @@ # SPDX-License-Identifier: MIT # Copyright (c) 2019 Intel Corporation """ -Loads repos from an IDX1 file +Loads records from an IDX1 file """ import struct -from ..repo import Repo +from ..record import Record from ..base import config, field from .memory import MemorySource from .file import BinaryFileSource @@ -39,7 +39,7 @@ async def load_fd(self, xfile): # Reading the rest of binary datafile one byte at a time self.mem = {} for i in range(size): - self.mem[str(i)] = Repo( + self.mem[str(i)] = 
Record( str(i), data={ "features": { diff --git a/dffml/source/idx3.py b/dffml/source/idx3.py index fea1565720..dc681a6b7c 100644 --- a/dffml/source/idx3.py +++ b/dffml/source/idx3.py @@ -1,11 +1,11 @@ # SPDX-License-Identifier: MIT # Copyright (c) 2019 Intel Corporation """ -Loads repos from an IDX3 file +Loads records from an IDX3 file """ import struct -from ..repo import Repo +from ..record import Record from ..util.entrypoint import entrypoint from .idx1 import IDX1Source, IDXSourceConfig @@ -30,7 +30,7 @@ async def load_fd(self, xfile): self.mem = {} inner_array_size = nrows * ncols for i in range(0, size): - self.mem[str(i)] = Repo( + self.mem[str(i)] = Record( str(i), data={ "features": { diff --git a/dffml/source/json.py b/dffml/source/json.py index 0b531a3b29..f283e86eac 100644 --- a/dffml/source/json.py +++ b/dffml/source/json.py @@ -6,7 +6,7 @@ from contextlib import asynccontextmanager from typing import Dict -from ..repo import Repo +from ..record import Record from .memory import MemorySource from .file import FileSource, FileSourceConfig from ..util.entrypoint import entrypoint @@ -70,22 +70,22 @@ async def _empty_file_init(self): async def load_fd(self, fd): async with self._open_json(fd): - repos = self.OPEN_JSON_FILES[self.config.filename].data + records = self.OPEN_JSON_FILES[self.config.filename].data self.mem = { - key: Repo(key, data=data) - for key, data in repos.get(self.config.tag, {}).items() + key: Record(key, data=data) + for key, data in records.get(self.config.tag, {}).items() } LOGGER.debug("%r loaded %d records", self, len(self.mem)) async def dump_fd(self, fd): async with self.OPEN_JSON_FILES_LOCK: - repos = self.OPEN_JSON_FILES[self.config.filename].data - repos[self.config.tag] = { - repo.key: repo.dict() for repo in self.mem.values() + records = self.OPEN_JSON_FILES[self.config.filename].data + records[self.config.tag] = { + record.key: record.dict() for record in self.mem.values() } self.logger.debug(f"{self.config.filename} updated") if await self.OPEN_JSON_FILES[self.config.filename].dec(): del self.OPEN_JSON_FILES[self.config.filename] - json.dump(repos, fd) + json.dump(records, fd) self.logger.debug(f"{self.config.filename} written") LOGGER.debug("%r saved %d records", self, len(self.mem)) diff --git a/dffml/source/memory.py b/dffml/source/memory.py index 99ad353ef5..7c5295cad5 100644 --- a/dffml/source/memory.py +++ b/dffml/source/memory.py @@ -6,32 +6,32 @@ from typing import Dict, List, AsyncIterator from ..base import config -from ..repo import Repo +from ..record import Record from .source import BaseSourceContext, BaseSource from ..util.entrypoint import entrypoint class MemorySourceContext(BaseSourceContext): - async def update(self, repo): - self.parent.mem[repo.key] = repo + async def update(self, record): + self.parent.mem[record.key] = record - async def repos(self) -> AsyncIterator[Repo]: - for repo in self.parent.mem.values(): - yield repo + async def records(self) -> AsyncIterator[Record]: + for record in self.parent.mem.values(): + yield record - async def repo(self, key: str) -> Repo: - return self.parent.mem.get(key, Repo(key)) + async def record(self, key: str) -> Record: + return self.parent.mem.get(key, Record(key)) @config class MemorySourceConfig: - repos: List[Repo] + records: List[Record] @entrypoint("memory") class MemorySource(BaseSource): """ - Stores repos in a dict in memory + Stores records in a dict in memory """ CONFIG = MemorySourceConfig @@ -39,6 +39,6 @@ class MemorySource(BaseSource): def __init__(self, config: 
MemorySourceConfig) -> None: super().__init__(config) - self.mem: Dict[str, Repo] = {} + self.mem: Dict[str, Record] = {} if isinstance(self.config, MemorySourceConfig): - self.mem = {repo.key: repo for repo in self.config.repos} + self.mem = {record.key: record for record in self.config.records} diff --git a/dffml/source/source.py b/dffml/source/source.py index 79fd9de3a5..79ea0c8bb4 100644 --- a/dffml/source/source.py +++ b/dffml/source/source.py @@ -11,7 +11,7 @@ BaseDataFlowFacilitatorObjectContext, BaseDataFlowFacilitatorObject, ) -from ..repo import Repo +from ..record import Record from ..util.asynchelper import ( AsyncContextManagerListContext, AsyncContextManagerList, @@ -26,23 +26,23 @@ def __init__(self, parent: "BaseSource") -> None: self.parent = parent @abc.abstractmethod - async def update(self, repo: Repo): + async def update(self, record: Record): """ - Updates a repo for a source + Updates a record for a source """ @abc.abstractmethod - async def repos(self) -> AsyncIterator[Repo]: + async def records(self) -> AsyncIterator[Record]: """ - Returns a list of repos retrieved from self.src + Returns a list of records retrieved from self.src """ - # mypy ignores AsyncIterator[Repo], therefore this is needed - yield Repo("") # pragma: no cover + # mypy ignores AsyncIterator[Record], therefore this is needed + yield Record("") # pragma: no cover @abc.abstractmethod - async def repo(self, key: str): + async def record(self, key: str): """ - Get a repo from the source or add it if it doesn't exist + Get a record from the source or add it if it doesn't exist """ @@ -50,7 +50,7 @@ async def repo(self, key: str): class BaseSource(BaseDataFlowFacilitatorObject): """ Abstract base class for all sources. New sources must be derived from this - class and implement the repos method. + class and implement the records method. 
""" def __call__(self) -> BaseSourceContext: @@ -58,47 +58,49 @@ def __call__(self) -> BaseSourceContext: class SourcesContext(AsyncContextManagerListContext): - async def update(self, repo: Repo): + async def update(self, record: Record): """ - Updates a repo for a source + Updates a record for a source """ - LOGGER.debug("Updating %r: %r", repo.key, repo.dict()) + LOGGER.debug("Updating %r: %r", record.key, record.dict()) for source in self: - await source.update(repo) + await source.update(record) - async def repos( - self, validation: Optional[Callable[[Repo], bool]] = None - ) -> AsyncIterator[Repo]: + async def records( + self, validation: Optional[Callable[[Record], bool]] = None + ) -> AsyncIterator[Record]: """ - Retrieves repos from all sources + Retrieves records from all sources """ for source in self: - async for repo in source.repos(): + async for record in source.records(): # NOTE In Python 3.7.3 self[1:] works, however in Python > # 3.7.3 only self.data works for other_source in self.data[1:]: - repo.merge(await other_source.repo(repo.key)) - if validation is None or validation(repo): - yield repo + record.merge(await other_source.record(record.key)) + if validation is None or validation(record): + yield record break - async def repo(self, key: str): + async def record(self, key: str): """ - Retrieve and or register repo will all sources + Retrieve and or register record will all sources """ - repo = Repo(key) + record = Record(key) for source in self: - repo.merge(await source.repo(key)) - return repo + record.merge(await source.record(key)) + return record - async def with_features(self, features: List[str]) -> AsyncIterator[Repo]: + async def with_features( + self, features: List[str] + ) -> AsyncIterator[Record]: """ - Returns all repos which have the requested features + Returns all records which have the requested features """ - async for repo in self.repos( - lambda repo: bool(repo.features(features)) + async for record in self.records( + lambda record: bool(record.features(features)) ): - yield repo + yield record class Sources(AsyncContextManagerList): @@ -107,26 +109,26 @@ class Sources(AsyncContextManagerList): class ValidationSourcesContext(SourcesContext): - async def repos( - self, validation: Optional[Callable[[Repo], bool]] = None - ) -> AsyncIterator[Repo]: - async for repo in super().repos(): - if self.parent.validation(repo) and ( - validation is None or validation(repo) + async def records( + self, validation: Optional[Callable[[Record], bool]] = None + ) -> AsyncIterator[Record]: + async for record in super().records(): + if self.parent.validation(record) and ( + validation is None or validation(record) ): - yield repo + yield record class ValidationSources(Sources): """ - Restricts access to a subset of repos during iteration based on a validation + Restricts access to a subset of records during iteration based on a validation function. """ CONTEXT = ValidationSourcesContext def __init__( - self, validation: Callable[[Repo], bool], *args: BaseSource + self, validation: Callable[[Record], bool], *args: BaseSource ) -> None: super().__init__(*args) self.validation = validation @@ -134,7 +136,7 @@ def __init__( class SubsetSources(ValidationSources): """ - Restricts access to a subset of repos during iteration based on their keys. + Restricts access to a subset of records during iteration based on their keys. 
""" def __init__( @@ -145,5 +147,5 @@ def __init__( keys = [] self.keys = keys - def __validation(self, repo: Repo) -> bool: - return bool(repo.key in self.keys) + def __validation(self, record: Record) -> bool: + return bool(record.key in self.keys) diff --git a/dffml/util/cli/cmd.py b/dffml/util/cli/cmd.py index 69984d2fb4..b28a779c7b 100644 --- a/dffml/util/cli/cmd.py +++ b/dffml/util/cli/cmd.py @@ -9,7 +9,7 @@ import argparse from typing import Dict, Any -from ...repo import Repo +from ...record import Record from ...feature import Feature from .arg import Arg, parse_unknown @@ -32,7 +32,7 @@ class JSONEncoder(json.JSONEncoder): def default(self, obj): typename_lower = str(type(obj)).lower() - if isinstance(obj, Repo): + if isinstance(obj, Record): return obj.dict() elif isinstance(obj, Feature): return obj.NAME diff --git a/dffml/util/testing/source.py b/dffml/util/testing/source.py index e4c8edb5a5..ccece22b5b 100644 --- a/dffml/util/testing/source.py +++ b/dffml/util/testing/source.py @@ -5,7 +5,7 @@ import random import tempfile -from ...repo import Repo, RepoPrediction +from ...record import Record, RecordPrediction from ..asynctestcase import AsyncTestCase @@ -20,7 +20,7 @@ class SourceTest(abc.ABC): >>> from dffml.util.asynctestcase import AsyncTestCase >>> class TestCustomSQliteSource(SourceTest, AsyncTestCase): >>> async def setUpSource(self): - >>> return MemorySource(MemorySourceConfig(repos=[Repo('a')])) + >>> return MemorySource(MemorySourceConfig(records=[Record('a')])) """ @abc.abstractmethod @@ -30,7 +30,7 @@ async def setUpSource(self, fileobj): async def test_update(self): full_key = "0" empty_key = "1" - full_repo = Repo( + full_record = Record( full_key, data={ "features": { @@ -40,13 +40,13 @@ async def test_update(self): "SepalWidth": 2.7, }, "prediction": { - "target_name": RepoPrediction( + "target_name": RecordPrediction( value="feedface", confidence=0.42 ) }, }, ) - empty_repo = Repo( + empty_record = Record( empty_key, data={ "features": { @@ -62,41 +62,43 @@ async def test_update(self): async with source as testSource: # Open, update, and close async with testSource() as sourceContext: - await sourceContext.update(full_repo) - await sourceContext.update(empty_repo) + await sourceContext.update(full_record) + await sourceContext.update(empty_record) async with source as testSource: # Open and confirm we saved and loaded correctly async with testSource() as sourceContext: with self.subTest(key=full_key): - repo = await sourceContext.repo(full_key) + record = await sourceContext.record(full_key) self.assertEqual( - repo.data.prediction["target_name"]["value"], + record.data.prediction["target_name"]["value"], "feedface", ) self.assertEqual( - repo.data.prediction["target_name"]["confidence"], 0.42 + record.data.prediction["target_name"]["confidence"], + 0.42, ) with self.subTest(key=empty_key): - repo = await sourceContext.repo(empty_key) + record = await sourceContext.record(empty_key) self.assertEqual( [ val["value"] - for _, val in repo.data.prediction.items() + for _, val in record.data.prediction.items() ], - ["undetermined"] * (len(repo.data.prediction)), + ["undetermined"] * (len(record.data.prediction)), ) with self.subTest(both=[full_key, empty_key]): - repos = { - repo.key: repo async for repo in sourceContext.repos() + records = { + record.key: record + async for record in sourceContext.records() } - self.assertIn(full_key, repos) - self.assertIn(empty_key, repos) + self.assertIn(full_key, records) + self.assertIn(empty_key, records) 
                     self.assertEqual(
-                        repos[full_key].features(), full_repo.features()
+                        records[full_key].features(), full_record.features()
                     )
                     self.assertEqual(
-                        repos[empty_key].features(), empty_repo.features()
+                        records[empty_key].features(), empty_record.features()
                     )
@@ -134,14 +136,14 @@ async def test_tag(self):
         async with untagged, tagged:
             async with untagged() as uctx, tagged() as lctx:
                 await uctx.update(
-                    Repo("0", data={"features": {"feed": 1}})
+                    Record("0", data={"features": {"feed": 1}})
                 )
                 await lctx.update(
-                    Repo("0", data={"features": {"face": 2}})
+                    Record("0", data={"features": {"face": 2}})
                 )
         async with untagged, tagged:
             async with untagged() as uctx, tagged() as lctx:
-                repo = await uctx.repo("0")
-                self.assertIn("feed", repo.features())
-                repo = await lctx.repo("0")
-                self.assertIn("face", repo.features())
+                record = await uctx.record("0")
+                self.assertIn("feed", record.features())
+                record = await lctx.record("0")
+                self.assertIn("face", record.features())
diff --git a/docs/api/index.rst b/docs/api/index.rst
index ae60c0603f..23bc3d883c 100644
--- a/docs/api/index.rst
+++ b/docs/api/index.rst
@@ -10,7 +10,7 @@ API Reference
     base
     df/index
     feature
-    repo
+    record
     model/index
     source/index
     db/index
diff --git a/docs/api/repo.rst b/docs/api/record.rst
similarity index 61%
rename from docs/api/repo.rst
rename to docs/api/record.rst
index a1c3aa47e6..143cf5eddd 100644
--- a/docs/api/repo.rst
+++ b/docs/api/record.rst
@@ -1,7 +1,7 @@
-Repo
-====
+Record
+======
 
 Repository of information about a unique key.
 
-.. autoclass:: dffml.repo.Repo
+.. autoclass:: dffml.record.Record
     :members:
diff --git a/docs/api/source/file.rst b/docs/api/source/file.rst
index 26c2a6464f..6ff58c4d07 100644
--- a/docs/api/source/file.rst
+++ b/docs/api/source/file.rst
@@ -15,9 +15,9 @@ Supported Compressions:
 DFFML supports data sources in `CSV` and `JSON` formats with respective methods
 defined in the following classes:
 
-1. `CSVSource`: (Inherits FileSource and MemorySource) Uses a CSV file as the source of the repo feature data. Abstract functions `load_fd` and `dump_fd` are defined.
+1. `CSVSource`: (Inherits FileSource and MemorySource) Uses a CSV file as the source of the record feature data. Abstract functions `load_fd` and `dump_fd` are defined.
 
-2. `JSONSource`: (Inherits FileSource and MemorySource) Uses a JSON file as the source of the repo feature data. Abstract functions `load_fd` and `dump_fd` are defined.
+2. `JSONSource`: (Inherits FileSource and MemorySource) Uses a JSON file as the source of the record feature data. Abstract functions `load_fd` and `dump_fd` are defined.
 
 JSON
 ----
diff --git a/docs/api/source/index.rst b/docs/api/source/index.rst
index 9d32b4d792..75c9df8538 100644
--- a/docs/api/source/index.rst
+++ b/docs/api/source/index.rst
@@ -5,10 +5,10 @@ Sources are how DFFML abstracts storage of feature data. This allows users to
 swap out their data sources as they progress from testing through integration.
 
 Most DFFML CLI commands work with the :class:`dffml.source.source.Sources` class
-which merges the feature data of Repos with the same ``key``. This means
+which merges the feature data of Records with the same ``key``. This means
 when multiple sources are given to those CLI commands, feature data stored in
 those various sources/databases under the same unique key will automatically
-accessible within one :class:`dffml.repo.Repo`.
+be accessible within one :class:`dffml.record.Record`.
 
 DFFML has several built in sources which can be used programmatically or via
 the CLI and other services.
diff --git a/docs/api/source/memory.rst b/docs/api/source/memory.rst index ee78b48af3..d99ae7be7f 100644 --- a/docs/api/source/memory.rst +++ b/docs/api/source/memory.rst @@ -1,7 +1,7 @@ Memory Source ============= -Source to store repos in a dictionary. +Source to store records in a dictionary. .. automodule:: dffml.source.memory :members: diff --git a/docs/cli.rst b/docs/cli.rst index 308e17efc4..ad7778b3d5 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -52,9 +52,9 @@ scraping tasks. .. code-block:: console - $ dffml dataflow run repos set \ + $ dffml dataflow run records set \ -keys https://github.com/intel/dffml \ - -repo-def URL \ + -record-def URL \ -dataflow df.yaml \ -sources gathered=json \ -source-filename /tmp/data.json \ diff --git a/docs/contributing/codebase.rst b/docs/contributing/codebase.rst index ecdbf59a19..887bdb4231 100644 --- a/docs/contributing/codebase.rst +++ b/docs/contributing/codebase.rst @@ -74,7 +74,7 @@ that with a database, client HTTP sessions, etc. .. code-block:: python import asyncio - from dffml.repo import Repo + from dffml.record import Record from dffml.source.csv import CSVSource, CSVSourceConfig async def main(): @@ -89,7 +89,7 @@ that with a database, client HTTP sessions, etc. # Two async with source() as sctx: # Punch - await sctx.update(Repo("0", data={ + await sctx.update(Record("0", data={ "features": { "first_column": 42, "second_column": 1776, diff --git a/docs/contributing/style.rst b/docs/contributing/style.rst index 7e6ed9e847..31a3be126c 100644 --- a/docs/contributing/style.rst +++ b/docs/contributing/style.rst @@ -30,7 +30,7 @@ Here's an example of how we style our imports. import sklearn.datasets - from dffml.repo import Repo + from dffml.record import Record from dffml.config.config import BaseConfigLoader from dffml.util.asynctestcase import AsyncTestCase, IntegrationCLITestCase diff --git a/docs/contributing/testing.rst b/docs/contributing/testing.rst index 73ce93344e..7bb4a02f12 100644 --- a/docs/contributing/testing.rst +++ b/docs/contributing/testing.rst @@ -15,7 +15,7 @@ To run a specific test, use the ``-s`` flag. .. code-block:: console - $ python3.7 setup.py test -s tests.test_cli.TestPredict.test_repo + $ python3.7 setup.py test -s tests.test_cli.TestPredict.test_record Debug Logging ------------- diff --git a/docs/plugins/dffml_model.rst b/docs/plugins/dffml_model.rst index 4570ce78b3..4bd94b41e4 100644 --- a/docs/plugins/dffml_model.rst +++ b/docs/plugins/dffml_model.rst @@ -149,12 +149,12 @@ Implemented using Tensorflow's DNNClassifier. - batchsize: Integer - default: 20 - - Number repos to pass through in an epoch + - Number records to pass through in an epoch - shuffle: Boolean - default: True - - Randomise order of repos in a batch + - Randomise order of records in a batch - steps: Integer @@ -164,7 +164,7 @@ Implemented using Tensorflow's DNNClassifier. - epochs: Integer - default: 30 - - Number of iterations to pass over all repos in a source + - Number of iterations to pass over all records in a source - directory: String @@ -285,7 +285,7 @@ predict). - epochs: Integer - default: 30 - - Number of iterations to pass over all repos in a source + - Number of iterations to pass over all records in a source - directory: String @@ -463,7 +463,7 @@ Implemented using Tensorflow hub pretrained models. 
 - epochs: Integer
 
   - default: 10
-  - Number of iterations to pass over all repos in a source
+  - Number of iterations to pass over all records in a source
 
 - directory: String
 
diff --git a/docs/plugins/dffml_operation.rst b/docs/plugins/dffml_operation.rst
index d06e2dee52..5e9355b95f 100644
--- a/docs/plugins/dffml_operation.rst
+++ b/docs/plugins/dffml_operation.rst
@@ -112,7 +112,7 @@ No description
 
 **Inputs**
 
-- features: repo_features(type: Dict[str, Any])
+- features: record_features(type: Dict[str, Any])
 
 **Outputs**
 
diff --git a/docs/plugins/dffml_source.rst b/docs/plugins/dffml_source.rst
index 4ddffe495d..4d9be18446 100644
--- a/docs/plugins/dffml_source.rst
+++ b/docs/plugins/dffml_source.rst
@@ -22,7 +22,7 @@ csv
 
 *Core*
 
-Uses a CSV file as the source of repo feature data
+Uses a CSV file as the source of record feature data
 
 **Args**
 
@@ -123,11 +123,11 @@ memory
 
 *Core*
 
-Stores repos in a dict in memory
+Stores records in a dict in memory
 
 **Args**
 
-- repos: List of repos
+- records: List of records
 
 dffml_source_mysql
 ------------------
@@ -160,17 +160,17 @@ No description
 
 - db: String
 
-- repos-query: String
-  - SELECT `key` as key, data_1 as feature_1, data_2 as feature_2 FROM repo_data
+- records-query: String
+  - SELECT `key` as key, data_1 as feature_1, data_2 as feature_2 FROM record_data
 
-- repo-query: String
-  - SELECT `key` as key, data_1 as feature_1, data_2 as feature_2 FROM repo_data WHERE `key`=%s
+- record-query: String
+  - SELECT `key` as key, data_1 as feature_1, data_2 as feature_2 FROM record_data WHERE `key`=%s
 
 - update-query: String
 
-  - INSERT INTO repo_data (`key`, data_1, data_2) VALUES(%s, %s, %s) ON DUPLICATE KEY UPDATE data_1 = %s, data_2=%s
+  - INSERT INTO record_data (`key`, data_1, data_2) VALUES(%s, %s, %s) ON DUPLICATE KEY UPDATE data_1 = %s, data_2=%s
 
 - model-columns: List of strings
 
diff --git a/docs/tutorials/intro.rst b/docs/tutorials/intro.rst
index 125ff8c175..45e385ada0 100644
--- a/docs/tutorials/intro.rst
+++ b/docs/tutorials/intro.rst
@@ -31,28 +31,19 @@ By providing a generic abstraction around where data is being saved and stored,
 model implementations can access the data via the same API no matter where it
 is.
 
-Repos
-~~~~~
+Records
+~~~~~~~
 
-A common construct within DFFML is the ``Repo``. A ``Repo`` object is a
-repository of information associated with a unique key. The ``Repo`` holds all
+A common construct within DFFML is the ``Record``. A ``Record`` object is a
+repository of information associated with a unique key. The ``Record`` holds all
 the data associated with that key.
 
-.. note::
-
-    DFFML started as a project centred around Git repos. As such, the unique key
-    for a Git repo was the upstream source URL. ``src_url`` was used as the
-    parameter to hold the unique key.
-
-    Now as DFFML has become more general purpose all instances of ``Repo.src_url`` have
-    been changed to ``Repo.key`` `#233 `_.
-
 Say for instance you generated a dataset that had to do with cities. Your
 unique key might be the name of the city, the state or province it's in, and
 the country it's in. For example: ``Portland, OR, USA``.
 
-The data associated with a ``Repo`` is called the feature data. Its stored
-within a key value mapping within the ``Repo`` accessible via the
+The data associated with a ``Record`` is called the feature data. It's stored
+within a key value mapping within the ``Record`` accessible via the
 ``features()`` method. Our city example might have the following feature data.
 
 ..
code-block:: json diff --git a/docs/tutorials/model.rst b/docs/tutorials/model.rst index 0a7b2537dc..8a54f2262c 100644 --- a/docs/tutorials/model.rst +++ b/docs/tutorials/model.rst @@ -33,7 +33,7 @@ Create the Package To create a new model we first create a new python package. DFFML has a script to create it for you. -We're going to create a model that matches a new repo exactly to anything it's +We're going to create a model that matches a new record exactly to anything it's seen before. If it has seen it before it will be 100% confident. If it hasn't seen it before it'll pick a random classification and assign that as it's prediction with a 0% confidence. @@ -70,7 +70,7 @@ Let's once again rename ``Misc`` to ``Hindsight``, this time the class in the fi class Hindsight(Model): ''' - A model which matches a new repo exactly to anything it's seen before. If it + A model which matches a new record exactly to anything it's seen before. If it has seen it before it will be 100% confident. If it hasn't seen it before it'll pick a random classification and assign that as it's prediction with a 0% confidence. @@ -81,7 +81,7 @@ Let's once again rename ``Misc`` to ``Hindsight``, this time the class in the fi # We aren't worring about saving and loading our model in this tutorial. # You should implement it when you write your model for real. self.model_dir = model_dir - # Let's add a dict where we'll store what we know about previous repos. + # Let's add a dict where we'll store what we know about previous records. self.mem = {} Train @@ -91,24 +91,24 @@ This model predicts completely based on what features has seen as they map to a classification. As such, we're going to take the JSON representation of each set of feature data -for each repo we are given to train on, and hash it, storing the hash as the key +for each record we are given to train on, and hash it, storing the hash as the key in our ``self.mem`` dict, and the value as the classification. This will let us -hash the feature data of future repos we are asked to predict on and pull their +hash the feature data of future records we are asked to predict on and pull their expected classification (for ones we've seen before). .. code-block:: python async def train(self, sources: Sources): ''' - Train using repos as the data to learn from. + Train using records as the data to learn from. ''' - async for repo in sources.with_features(features): + async for record in sources.with_features(features): # Make sure we are only dealing with classifications we care about - if repo.classification() in classifications: - # Hash the data of the repo and map it to the classification - as_json = bytes(json.dumps(repo.data.features)) + if record.classification() in classifications: + # Hash the data of the record and map it to the classification + as_json = bytes(json.dumps(record.data.features)) hash_json = hashlib.sha384(as_json).hexdigest() - self.mem[hash_json] = repo.classification() + self.mem[hash_json] = record.classification() Accuracy ~~~~~~~~ @@ -121,7 +121,7 @@ in this tutorial (because we know the accuracy of this demo model will suck). async def accuracy(self, sources: Sources) -> Accuracy: ''' - Evaluates the accuracy of our model after training using the input repos + Evaluates the accuracy of our model after training using the input records as test data. ''' # Lies @@ -130,31 +130,31 @@ in this tutorial (because we know the accuracy of this demo model will suck). 
Predict ~~~~~~~ -The prediction, we'll just need to hash the features of each repo we're asked to +The prediction, we'll just need to hash the features of each record we're asked to make a prediction for. And see if it's in the existing mapping. If not, then we'll just choose a random classification for it and call that good (with a 0% confidence). .. code-block:: python - async def predict(self, repos: AsyncIterator[Repo]) -> \ - AsyncIterator[Tuple[Repo, Any, float]]: + async def predict(self, records: AsyncIterator[Record]) -> \ + AsyncIterator[Tuple[Record, Any, float]]: ''' - Uses trained data to make a prediction about the quality of a repo. + Uses trained data to make a prediction about the quality of a record. ''' - # Pull all repos which have the features we are interested in. - async for repo in repos: - # Hash the data of the repo and map it to the classification - as_json = bytes(json.dumps(repo.data.features()), 'utf-8') + # Pull all records which have the features we are interested in. + async for record in records: + # Hash the data of the record and map it to the classification + as_json = bytes(json.dumps(record.data.features()), 'utf-8') hash_json = hashlib.sha384(as_json).hexdigest() # If the mapping exists then that's what we'll predict if hash_json in self.mem: # Send it back with 100% (1.0) confidence - yield repo, self.mem[hash_json], 1.0 + yield record, self.mem[hash_json], 1.0 else: # The feature hash doesn't exist in our mapping. # Pick a random classification and yield it with 0 confidence - yield repo, random.choice(self.parent.config.classifications), 0.0 + yield record, random.choice(self.parent.config.classifications), 0.0 Correct the plugin load path ---------------------------- diff --git a/docs/tutorials/source.rst b/docs/tutorials/source.rst index 3642e5e499..48b70117ab 100644 --- a/docs/tutorials/source.rst +++ b/docs/tutorials/source.rst @@ -20,7 +20,7 @@ to create it for you. $ dffml service dev create source my-source $ cd my-source -This creates a Python package for you with a source that stores ``Repo`` objects +This creates a Python package for you with a source that stores ``Record`` objects in memory, called ``MiscSource``, and some tests. Edit the Source diff --git a/docs/usage/dataflows.rst b/docs/usage/dataflows.rst index ae58ce3a81..03f776e11d 100644 --- a/docs/usage/dataflows.rst +++ b/docs/usage/dataflows.rst @@ -218,7 +218,7 @@ The ``lines_of_code_to_comments`` operation needs the output given by A ``git_repository_checked_out`` is defined as: - - repo: git_repository_checked_out(type: Dict[str, str]) + - record: git_repository_checked_out(type: Dict[str, str]) - URL: str - directory: str diff --git a/docs/usage/integration.rst b/docs/usage/integration.rst index d183064f8a..f26bbc033e 100644 --- a/docs/usage/integration.rst +++ b/docs/usage/integration.rst @@ -174,8 +174,8 @@ range. .. literalinclude:: /../feature/git/dffml_feature_git/feature/operations.py :linenos: - :lineno-start: 349 - :lines: 349-368 + :lineno-start: 363 + :lines: 363-394 Since operations are run concurrently with each other, DFFML manages locking of input data, such as git repositories. This is done via ``Definitions`` which are @@ -264,9 +264,9 @@ This command runs the dataflow on a set of repos, that set being the URLs in .. 
code-block:: console - $ dffml dataflow run repos set \ + $ dffml dataflow run records set \ -keys $(cat /tmp/urls) \ - -repo-def URL \ + -record-def URL \ -dataflow cgi-bin/dataflow.yaml \ -sources gathered=json \ -source-filename /tmp/data.json \ @@ -382,7 +382,7 @@ up as expected. .. code-block:: console - $ dffml list repos -sources db=demoapp + $ dffml list records -sources db=demoapp Training our Model ------------------ @@ -440,9 +440,9 @@ Run the operations on the new repo: ``https://github.com/intel/dffml.git`` .. code-block:: console - $ dffml dataflow run repos set \ + $ dffml dataflow run records set \ -keys https://github.com/intel/dffml.git \ - -repo-def URL \ + -record-def URL \ -dataflow cgi-bin/dataflow.yaml \ -sources db=demoapp @@ -450,7 +450,7 @@ Now that we have the data for the new repo, ask the model for a prediction. .. code-block:: console - $ dffml predict repo \ + $ dffml predict record \ -keys https://github.com/intel/dffml.git \ -model tfdnnc \ -model-predict maintained:str:1 \ diff --git a/examples/maintained/cgi-bin/api-ml.py b/examples/maintained/cgi-bin/api-ml.py index 3b7105af3c..9495d6c72f 100755 --- a/examples/maintained/cgi-bin/api-ml.py +++ b/examples/maintained/cgi-bin/api-ml.py @@ -39,14 +39,14 @@ elif action == 'predict': today = datetime.now().strftime('%Y-%m-%d %H:%M') subprocess.check_call([ - "dffml", "dataflow", "run", "repos", "set", + "dffml", "dataflow", "run", "records", "set", "-keys", query['URL'], - "-repo-def", "URL", + "-record-def", "URL", "-dataflow", os.path.join(os.path.dirname(__file__), "dataflow.yaml"), "-sources", "db=demoapp", ]) result = subprocess.check_output([ - 'dffml', 'predict', 'repo', + 'dffml', 'predict', 'record', '-keys', query['URL'], '-model', 'tfdnnc', '-model-predict', 'maintained', diff --git a/examples/maintained/demoapp/source.py b/examples/maintained/demoapp/source.py index 24389e7fc5..d7ae275cd5 100644 --- a/examples/maintained/demoapp/source.py +++ b/examples/maintained/demoapp/source.py @@ -3,7 +3,7 @@ from typing import AsyncIterator, NamedTuple, Dict from dffml.base import BaseConfig -from dffml.repo import Repo +from dffml.record import Record from dffml.source.source import BaseSourceContext, BaseSource from dffml.util.cli.arg import Arg from dffml.util.entrypoint import entrypoint @@ -18,43 +18,43 @@ class DemoAppSourceConfig(BaseConfig, NamedTuple): class DemoAppSourceContext(BaseSourceContext): - async def update(self, repo: Repo): + async def update(self, record: Record): db = self.conn # Just dump it (if you want a setup the queries easily, then you need to # massage the columns in this table to your liking, and perhaps add more # tables. 
- marshall = json.dumps(repo.dict()) + marshall = json.dumps(record.dict()) await db.execute( "INSERT INTO ml_data (key, json) VALUES(%s, %s) " "ON DUPLICATE KEY UPDATE json = %s", - (repo.key, marshall, marshall), + (record.key, marshall, marshall), ) self.logger.debug("updated: %s", marshall) - self.logger.debug("update: %s", await self.repo(repo.key)) + self.logger.debug("update: %s", await self.record(record.key)) - async def repos(self) -> AsyncIterator[Repo]: + async def records(self) -> AsyncIterator[Record]: await self.conn.execute("SELECT key FROM `status`") keys = set(map(lambda row: row[0], await self.conn.fetchall())) await self.conn.execute("SELECT key FROM `ml_data`") list(map(lambda row: keys.add(row[0]), await self.conn.fetchall())) for key in keys: - yield await self.repo(key) + yield await self.record(key) - async def repo(self, key: str): - repo = Repo(key) + async def record(self, key: str): + record = Record(key) db = self.conn # Get features await db.execute("SELECT json FROM ml_data WHERE key=%s", (key,)) dump = await db.fetchone() if dump is not None and dump[0] is not None: - repo.merge(Repo(key, data=json.loads(dump[0]))) + record.merge(Record(key, data=json.loads(dump[0]))) await db.execute( "SELECT maintained FROM `status` WHERE key=%s", (key,) ) maintained = await db.fetchone() if maintained is not None and maintained[0] is not None: - repo.evaluated({"maintained": str(maintained[0])}) - return repo + record.evaluated({"maintained": str(maintained[0])}) + return record async def __aenter__(self) -> "DemoAppSourceContext": self.__conn = self.parent.db.cursor() diff --git a/examples/source/custom_sqlite.py b/examples/source/custom_sqlite.py index 606d0d45f4..7e5f2e4060 100644 --- a/examples/source/custom_sqlite.py +++ b/examples/source/custom_sqlite.py @@ -3,7 +3,7 @@ from typing import AsyncIterator, NamedTuple, Dict from dffml.base import BaseConfig -from dffml.repo import Repo +from dffml.record import Record from dffml.source.source import BaseSourceContext, BaseSource from dffml.util.cli.arg import Arg @@ -13,22 +13,22 @@ class CustomSQLiteSourceConfig(BaseConfig, NamedTuple): class CustomSQLiteSourceContext(BaseSourceContext): - async def update(self, repo: Repo): + async def update(self, record: Record): db = self.parent.db # Store feature data feature_cols = self.parent.FEATURE_COLS feature_data = OrderedDict.fromkeys(feature_cols) - feature_data.update(repo.features(feature_cols)) + feature_data.update(record.features(feature_cols)) await db.execute( "INSERT OR REPLACE INTO features (key, " + ", ".join(feature_cols) + ") " "VALUES(?, " + ", ".join("?" * len(feature_cols)) + ")", - [repo.key] + list(feature_data.values()), + [record.key] + list(feature_data.values()), ) # Store prediction try: - prediction = repo.prediction("target_name") + prediction = record.prediction("target_name") prediction_cols = self.parent.PREDICTION_COLS prediction_data = OrderedDict.fromkeys(prediction_cols) prediction_data.update(prediction.dict()) @@ -37,39 +37,39 @@ async def update(self, repo: Repo): + ", ".join(prediction_cols) + ") " "VALUES(?, " + ", ".join("?" * len(prediction_cols)) + ")", - [repo.key] + list(prediction_data.values()), + [record.key] + list(prediction_data.values()), ) except KeyError: pass - async def repos(self) -> AsyncIterator[Repo]: + async def records(self) -> AsyncIterator[Record]: # NOTE This logic probably isn't what you want. Only for demo purposes. 
keys = await self.parent.db.execute("SELECT key FROM features") for row in await keys.fetchall(): - yield await self.repo(row["key"]) + yield await self.record(row["key"]) - async def repo(self, key: str): + async def record(self, key: str): db = self.parent.db - repo = Repo(key) + record = Record(key) # Get features features = await db.execute( "SELECT " + ", ".join(self.parent.FEATURE_COLS) + " " "FROM features WHERE key=?", - (repo.key,), + (record.key,), ) features = await features.fetchone() if features is not None: - repo.evaluated(features) + record.evaluated(features) # Get prediction prediction = await db.execute( - "SELECT * FROM prediction WHERE " "key=?", (repo.key,) + "SELECT * FROM prediction WHERE " "key=?", (record.key,) ) prediction = await prediction.fetchone() if prediction is not None: - repo.predicted( + record.predicted( "target_name", prediction["value"], prediction["confidence"] ) - return repo + return record async def __aexit__(self, exc_type, exc_value, traceback): await self.parent.db.commit() diff --git a/examples/test_quickstart.py b/examples/test_quickstart.py index 829ee043df..cf3a297e49 100644 --- a/examples/test_quickstart.py +++ b/examples/test_quickstart.py @@ -58,11 +58,11 @@ def test_shell(self): self.assertEqual(stdout.decode().strip(), "1.0") # Make the prediction stdout = subprocess.check_output(["sh", sh_filepath("predict.sh")]) - repos = json.loads(stdout.decode()) + records = json.loads(stdout.decode()) # Check the salary self.assertEqual( - int(repos[0]["prediction"]["Salary"]["value"]), 70 + int(records[0]["prediction"]["Salary"]["value"]), 70 ) self.assertEqual( - int(repos[1]["prediction"]["Salary"]["value"]), 80 + int(records[1]["prediction"]["Salary"]["value"]), 80 ) diff --git a/model/scikit/dffml_model_scikit/scikit_base.py b/model/scikit/dffml_model_scikit/scikit_base.py index 7714d91c9b..ded817c39c 100644 --- a/model/scikit/dffml_model_scikit/scikit_base.py +++ b/model/scikit/dffml_model_scikit/scikit_base.py @@ -14,7 +14,7 @@ import pandas as pd from sklearn.metrics import silhouette_score, mutual_info_score -from dffml.repo import Repo +from dffml.record import Record from dffml.source.source import Sources from dffml.model.accuracy import Accuracy from dffml.model.model import ModelConfig, ModelContext, Model, ModelNotTrained @@ -76,17 +76,17 @@ async def __aexit__(self, exc_type, exc_value, traceback): async def train(self, sources: Sources): data = [] - async for repo in sources.with_features( + async for record in sources.with_features( self.features + [self.parent.config.predict.NAME] ): - feature_data = repo.features( + feature_data = record.features( self.features + [self.parent.config.predict.NAME] ) data.append(feature_data) df = pd.DataFrame(data) xdata = np.array(df.drop([self.parent.config.predict.NAME], 1)) ydata = np.array(df[self.parent.config.predict.NAME]) - self.logger.info("Number of input repos: {}".format(len(xdata))) + self.logger.info("Number of input records: {}".format(len(xdata))) self.clf.fit(xdata, ydata) joblib.dump(self.clf, self._filename()) @@ -94,26 +94,26 @@ async def accuracy(self, sources: Sources) -> Accuracy: if not os.path.isfile(self._filename()): raise ModelNotTrained("Train model before assessing for accuracy.") data = [] - async for repo in sources.with_features(self.features): - feature_data = repo.features( + async for record in sources.with_features(self.features): + feature_data = record.features( self.features + [self.parent.config.predict.NAME] ) data.append(feature_data) df = 
pd.DataFrame(data) xdata = np.array(df.drop([self.parent.config.predict.NAME], 1)) ydata = np.array(df[self.parent.config.predict.NAME]) - self.logger.debug("Number of input repos: {}".format(len(xdata))) + self.logger.debug("Number of input records: {}".format(len(xdata))) self.confidence = self.clf.score(xdata, ydata) self.logger.debug("Model Accuracy: {}".format(self.confidence)) return self.confidence async def predict( - self, repos: AsyncIterator[Repo] - ) -> AsyncIterator[Tuple[Repo, Any, float]]: + self, records: AsyncIterator[Record] + ) -> AsyncIterator[Tuple[Record, Any, float]]: if not os.path.isfile(self._filename()): raise ModelNotTrained("Train model before prediction.") - async for repo in repos: - feature_data = repo.features(self.features) + async for record in records: + feature_data = record.features(self.features) df = pd.DataFrame(feature_data, index=[0]) predict = np.array(df) self.logger.debug( @@ -124,10 +124,10 @@ async def predict( ) ) target = self.parent.config.predict.NAME - repo.predicted( + record.predicted( target, self.clf.predict(predict)[0], self.confidence ) - yield repo + yield record class ScikitContextUnsprvised(ScikitContext): @@ -145,12 +145,12 @@ async def __aenter__(self): async def train(self, sources: Sources): data = [] - async for repo in sources.with_features(self.features): - feature_data = repo.features(self.features) + async for record in sources.with_features(self.features): + feature_data = record.features(self.features) data.append(feature_data) df = pd.DataFrame(data) xdata = np.array(df) - self.logger.info("Number of input repos: {}".format(len(xdata))) + self.logger.info("Number of input records: {}".format(len(xdata))) self.clf.fit(xdata) joblib.dump(self.clf, self._filename()) @@ -166,12 +166,12 @@ async def accuracy(self, sources: Sources) -> Accuracy: if self.parent.config.tcluster is None else [self.parent.config.tcluster.NAME] ) - async for repo in sources.with_features(self.features): - feature_data = repo.features(self.features + target) + async for record in sources.with_features(self.features): + feature_data = record.features(self.features + target) data.append(feature_data) df = pd.DataFrame(data) xdata = np.array(df.drop(target, axis=1)) - self.logger.debug("Number of input repos: {}".format(len(xdata))) + self.logger.debug("Number of input records: {}".format(len(xdata))) if target: ydata = np.array(df[target]).flatten() if hasattr(self.clf, "predict"): @@ -203,8 +203,8 @@ async def accuracy(self, sources: Sources) -> Accuracy: return self.confidence async def predict( - self, repos: AsyncIterator[Repo] - ) -> AsyncIterator[Tuple[Repo, Any, float]]: + self, records: AsyncIterator[Record] + ) -> AsyncIterator[Tuple[Record, Any, float]]: if not os.path.isfile(self._filename()): raise ModelNotTrained("Train model before prediction.") estimator_type = self.clf._estimator_type @@ -222,8 +222,8 @@ async def predict( ] predictor = lambda predict: [next(labels)] - async for repo in repos: - feature_data = repo.features(self.features) + async for record in records: + feature_data = record.features(self.features) df = pd.DataFrame(feature_data, index=[0]) predict = np.array(df) prediction = predictor(predict) @@ -231,8 +231,8 @@ async def predict( "Predicted cluster for {}: {}".format(predict, prediction) ) target = self.parent.config.predict.NAME - repo.predicted(target, prediction[0], self.confidence) - yield repo + record.predicted(target, prediction[0], self.confidence) + yield record class Scikit(Model): diff --git 
a/model/scikit/tests/test_scikit.py b/model/scikit/tests/test_scikit.py index 477f3d3c80..91c801efdb 100644 --- a/model/scikit/tests/test_scikit.py +++ b/model/scikit/tests/test_scikit.py @@ -2,7 +2,7 @@ import tempfile import numpy as np -from dffml.repo import Repo +from dffml.record import Record from dffml.source.source import Sources from dffml.source.memory import MemorySource, MemorySourceConfig from dffml.feature import DefFeature, Features @@ -30,8 +30,8 @@ def setUpClass(cls): A, B, C, D, E, F, G, H, I, X = list( zip(*FEATURE_DATA_CLASSIFICATION) ) - cls.repos = [ - Repo( + cls.records = [ + Record( str(i), data={ "features": { @@ -55,8 +55,8 @@ def setUpClass(cls): cls.features.append(DefFeature("B", float, 1)) cls.features.append(DefFeature("C", float, 1)) A, B, C, X = list(zip(*FEATURE_DATA_REGRESSION)) - cls.repos = [ - Repo( + cls.records = [ + Record( str(i), data={ "features": { @@ -75,8 +75,8 @@ def setUpClass(cls): cls.features.append(DefFeature("C", float, 1)) cls.features.append(DefFeature("D", float, 1)) A, B, C, D, X = list(zip(*FEATURE_DATA_CLUSTERING)) - cls.repos = [ - Repo( + cls.records = [ + Record( str(i), data={ "features": { @@ -92,7 +92,7 @@ def setUpClass(cls): ] cls.sources = Sources( - MemorySource(MemorySourceConfig(repos=cls.repos)) + MemorySource(MemorySourceConfig(records=cls.records)) ) properties = { "directory": cls.model_dir.name, @@ -131,12 +131,12 @@ async def test_02_predict(self): async with self.sources as sources, self.model as model: target = model.config.predict.NAME async with sources() as sctx, model() as mctx: - async for repo in mctx.predict(sctx.repos()): - prediction = repo.prediction(target).value + async for record in mctx.predict(sctx.records()): + prediction = record.prediction(target).value if self.MODEL_TYPE is "CLASSIFICATION": self.assertIn(prediction, [2, 4]) elif self.MODEL_TYPE is "REGRESSION": - correct = FEATURE_DATA_REGRESSION[int(repo.key)][3] + correct = FEATURE_DATA_REGRESSION[int(record.key)][3] self.assertGreater( prediction, correct - (correct * 0.40) ) diff --git a/model/scratch/dffml_model_scratch/slr.py b/model/scratch/dffml_model_scratch/slr.py index 1db184dac6..ff29757b10 100644 --- a/model/scratch/dffml_model_scratch/slr.py +++ b/model/scratch/dffml_model_scratch/slr.py @@ -10,7 +10,7 @@ import numpy as np -from dffml.repo import Repo +from dffml.record import Record from dffml.base import config, field from dffml.source.source import Sources from dffml.model.accuracy import Accuracy @@ -88,7 +88,9 @@ async def coeff_of_deter(self, ys, regression_line): return 1 - (squared_error_regression / squared_error_mean) async def best_fit_line(self): - self.logger.debug("Number of input repos: {}".format(len(self.xData))) + self.logger.debug( + "Number of input records: {}".format(len(self.xData)) + ) x = self.xData y = self.yData mean_x = np.mean(self.xData) @@ -102,10 +104,10 @@ async def best_fit_line(self): return (m, b, accuracy) async def train(self, sources: Sources): - async for repo in sources.with_features( + async for record in sources.with_features( self.features + [self.parent.config.predict.NAME] ): - feature_data = repo.features( + feature_data = record.features( self.features + [self.parent.config.predict.NAME] ) self.xData = np.append(self.xData, feature_data[self.features[0]]) @@ -121,19 +123,19 @@ async def accuracy(self, sources: Sources) -> Accuracy: return Accuracy(accuracy_value) async def predict( - self, repos: AsyncIterator[Repo] - ) -> AsyncIterator[Tuple[Repo, Any, float]]: + self, 
records: AsyncIterator[Record] + ) -> AsyncIterator[Tuple[Record, Any, float]]: if self.regression_line is None: raise ModelNotTrained("Train model before prediction.") target = self.parent.config.predict.NAME - async for repo in repos: - feature_data = repo.features(self.features) - repo.predicted( + async for record in records: + feature_data = record.features(self.features) + record.predicted( target, await self.predict_input(feature_data[self.features[0]]), self.regression_line[2], ) - yield repo + yield record @entrypoint("slr") diff --git a/model/scratch/tests/test_slr.py b/model/scratch/tests/test_slr.py index bb78529e61..e32dd4f75e 100644 --- a/model/scratch/tests/test_slr.py +++ b/model/scratch/tests/test_slr.py @@ -1,7 +1,6 @@ import tempfile - -from dffml.repo import Repo +from dffml.record import Record from dffml.source.source import Sources from dffml.source.memory import MemorySource, MemorySourceConfig from dffml.util.asynctestcase import AsyncTestCase @@ -43,12 +42,12 @@ def setUpClass(cls): cls.feature = DefFeature("X", float, 1) cls.features = Features(cls.feature) X, Y = list(zip(*FEATURE_DATA)) - cls.repos = [ - Repo(str(i), data={"features": {"X": X[i], "Y": Y[i]}}) + cls.records = [ + Record(str(i), data={"features": {"X": X[i], "Y": Y[i]}}) for i in range(0, len(Y)) ] cls.sources = Sources( - MemorySource(MemorySourceConfig(repos=cls.repos)) + MemorySource(MemorySourceConfig(records=cls.records)) ) cls.model = SLR( SLRConfig( @@ -72,10 +71,10 @@ async def test_context(self): self.assertTrue(0.0 <= res < 1.0) # Test predict target_name = model.config.predict.NAME - async for repo in mctx.predict(sctx.repos()): - correct = FEATURE_DATA[int(repo.key)][1] + async for record in mctx.predict(sctx.records()): + correct = FEATURE_DATA[int(record.key)][1] # Comparison of correct to prediction to make sure prediction is within a reasonable range - prediction = repo.prediction(target_name).value + prediction = record.prediction(target_name).value self.assertGreater(prediction, correct - (correct * 0.10)) self.assertLess(prediction, correct + (correct * 0.10)) @@ -94,9 +93,9 @@ async def test_02_predict(self): async with self.sources as sources, self.model as model: async with sources() as sctx, model() as mctx: target_name = model.config.predict.NAME - async for repo in mctx.predict(sctx.repos()): + async for record in mctx.predict(sctx.records()): target_name = model.config.predict.NAME - correct = FEATURE_DATA[int(repo.key)][1] - prediction = repo.prediction(target_name).value + correct = FEATURE_DATA[int(record.key)][1] + prediction = record.prediction(target_name).value self.assertGreater(prediction, correct - (correct * 0.10)) self.assertLess(prediction, correct + (correct * 0.10)) diff --git a/model/tensorflow/dffml_model_tensorflow/dnnc.py b/model/tensorflow/dffml_model_tensorflow/dnnc.py index 87dfe81f56..d6d7942411 100644 --- a/model/tensorflow/dffml_model_tensorflow/dnnc.py +++ b/model/tensorflow/dffml_model_tensorflow/dnnc.py @@ -1,6 +1,6 @@ """ Uses Tensorflow to create a generic DNN which learns on all of the features in a -repo. +record. 
""" import os import abc @@ -13,7 +13,7 @@ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" import tensorflow as tf -from dffml.repo import Repo +from dffml.record import Record from dffml.model.accuracy import Accuracy from dffml.base import config, field from dffml.source.source import Sources @@ -37,7 +37,7 @@ def __init__(self, parent): def _feature_columns(self): """ - Converts repos into training data + Converts records into training data """ cols: Dict[str, Any] = {} for feature in self.parent.config.features: @@ -92,27 +92,27 @@ def _model_dir_path(self): ) return os.path.join(self.parent.config.directory, model) - async def predict_input_fn(self, repos: AsyncIterator[Repo], **kwargs): + async def predict_input_fn(self, records: AsyncIterator[Record], **kwargs): """ - Uses the numpy input function with data from repo features. + Uses the numpy input function with data from record features. """ x_cols: Dict[str, Any] = {feature: [] for feature in self.features} - ret_repos = [] - async for repo in repos: - if not repo.features(self.features): + ret_records = [] + async for record in records: + if not record.features(self.features): continue - ret_repos.append(repo) - for feature, results in repo.features(self.features).items(): + ret_records.append(record) + for feature, results in record.features(self.features).items(): x_cols[feature].append(np.array(results)) for feature in x_cols: x_cols[feature] = np.array(x_cols[feature]) - self.logger.info("------ Repo Data ------") + self.logger.info("------ Record Data ------") self.logger.info("x_cols: %d", len(list(x_cols.values())[0])) self.logger.info("-----------------------") input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn( x_cols, shuffle=False, num_epochs=1, **kwargs ) - return input_fn, ret_repos + return input_fn, ret_records async def train(self, sources: Sources): """ @@ -136,12 +136,14 @@ class DNNClassifierModelConfig: features: Features = field("Features to train on") clstype: Type = field("Data type of classifications values", default=str) batchsize: int = field( - "Number repos to pass through in an epoch", default=20 + "Number records to pass through in an epoch", default=20 + ) + shuffle: bool = field( + "Randomise order of records in a batch", default=True ) - shuffle: bool = field("Randomise order of repos in a batch", default=True) steps: int = field("Number of steps to train the model", default=3000) epochs: int = field( - "Number of iterations to pass over all repos in a source", default=30 + "Number of iterations to pass over all records in a source", default=30 ) directory: str = field( "Directory where state should be saved", @@ -213,32 +215,32 @@ def model(self): async def training_input_fn(self, sources: Sources, **kwargs): """ - Uses the numpy input function with data from repo features. + Uses the numpy input function with data from record features. 
""" self.logger.debug("Training on features: %r", self.features) x_cols: Dict[str, Any] = {feature: [] for feature in self.features} y_cols = [] - for repo in [ - repo - async for repo in sources.with_features( + for record in [ + record + async for record in sources.with_features( self.features + [self.parent.config.predict.NAME] ) - if repo.feature(self.parent.config.predict.NAME) + if record.feature(self.parent.config.predict.NAME) in self.classifications ]: - for feature, results in repo.features(self.features).items(): + for feature, results in record.features(self.features).items(): x_cols[feature].append(np.array(results)) y_cols.append( self.classifications[ - repo.feature(self.parent.config.predict.NAME) + record.feature(self.parent.config.predict.NAME) ] ) if not y_cols: - raise ValueError("No repos to train on") + raise ValueError("No records to train on") y_cols = np.array(y_cols) for feature in x_cols: x_cols[feature] = np.array(x_cols[feature]) - self.logger.info("------ Repo Data ------") + self.logger.info("------ Record Data ------") self.logger.info("x_cols: %d", len(list(x_cols.values())[0])) self.logger.info("y_cols: %d", len(y_cols)) self.logger.info("-----------------------") @@ -254,29 +256,29 @@ async def training_input_fn(self, sources: Sources, **kwargs): async def accuracy_input_fn(self, sources: Sources, **kwargs): """ - Uses the numpy input function with data from repo features. + Uses the numpy input function with data from record features. """ x_cols: Dict[str, Any] = {feature: [] for feature in self.features} y_cols = [] - for repo in [ - repo - async for repo in sources.with_features( + for record in [ + record + async for record in sources.with_features( self.features + [self.parent.config.predict.NAME] ) - if repo.feature(self.parent.config.predict.NAME) + if record.feature(self.parent.config.predict.NAME) in self.classifications ]: - for feature, results in repo.features(self.features).items(): + for feature, results in record.features(self.features).items(): x_cols[feature].append(np.array(results)) y_cols.append( self.classifications[ - repo.feature(self.parent.config.predict.NAME) + record.feature(self.parent.config.predict.NAME) ] ) y_cols = np.array(y_cols) for feature in x_cols: x_cols[feature] = np.array(x_cols[feature]) - self.logger.info("------ Repo Data ------") + self.logger.info("------ Record Data ------") self.logger.info("x_cols: %d", len(list(x_cols.values())[0])) self.logger.info("y_cols: %d", len(y_cols)) self.logger.info("-----------------------") @@ -292,7 +294,7 @@ async def accuracy_input_fn(self, sources: Sources, **kwargs): async def accuracy(self, sources: Sources) -> Accuracy: """ - Evaluates the accuracy of our model after training using the input repos + Evaluates the accuracy of our model after training using the input records as test data. """ if not os.path.isdir(self.model_dir_path): @@ -301,22 +303,24 @@ async def accuracy(self, sources: Sources) -> Accuracy: accuracy_score = self.model.evaluate(input_fn=input_fn) return Accuracy(accuracy_score["accuracy"]) - async def predict(self, repos: AsyncIterator[Repo]) -> AsyncIterator[Repo]: + async def predict( + self, records: AsyncIterator[Record] + ) -> AsyncIterator[Record]: """ - Uses trained data to make a prediction about the quality of a repo. + Uses trained data to make a prediction about the quality of a record. 
""" if not os.path.isdir(self.model_dir_path): raise ModelNotTrained("Train model before prediction.") # Create the input function - input_fn, predict = await self.predict_input_fn(repos) + input_fn, predict = await self.predict_input_fn(records) # Makes predictions on classifications predictions = self.model.predict(input_fn=input_fn) target = self.parent.config.predict.NAME - for repo, pred_dict in zip(predict, predictions): + for record, pred_dict in zip(predict, predictions): class_id = pred_dict["class_ids"][0] probability = pred_dict["probabilities"][class_id] - repo.predicted(target, self.cids[class_id], probability) - yield repo + record.predicted(target, self.cids[class_id], probability) + yield record @entrypoint("tfdnnc") diff --git a/model/tensorflow/dffml_model_tensorflow/dnnr.py b/model/tensorflow/dffml_model_tensorflow/dnnr.py index 5e71e1f763..8269fa034c 100644 --- a/model/tensorflow/dffml_model_tensorflow/dnnr.py +++ b/model/tensorflow/dffml_model_tensorflow/dnnr.py @@ -1,6 +1,6 @@ """ Uses Tensorflow to create a generic DNN which learns on all of the features in a -repo. +record. """ import os from typing import List, Dict, Any, AsyncIterator @@ -10,7 +10,7 @@ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" import tensorflow as tf -from dffml.repo import Repo +from dffml.record import Record from dffml.model.model import Model from dffml.model.accuracy import Accuracy from dffml.source.source import Sources @@ -27,7 +27,7 @@ class DNNRegressionModelConfig: features: Features = field("Features to train on") steps: int = field("Number of steps to train the model", default=3000) epochs: int = field( - "Number of iterations to pass over all repos in a source", default=30 + "Number of iterations to pass over all records in a source", default=30 ) directory: str = field( "Directory where state should be saved", @@ -81,22 +81,22 @@ async def training_input_fn( **kwargs, ): """ - Uses the numpy input function with data from repo features. + Uses the numpy input function with data from record features. """ self.logger.debug("Training on features: %r", self.features) x_cols: Dict[str, Any] = {feature: [] for feature in self.features} y_cols = [] - async for repo in sources.with_features(self.all_features): - for feature, results in repo.features(self.features).items(): + async for record in sources.with_features(self.all_features): + for feature, results in record.features(self.features).items(): x_cols[feature].append(np.array(results)) - y_cols.append(repo.feature(self.parent.config.predict.NAME)) + y_cols.append(record.feature(self.parent.config.predict.NAME)) y_cols = np.array(y_cols) for feature in x_cols: x_cols[feature] = np.array(x_cols[feature]) - self.logger.info("------ Repo Data ------") + self.logger.info("------ Record Data ------") self.logger.info("x_cols: %d", len(list(x_cols.values())[0])) self.logger.info("y_cols: %d", len(y_cols)) self.logger.info("-----------------------") @@ -119,20 +119,20 @@ async def evaluate_input_fn( **kwargs, ): """ - Uses the numpy input function with data from repo features. + Uses the numpy input function with data from record features. 
""" x_cols: Dict[str, Any] = {feature: [] for feature in self.features} y_cols = [] - async for repo in sources.with_features(self.all_features): - for feature, results in repo.features(self.features).items(): + async for record in sources.with_features(self.all_features): + for feature, results in record.features(self.features).items(): x_cols[feature].append(np.array(results)) - y_cols.append(repo.feature(self.parent.config.predict.NAME)) + y_cols.append(record.feature(self.parent.config.predict.NAME)) y_cols = np.array(y_cols) for feature in x_cols: x_cols[feature] = np.array(x_cols[feature]) - self.logger.info("------ Repo Data ------") + self.logger.info("------ Record Data ------") self.logger.info("x_cols: %d", len(list(x_cols.values())[0])) self.logger.info("y_cols: %d", len(y_cols)) self.logger.info("-----------------------") @@ -148,7 +148,7 @@ async def evaluate_input_fn( async def accuracy(self, sources: Sources) -> Accuracy: """ - Evaluates the accuracy of our model after training using the input repos + Evaluates the accuracy of our model after training using the input records as test data. """ if not os.path.isdir(self.model_dir_path): @@ -159,24 +159,26 @@ async def accuracy(self, sources: Sources) -> Accuracy: metrics = self.model.evaluate(input_fn=input_fn) return Accuracy(1 - metrics["loss"]) # 1 - mse - async def predict(self, repos: AsyncIterator[Repo]) -> AsyncIterator[Repo]: + async def predict( + self, records: AsyncIterator[Record] + ) -> AsyncIterator[Record]: """ - Uses trained data to make a prediction about the quality of a repo. + Uses trained data to make a prediction about the quality of a record. """ if not os.path.isdir(self.model_dir_path): raise NotADirectoryError("Model not trained") # Create the input function - input_fn, predict_repo = await self.predict_input_fn(repos) + input_fn, predict_record = await self.predict_input_fn(records) # Makes predictions on predictions = self.model.predict(input_fn=input_fn) target = self.parent.config.predict.NAME - for repo, pred_dict in zip(predict_repo, predictions): + for record, pred_dict in zip(predict_record, predictions): # TODO Instead of float("nan") save accuracy value and use that. 
- repo.predicted( + record.predicted( target, float(pred_dict["predictions"]), float("nan") ) - yield repo + yield record @entrypoint("tfdnnr") diff --git a/model/tensorflow/tests/test_dnnc.py b/model/tensorflow/tests/test_dnnc.py index d06a5c3321..115d59cbc2 100644 --- a/model/tensorflow/tests/test_dnnc.py +++ b/model/tensorflow/tests/test_dnnc.py @@ -3,7 +3,7 @@ import tempfile from typing import Type -from dffml.repo import Repo, RepoData +from dffml.record import Record, RecordData from dffml.source.source import Sources from dffml.source.memory import MemorySource, MemorySourceConfig from dffml.feature import Data, Feature, Features, DefFeature @@ -33,22 +33,22 @@ def setUpClass(cls): cls.model_dir = tempfile.TemporaryDirectory() cls.feature = StartsWithA() cls.features = Features(cls.feature) - cls.repos = [ - Repo( + cls.records = [ + Record( "a" + str(random.random()), data={"features": {cls.feature.NAME: 1, "string": "a"}}, ) for _ in range(0, 1000) ] - cls.repos += [ - Repo( + cls.records += [ + Record( "b" + str(random.random()), data={"features": {cls.feature.NAME: 0, "string": "not a"}}, ) for _ in range(0, 1000) ] cls.sources = Sources( - MemorySource(MemorySourceConfig(repos=cls.repos)) + MemorySource(MemorySourceConfig(records=cls.records)) ) cls.model = DNNClassifierModel( DNNClassifierModelConfig( @@ -107,13 +107,13 @@ async def test_01_accuracy(self): self.assertGreater(res, 0.9) async def test_02_predict(self): - a = Repo("a", data={"features": {self.feature.NAME: 1}}) + a = Record("a", data={"features": {self.feature.NAME: 1}}) async with Sources( - MemorySource(MemorySourceConfig(repos=[a])) + MemorySource(MemorySourceConfig(records=[a])) ) as sources, self.model as model: target_name = model.config.predict.NAME async with sources() as sctx, model() as mctx: - res = [repo async for repo in mctx.predict(sctx.repos())] + res = [record async for record in mctx.predict(sctx.records())] self.assertEqual(len(res), 1) self.assertEqual(res[0].key, a.key) self.assertTrue(res[0].prediction(target_name).value) diff --git a/model/tensorflow/tests/test_dnnr.py b/model/tensorflow/tests/test_dnnr.py index 537a5cd059..9871e35f50 100644 --- a/model/tensorflow/tests/test_dnnr.py +++ b/model/tensorflow/tests/test_dnnr.py @@ -5,7 +5,7 @@ import numpy as np -from dffml.repo import Repo +from dffml.record import Record from dffml.source.source import Sources from dffml.source.memory import MemorySource, MemorySourceConfig from dffml.util.cli.arg import parse_unknown @@ -60,8 +60,8 @@ def setUpClass(cls): # Generating data f(x1,x2) = 2*x1 + 3*x2 _n_data = 2000 _temp_data = np.random.rand(2, _n_data) - cls.repos = [ - Repo( + cls.records = [ + Record( "x" + str(random.random()), data={ "features": { @@ -74,7 +74,7 @@ def setUpClass(cls): for i in range(0, _n_data) ] cls.sources = Sources( - MemorySource(MemorySourceConfig(repos=cls.repos)) + MemorySource(MemorySourceConfig(records=cls.records)) ) @classmethod @@ -123,7 +123,7 @@ async def test_02_predict(self): ] # inserting zero so that its 1-indexable test_target = 2 * test_feature_val[1] + 3 * test_feature_val[2] # should be same function used in TestDNN.setupclass - a = Repo( + a = Record( "a", data={ "features": { @@ -133,11 +133,11 @@ async def test_02_predict(self): }, ) async with Sources( - MemorySource(MemorySourceConfig(repos=[a])) + MemorySource(MemorySourceConfig(records=[a])) ) as sources, self.model as model: target_name = model.config.predict.NAME async with sources() as sctx, model() as mctx: - res = [repo async for repo in 
mctx.predict(sctx.repos())] + res = [record async for record in mctx.predict(sctx.records())] self.assertEqual(len(res), 1) self.assertEqual(res[0].key, a.key) test_error_norm = abs( diff --git a/model/tensorflow_hub/dffml_model_tensorflow_hub/text_classifier.py b/model/tensorflow_hub/dffml_model_tensorflow_hub/text_classifier.py index 114bc53943..ac4f342a20 100644 --- a/model/tensorflow_hub/dffml_model_tensorflow_hub/text_classifier.py +++ b/model/tensorflow_hub/dffml_model_tensorflow_hub/text_classifier.py @@ -15,7 +15,7 @@ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" import tensorflow as tf -from dffml.repo import Repo +from dffml.record import Record from dffml.model.accuracy import Accuracy from dffml.source.source import Sources from dffml.util.entrypoint import entrypoint @@ -65,7 +65,7 @@ class TextClassifierConfig: metrics: str = field("Metric used to evaluate model", default="accuracy") clstype: Type = field("Data type of classifications values", default=str) epochs: int = field( - "Number of iterations to pass over all repos in a source", default=10 + "Number of iterations to pass over all records in a source", default=10 ) directory: str = field( "Directory where state should be saved", @@ -191,25 +191,25 @@ async def train_data_generator(self, sources: Sources): self.logger.debug("Training on features: %r", self.features) x_cols: Dict[str, Any] = {feature: [] for feature in self.features} y_cols = [] - all_repos = [] + all_records = [] all_sources = sources.with_features( self.features + [self.classification] ) - async for repo in all_sources: - if repo.feature(self.classification) in self.classifications: - all_repos.append(repo) - for repo in all_repos: - for feature, results in repo.features(self.features).items(): + async for record in all_sources: + if record.feature(self.classification) in self.classifications: + all_records.append(record) + for record in all_records: + for feature, results in record.features(self.features).items(): x_cols[feature].append(np.array(results)) y_cols.append( - self.classifications[repo.feature(self.classification)] + self.classifications[record.feature(self.classification)] ) if not y_cols: - raise ValueError("No repos to train on") + raise ValueError("No records to train on") y_cols = np.array(y_cols) for feature in x_cols: x_cols[feature] = np.array(x_cols[feature]) - self.logger.info("------ Repo Data ------") + self.logger.info("------ Record Data ------") self.logger.info("x_cols: %d", len(list(x_cols.values())[0])) self.logger.info("y_cols: %d", len(y_cols)) self.logger.info("-----------------------") @@ -261,7 +261,7 @@ async def prediction_data_generator( async def train(self, sources: Sources): """ - Train using repos as the data to learn from. + Train using records as the data to learn from. """ x, y = await self.train_data_generator(sources) self._model.summary() @@ -276,7 +276,7 @@ async def train(self, sources: Sources): async def accuracy(self, sources: Sources) -> Accuracy: """ - Evaluates the accuracy of our model after training using the input repos + Evaluates the accuracy of our model after training using the input records as test data. 
""" if not os.path.isfile( @@ -288,18 +288,18 @@ async def accuracy(self, sources: Sources) -> Accuracy: return Accuracy(accuracy_score[1]) async def predict( - self, repos: AsyncIterator[Repo] - ) -> AsyncIterator[Tuple[Repo, Any, float]]: + self, records: AsyncIterator[Record] + ) -> AsyncIterator[Tuple[Record, Any, float]]: """ - Uses trained data to make a prediction about the quality of a repo. + Uses trained data to make a prediction about the quality of a record. """ if not os.path.isfile( os.path.join(self.model_dir_path, "saved_model.pb") ): raise ModelNotTrained("Train model before assessing for accuracy.") - async for repo in repos: - feature_data = repo.features(self.features) + async for record in records: + feature_data = record.features(self.features) df = pd.DataFrame(feature_data, index=[0]) predict = await self.prediction_data_generator(np.array(df)[0]) all_prob = self._model.predict(predict) @@ -313,12 +313,12 @@ async def predict( ) ) - repo.predicted( + record.predicted( target, self.cids[max_prob_idx[0]], all_prob[0][max_prob_idx[0]], ) - yield repo + yield record @entrypoint("text_classifier") diff --git a/model/tensorflow_hub/tests/test_model.py b/model/tensorflow_hub/tests/test_model.py index 5d7fc64a28..ba6a1f0898 100644 --- a/model/tensorflow_hub/tests/test_model.py +++ b/model/tensorflow_hub/tests/test_model.py @@ -1,7 +1,7 @@ import random import tempfile -from dffml.repo import Repo +from dffml.record import Record from dffml.source.source import Sources from dffml.util.asynctestcase import AsyncTestCase from dffml.feature import Features, DefFeature @@ -18,12 +18,12 @@ def setUpClass(cls): cls.features = Features() cls.features.append(DefFeature("A", str, 1)) A, X = list(zip(*DATA)) - cls.repos = [ - Repo(str(i), data={"features": {"A": A[i], "X": X[i],}},) + cls.records = [ + Record(str(i), data={"features": {"A": A[i], "X": X[i],}},) for i in range(0, len(X)) ] cls.sources = Sources( - MemorySource(MemorySourceConfig(repos=cls.repos)) + MemorySource(MemorySourceConfig(records=cls.records)) ) cls.model_dir = tempfile.TemporaryDirectory() cls.model = TextClassificationModel( @@ -62,8 +62,8 @@ async def test_02_predict(self): async with self.sources as sources, self.model as model: target_name = model.config.predict.NAME async with sources() as sctx, model() as mctx: - async for repo in mctx.predict(sctx.repos()): - prediction = repo.prediction(target_name).value + async for record in mctx.predict(sctx.records()): + prediction = record.prediction(target_name).value self.assertIn(prediction, ["0", "1"]) diff --git a/service/http/dffml_service_http/routes.py b/service/http/dffml_service_http/routes.py index 2639e53e6b..631703173b 100644 --- a/service/http/dffml_service_http/routes.py +++ b/service/http/dffml_service_http/routes.py @@ -13,7 +13,7 @@ from aiohttp import web import aiohttp_cors -from dffml.repo import Repo +from dffml.record import Record from dffml.df.types import DataFlow, Input from dffml.df.multicomm import MultiCommInAtomicMode, BaseMultiCommContext from dffml.df.memory import ( @@ -54,15 +54,15 @@ def default(self, obj): @dataclass class IterkeyEntry: """ - iterkeys will hold the first repo within the next iteration and ths - iterator. The first time around the first repo is None, since we haven't + iterkeys will hold the first record within the next iteration and ths + iterator. The first time around the first record is None, since we haven't iterated yet. 
We do this because if the chunk_size is the same as the number of iterations then we'll need to iterate one time more than chunk_size in order to hit the StopAsyncIteration exception. """ - first: Union[Repo, None] - repos: AsyncIterator[Repo] + first: Union[Record, None] + records: AsyncIterator[Record] def mcctx_route(handler): @@ -469,78 +469,78 @@ async def multicomm_register(self, request, mcctx): return web.json_response(OK) @sctx_route - async def source_repo(self, request, sctx): + async def source_record(self, request, sctx): return web.json_response( - (await sctx.repo(request.match_info["key"])).export() + (await sctx.record(request.match_info["key"])).export() ) @sctx_route async def source_update(self, request, sctx): await sctx.update( - Repo(request.match_info["key"], data=await request.json()) + Record(request.match_info["key"], data=await request.json()) ) return web.json_response(OK) - async def _iter_repos(self, iterkey, chunk_size) -> List[Repo]: + async def _iter_records(self, iterkey, chunk_size) -> List[Record]: """ - Iterates over a repos async generator and returns a list with chunk_size - or less repos in it (if iteration completed). It also returns the + Iterates over a records async generator and returns a list with chunk_size + or less records in it (if iteration completed). It also returns the iterkey, which will be None if iteration completed. """ - if not iterkey in self.app["source_repos_iterkeys"]: + if not iterkey in self.app["source_records_iterkeys"]: raise web.HTTPNotFound(reason="iterkey not found") - entry = self.app["source_repos_iterkeys"][iterkey] - # Make repo_list start with the last repo that was retrieved from - # iteration the last time _iter_repos was called. If this is the first - # time then repo_list is an empty list - repo_list = [entry.first] if entry.first is not None else [] + entry = self.app["source_records_iterkeys"][iterkey] + # Make record_list start with the last record that was retrieved from + # iteration the last time _iter_records was called. 
If this is the first + # time then record_list is an empty list + record_list = [entry.first] if entry.first is not None else [] # We need to iterate one more time than chunk_size the first time - # _iter_repos is called so that we return the chunk_size and set + # _iter_records is called so that we return the chunk_size and set # entry.first for the subsequent iterations - iter_until = chunk_size + 1 if not repo_list else chunk_size + iter_until = chunk_size + 1 if not record_list else chunk_size for i in range(0, iter_until): try: - # On last iteration make the repo the first repo in the next + # On last iteration make the record the first record in the next # iteration if i == (iter_until - 1): - entry.first = await entry.repos.__anext__() + entry.first = await entry.records.__anext__() else: - repo_list.append(await entry.repos.__anext__()) + record_list.append(await entry.records.__anext__()) except StopAsyncIteration: - # If we're done iterating over repos and can remove the + # If we're done iterating over records and can remove the # reference to the iterator from iterkeys - del self.app["source_repos_iterkeys"][iterkey] + del self.app["source_records_iterkeys"][iterkey] iterkey = None break - return iterkey, repo_list + return iterkey, record_list @sctx_route - async def source_repos(self, request, sctx): + async def source_records(self, request, sctx): iterkey = secrets.token_hex(nbytes=SECRETS_TOKEN_BYTES) # TODO Add test that iterkey is removed on last iteration - self.app["source_repos_iterkeys"][iterkey] = IterkeyEntry( - first=None, repos=sctx.repos() + self.app["source_records_iterkeys"][iterkey] = IterkeyEntry( + first=None, records=sctx.records() ) - iterkey, repos = await self._iter_repos( + iterkey, records = await self._iter_records( iterkey, int(request.match_info["chunk_size"]) ) return web.json_response( { "iterkey": iterkey, - "repos": {repo.key: repo.export() for repo in repos}, + "records": {record.key: record.export() for record in records}, } ) @sctx_route - async def source_repos_iter(self, request, sctx): - iterkey, repos = await self._iter_repos( + async def source_records_iter(self, request, sctx): + iterkey, records = await self._iter_records( request.match_info["iterkey"], int(request.match_info["chunk_size"]), ) return web.json_response( { "iterkey": iterkey, - "repos": {repo.key: repo.export() for repo in repos}, + "records": {record.key: record.export() for record in records}, } ) @@ -589,23 +589,23 @@ async def model_predict(self, request, mctx): {"error": "Multiple request iteration not yet supported"}, status=HTTPStatus.BAD_REQUEST, ) - # Get the repos - repos: Dict[str, Repo] = { - key: Repo(key, data=repo_data) - for key, repo_data in (await request.json()).items() + # Get the records + records: Dict[str, Record] = { + key: Record(key, data=record_data) + for key, record_data in (await request.json()).items() } - # Create an async generator to feed repos - async def repo_gen(): - for repo in repos.values(): - yield repo + # Create an async generator to feed records + async def record_gen(): + for record in records.values(): + yield record # Feed them through prediction return web.json_response( { "iterkey": None, - "repos": { - repo.key: repo.export() - async for repo in mctx.predict(repo_gen()) + "records": { + record.key: record.export() + async for record in mctx.predict(record_gen()) }, } ) @@ -634,7 +634,7 @@ async def setup(self, **kwargs): self.app["multicomm_routes"] = {} self.app["sources"] = {} self.app["source_contexts"] = {} - 
self.app["source_repos_iterkeys"] = {} + self.app["source_records_iterkeys"] = {} self.app["models"] = {} self.app["model_contexts"] = {} self.app.update(kwargs) @@ -676,17 +676,17 @@ async def setup(self, **kwargs): self.multicomm_register, ), # Source APIs - ("GET", "/source/{label}/repo/{key}", self.source_repo), + ("GET", "/source/{label}/record/{key}", self.source_record), ("POST", "/source/{label}/update/{key}", self.source_update), ( "GET", - "/source/{label}/repos/{chunk_size}", - self.source_repos, + "/source/{label}/records/{chunk_size}", + self.source_records, ), ( "GET", - "/source/{label}/repos/{iterkey}/{chunk_size}", - self.source_repos_iter, + "/source/{label}/records/{iterkey}/{chunk_size}", + self.source_records_iter, ), # TODO route to delete iterkey before iteration has completed # Model APIs diff --git a/service/http/docs/api.rst b/service/http/docs/api.rst index ec7ce2c135..dd7dcfd81f 100644 --- a/service/http/docs/api.rst +++ b/service/http/docs/api.rst @@ -331,40 +331,40 @@ return a 404, Not Found response. {"error": "Source not loaded"} -.. _repo: +.. _record: -Repo +Record ~~~~ -Access a repo by it's unique key. The response will be the JSON representation -of the repo. Here's an example response for a ``GET`` request. +Access a record by it's unique key. The response will be the JSON representation +of the record. Here's an example response for a ``GET`` request. -- ``/source/{ctx_label}/repo/{key}`` +- ``/source/{ctx_label}/record/{key}`` .. code-block:: json { - "key": "myrepo", + "key": "myrecord", "features": { "myfeature": "somevalue" } } -Just as with DFFML, you'll still get a repo even if the repo doesn't exist +Just as with DFFML, you'll still get a record even if the record doesn't exist within the source. However, it will only contain the ``key``. Update ~~~~~~ -Update a repo by it's unique key. ``POST`` data in the same format received from -repo. +Update a record by it's unique key. ``POST`` data in the same format received from +record. - ``/source/{ctx_label}/update/{key}`` .. code-block:: json { - "key": "myrepo", + "key": "myrecord", "features": { "myfeature": "somevalue" } @@ -377,35 +377,35 @@ response. {"error": null} -Repos +Records ~~~~~ Initially, client makes a ``GET`` request to the API with the ``chunk_size`` for -the first iteration. ``chunk_size`` is the number of repos to return in one +the first iteration. ``chunk_size`` is the number of records to return in one iteration. The response object will have two properties, ``iterkey`` and -``repos``. +``records``. -``repos`` is a key value mapping of repo ``key``'s to their JSON serialized -repo object. +``records`` is a key value mapping of record ``key``'s to their JSON serialized +record object. -``iterkey`` will be ``null`` if there are no more repos in the source. If -``iterkey`` is not ``null`` then there are more repos to iterate over. The API +``iterkey`` will be ``null`` if there are no more records in the source. If +``iterkey`` is not ``null`` then there are more records to iterate over. The API should be called using the response's ``iterkey`` value until the response contains an ``iterkey`` value of ``null``. -Sample response where ``chunk_size`` is ``1`` and there are more repos to +Sample response where ``chunk_size`` is ``1`` and there are more records to iterate over. We continue making ``GET`` requests until ``iterkey`` is ``null``. 
-- ``/source/{ctx_label}/repos/{chunk_size}`` -- ``/source/{ctx_label}/repos/{iterkey}/{chunk_size}`` +- ``/source/{ctx_label}/records/{chunk_size}`` +- ``/source/{ctx_label}/records/{iterkey}/{chunk_size}`` .. code-block:: json { "iterkey": "1a164836c6d8a27fdf9cd12688440aaa16a852fd1814b170c924a89fba4e084c8ea7522c34f9f5a539803d6237238e90", - "repos": { - "myrepo": { - "key": "myrepo", + "records": { + "myrecord": { + "key": "myrecord", "features": { "myfeature": "somevalue" } @@ -419,9 +419,9 @@ Sample response where the end of iteration has been reached. { "iterkey": null, - "repos": { - "anotherrepo": { - "key": "anotherrepo", + "records": { + "anotherrecord": { + "key": "anotherrecord", "features": { "myfeature": "othervalue" } @@ -501,8 +501,8 @@ Predict ~~~~~~~ To use a model for prediction, send a ``POST`` request to the following URL with -the body being a JSON object mapping ``key`` of the repo to the JSON -representation of :py:class:`dffml.repo.Repo` as received by the source repo +the body being a JSON object mapping ``key`` of the record to the JSON +representation of :py:class:`dffml.record.Record` as received by the source record endpoint. - ``/model/{ctx_label}/predict/0`` @@ -523,7 +523,7 @@ Sample response. { "iterkey": null, - "repos": { + "records": { "42": { "key": "42", "features": { diff --git a/service/http/examples/web/api.js b/service/http/examples/web/api.js index aedf34ac8d..1acf936b98 100644 --- a/service/http/examples/web/api.js +++ b/service/http/examples/web/api.js @@ -22,18 +22,18 @@ class DFFMLHTTPAPIObjectContext { } class DFFMLHTTPAPISourceContext extends DFFMLHTTPAPIObjectContext { - async repos(chunk_size) { + async records(chunk_size) { // TODO https://www.codementor.io/tiagolopesferreira/asynchronous-iterators-in-javascript-jl1yg8la1 - var response = await this.api.request("/source/" + this.label + "/repos/" + chunk_size); + var response = await this.api.request("/source/" + this.label + "/records/" + chunk_size); response = await response.json(); - return response.repos; + return response.records; } - async update(repo) { + async update(record) { // https://developer.mozilla.org/en-US/docs/Web/API/Fetch_API/Using_Fetch - var response = await this.api.request("/source/" + this.label + "/update/" + repo.key, { + var response = await this.api.request("/source/" + this.label + "/update/" + record.key, { method: 'POST', // *GET, POST, PUT, DELETE, etc. 
mode: 'cors', // no-cors, cors, *same-origin cache: 'no-cache', // *default, no-cache, reload, force-cache, only-if-cached @@ -43,7 +43,7 @@ class DFFMLHTTPAPISourceContext extends DFFMLHTTPAPIObjectContext { }, redirect: 'follow', // manual, *follow, error referrer: 'no-referrer', // no-referrer, *client - body: JSON.stringify(repo), // body data type must match "Content-Type" header + body: JSON.stringify(record), // body data type must match "Content-Type" header }); await response.json(); @@ -72,7 +72,7 @@ class DFFMLHTTPAPIModelContext extends DFFMLHTTPAPIObjectContext { response = await response.json(); - return response.repos; + return response.records; } async accuracy(sources) { @@ -96,10 +96,10 @@ class DFFMLHTTPAPIModelContext extends DFFMLHTTPAPIObjectContext { response = await response.json(); - return response.repos; + return response.records; } - async predict(repos) { + async predict(records) { var response = await this.api.request("/model/" + this.label + "/predict/0", { method: 'POST', mode: 'cors', @@ -110,12 +110,12 @@ class DFFMLHTTPAPIModelContext extends DFFMLHTTPAPIObjectContext { }, redirect: 'follow', referrer: 'no-referrer', - body: JSON.stringify(repos), + body: JSON.stringify(records), }); response = await response.json(); - return response.repos; + return response.records; } } @@ -298,15 +298,15 @@ var runit = async function() { var test_sctx = await test_source.context("my_test_dataset_context"); console.log("Created test_sctx", test_sctx); - // Create an array of all the repos for fun - var repos = await training_sctx.repos(100); - console.log("Training repos", repos); + // Create an array of all the records for fun + var records = await training_sctx.records(100); + console.log("Training records", records); - var repos_array = []; - for (var key of Object.keys(repos)) { - repos_array.push(repos[key]); + var records_array = []; + for (var key of Object.keys(records)) { + records_array.push(records[key]); } - console.log("Array of training repos", repos_array); + console.log("Array of training records", records_array); // Create a model var model = api.model(); diff --git a/service/http/tests/test_routes.py b/service/http/tests/test_routes.py index c7caa745f4..2c59f6324c 100644 --- a/service/http/tests/test_routes.py +++ b/service/http/tests/test_routes.py @@ -10,7 +10,7 @@ import aiohttp from dffml.base import config -from dffml.repo import Repo +from dffml.record import Record from dffml.df.base import BaseConfig from dffml.operation.output import GetSingle from dffml.util.entrypoint import EntrypointNotFound @@ -56,24 +56,26 @@ class FakeModelConfig: class FakeModelContext(ModelContext): def __init__(self, parent): super().__init__(parent) - self.trained_on: Dict[str, Repo] = {} + self.trained_on: Dict[str, Record] = {} async def train(self, sources: Sources): - async for repo in sources.repos(): - self.trained_on[repo.key] = repo + async for record in sources.records(): + self.trained_on[record.key] = record async def accuracy(self, sources: Sources) -> Accuracy: accuracy: int = 0 - async for repo in sources.repos(): - accuracy += int(repo.key) + async for record in sources.records(): + accuracy += int(record.key) return Accuracy(accuracy) - async def predict(self, repos: AsyncIterator[Repo]) -> AsyncIterator[Repo]: - async for repo in repos: - repo.predicted( - "Salary", repo.feature("by_ten") * 10, float(repo.key) + async def predict( + self, records: AsyncIterator[Record] + ) -> AsyncIterator[Record]: + async for record in records: + record.predicted( 
+ "Salary", record.feature("by_ten") * 10, float(record.key) ) - yield repo + yield record @entrypoint("fake") @@ -128,9 +130,9 @@ async def post(self, path, *args, **kwargs): async def _add_memory_source(self): async with MemorySource( MemorySourceConfig( - repos=[ - Repo(str(i), data={"features": {"by_ten": i * 10}}) - for i in range(0, self.num_repos) + records=[ + Record(str(i), data={"features": {"by_ten": i * 10}}) + for i in range(0, self.num_records) ] ) ) as source: @@ -445,7 +447,7 @@ class TestRoutesSource(TestRoutesRunning, AsyncTestCase): async def setUp(self): await super().setUp() self.slabel: str = "mydataset" - self.num_repos: int = 100 + self.num_records: int = 100 self.add_memory_source = await self.exit_stack.enter_async_context( self._add_memory_source() ) @@ -454,74 +456,80 @@ async def test_source_not_found(self): with self.assertRaisesRegex( ServerException, list(SOURCE_NOT_LOADED.values())[0] ): - async with self.get("/source/non-existant/repo/key"): + async with self.get("/source/non-existant/record/key"): pass # pramga: no cov - async def test_repo(self): - for i in range(0, self.num_repos): - async with self.get(f"/source/{self.slabel}/repo/{i}") as r: + async def test_record(self): + for i in range(0, self.num_records): + async with self.get(f"/source/{self.slabel}/record/{i}") as r: self.assertEqual( - await r.json(), self.source.config.repos[i].export() + await r.json(), self.source.config.records[i].export() ) async def test_update(self): key = "1" - new_repo = Repo(key, data={"features": {"by_ten": 10}}) + new_record = Record(key, data={"features": {"by_ten": 10}}) async with self.post( - f"/source/{self.slabel}/update/{key}", json=new_repo.export() + f"/source/{self.slabel}/update/{key}", json=new_record.export() ) as r: self.assertEqual(await r.json(), OK) - self.assertEqual((await self.sctx.repo(key)).feature("by_ten"), 10) + self.assertEqual((await self.sctx.record(key)).feature("by_ten"), 10) def _check_iter_response(self, response): self.assertIn("iterkey", response) - self.assertIn("repos", response) - for key, repo in response["repos"].items(): - self.assertEqual(repo, self.source.config.repos[int(key)].export()) + self.assertIn("records", response) + for key, record in response["records"].items(): + self.assertEqual( + record, self.source.config.records[int(key)].export() + ) - async def test_repos(self): - chunk_size = self.num_repos - async with self.get(f"/source/{self.slabel}/repos/{chunk_size}") as r: + async def test_records(self): + chunk_size = self.num_records + async with self.get( + f"/source/{self.slabel}/records/{chunk_size}" + ) as r: response = await r.json() self._check_iter_response(response) self.assertEqual(response["iterkey"], None) - got = len(response["repos"].values()) + got = len(response["records"].values()) self.assertEqual( got, - self.num_repos, - f"Not all repos were received: got {got}, want: {self.num_repos}", + self.num_records, + f"Not all records were received: got {got}, want: {self.num_records}", ) - async def test_repos_iterkey(self): + async def test_records_iterkey(self): chunk_size = 7 - got_repos = {} - async with self.get(f"/source/{self.slabel}/repos/{chunk_size}") as r: + got_records = {} + async with self.get( + f"/source/{self.slabel}/records/{chunk_size}" + ) as r: response = await r.json() self._check_iter_response(response) iterkey = response["iterkey"] self.assertNotEqual(iterkey, None) - got_repos.update(response["repos"]) + got_records.update(response["records"]) while iterkey is not None: async 
with self.get( - f"/source/{self.slabel}/repos/{iterkey}/{chunk_size}" + f"/source/{self.slabel}/records/{iterkey}/{chunk_size}" ) as r: response = await r.json() self._check_iter_response(response) - got_repos.update(response["repos"]) + got_records.update(response["records"]) iterkey = response["iterkey"] - got = len(got_repos.keys()) + got = len(got_records.keys()) self.assertEqual( got, - self.num_repos, - f"Not all repos were received: got {got}, want: {self.num_repos}", + self.num_records, + f"Not all records were received: got {got}, want: {self.num_records}", ) - async def test_repos_iterkey_not_found(self): - chunk_size = self.num_repos + async def test_records_iterkey_not_found(self): + chunk_size = self.num_records iterkey = "feedface" with self.assertRaisesRegex(ServerException, "iterkey not found"): async with self.get( - f"/source/{self.slabel}/repos/{iterkey}/{chunk_size}" + f"/source/{self.slabel}/records/{iterkey}/{chunk_size}" ) as r: pass # pramga: no cov @@ -531,7 +539,7 @@ async def setUp(self): await super().setUp() self.mlabel: str = "mymodel" self.slabel: str = "mydataset" - self.num_repos: int = 100 + self.num_records: int = 100 self.add_memory_source = await self.exit_stack.enter_async_context( self._add_memory_source() ) @@ -573,7 +581,7 @@ async def test_train(self): f"/model/{self.mlabel}/train", json=[self.slabel] ) as r: self.assertEqual(await r.json(), OK) - for i in range(0, self.num_repos): + for i in range(0, self.num_records): self.assertIn(str(i), self.mctx.trained_on) async def test_accuracy(self): @@ -582,39 +590,39 @@ async def test_accuracy(self): ) as r: self.assertEqual( await r.json(), - {"accuracy": float(sum(range(0, self.num_repos)))}, + {"accuracy": float(sum(range(0, self.num_records)))}, ) async def test_predict(self): - repos: Dict[str, Repo] = { - repo.key: repo.export() async for repo in self.sctx.repos() + records: Dict[str, Record] = { + record.key: record.export() async for record in self.sctx.records() } async with self.post( - f"/model/{self.mlabel}/predict/0", json=repos + f"/model/{self.mlabel}/predict/0", json=records ) as r: i: int = 0 response = await r.json() - for key, repo_data in response["repos"].items(): - repo = Repo(key, data=repo_data) - self.assertEqual(int(repo.key), i) + for key, record_data in response["records"].items(): + record = Record(key, data=record_data) + self.assertEqual(int(record.key), i) self.assertEqual( - repo.feature("by_ten"), - repo.prediction("Salary").value / 10, + record.feature("by_ten"), + record.prediction("Salary").value / 10, ) self.assertEqual( - float(repo.key), repo.prediction("Salary").confidence + float(record.key), record.prediction("Salary").confidence ) i += 1 - self.assertEqual(i, self.num_repos) + self.assertEqual(i, self.num_records) async def test_predict_chunk_size_unsupported(self): - repos: Dict[str, Repo] = { - repo.key: repo.export() async for repo in self.sctx.repos() + records: Dict[str, Record] = { + record.key: record.export() async for record in self.sctx.records() } with self.assertRaisesRegex( ServerException, "Multiple request iteration not yet supported" ): async with self.post( - f"/model/{self.mlabel}/predict/7", json=repos + f"/model/{self.mlabel}/predict/7", json=records ) as r: pass # pramga: no cov diff --git a/source/mysql/dffml_source_mysql/source.py b/source/mysql/dffml_source_mysql/source.py index 9bb4e86106..57d23ffa03 100644 --- a/source/mysql/dffml_source_mysql/source.py +++ b/source/mysql/dffml_source_mysql/source.py @@ -5,7 +5,7 @@ import aiomysql 
from dffml.base import BaseConfig -from dffml.repo import Repo +from dffml.record import Record from dffml.source.source import BaseSourceContext, BaseSource from dffml.util.cli.arg import Arg from dffml.util.entrypoint import entrypoint @@ -18,75 +18,78 @@ class MySQLSourceConfig(BaseConfig, NamedTuple): password: str db: str update_query: str - repos_query: str - repo_query: str + records_query: str + record_query: str model_columns: List[str] ca: str = None class MySQLSourceContext(BaseSourceContext): - async def update(self, repo: Repo): + async def update(self, record: Record): update_query = self.parent.config.update_query model_columns = self.parent.config.model_columns.split() key_value_pairs = collections.OrderedDict() for key in model_columns: if key.startswith("feature_"): modified_key = key.replace("feature_", "") - key_value_pairs[modified_key] = repo.data.features[ + key_value_pairs[modified_key] = record.data.features[ modified_key ] elif "_value" in key: target = key.replace("_value", "") - if repo.data.prediction: - key_value_pairs[key] = repo.data.prediction[target][ + if record.data.prediction: + key_value_pairs[key] = record.data.prediction[target][ "value" ] else: key_value_pairs[key] = "undetermined" elif "_confidence" in key: target = key.replace("_confidence", "") - if repo.data.prediction: - key_value_pairs[key] = repo.data.prediction[target][ + if record.data.prediction: + key_value_pairs[key] = record.data.prediction[target][ "confidence" ] else: key_value_pairs[key] = 1 else: - key_value_pairs[key] = repo.data.__dict__[key] + key_value_pairs[key] = record.data.__dict__[key] db = self.conn await db.execute( update_query, (list(key_value_pairs.values()) + list(key_value_pairs.values())), ) - self.logger.debug("update: %s", await self.repo(repo.key)) + self.logger.debug("update: %s", await self.record(record.key)) - def convert_to_repo(self, result): - modified_repo = {"key": "", "data": {"features": {}, "prediction": {}}} + def convert_to_record(self, result): + modified_record = { + "key": "", + "data": {"features": {}, "prediction": {}}, + } for key, value in result.items(): if key.startswith("feature_"): - modified_repo["data"]["features"][ + modified_record["data"]["features"][ key.replace("feature_", "") ] = value elif ("_value" in key) or ("_confidence" in key): target = key.replace("_value", "").replace("_confidence", "") - modified_repo["data"]["prediction"][target] = { + modified_record["data"]["prediction"][target] = { "value": result[target + "_value"], "confidence": result[target + "_confidence"], } else: - modified_repo[key] = value - return Repo(modified_repo["key"], data=modified_repo["data"]) + modified_record[key] = value + return Record(modified_record["key"], data=modified_record["data"]) - async def repos(self) -> AsyncIterator[Repo]: - query = self.parent.config.repos_query + async def records(self) -> AsyncIterator[Record]: + query = self.parent.config.records_query await self.conn.execute(query) result = await self.conn.fetchall() - for repo in result: - yield self.convert_to_repo(repo) + for record in result: + yield self.convert_to_record(record) - async def repo(self, key: str): - query = self.parent.config.repo_query - repo = Repo(key) + async def record(self, key: str): + query = self.parent.config.record_query + record = Record(key) db = self.conn await db.execute(query, (key,)) row = await db.fetchone() @@ -103,13 +106,13 @@ async def repo(self, key: str): "value": row[target + "_value"], "confidence": row[target + "_confidence"], } - 
repo.merge( - Repo( + record.merge( + Record( row["key"], data={"features": features, "prediction": predictions}, ) ) - return repo + return record async def __aenter__(self) -> "MySQLSourceContext": self.__conn = self.parent.db.cursor(aiomysql.DictCursor) @@ -164,19 +167,19 @@ def args(cls, args, *above) -> Dict[str, Arg]: cls.config_set( args, above, - "repos-query", + "records-query", Arg( type=str, - help="SELECT `key` as key, data_1 as feature_1, data_2 as feature_2 FROM repo_data", + help="SELECT `key` as key, data_1 as feature_1, data_2 as feature_2 FROM record_data", ), ) cls.config_set( args, above, - "repo-query", + "record-query", Arg( type=str, - help="SELECT `key` as key, data_1 as feature_1, data_2 as feature_2 FROM repo_data WHERE `key`=%s", + help="SELECT `key` as key, data_1 as feature_1, data_2 as feature_2 FROM record_data WHERE `key`=%s", ), ) cls.config_set( @@ -185,7 +188,7 @@ def args(cls, args, *above) -> Dict[str, Arg]: "update-query", Arg( type=str, - help="INSERT INTO repo_data (`key`, data_1, data_2) VALUES(%s, %s, %s) ON DUPLICATE KEY UPDATE data_1 = %s, data_2=%s", + help="INSERT INTO record_data (`key`, data_1, data_2) VALUES(%s, %s, %s) ON DUPLICATE KEY UPDATE data_1 = %s, data_2=%s", ), ) cls.config_set( @@ -210,8 +213,8 @@ def config(cls, config, *above): user=cls.config_get(config, above, "user"), password=cls.config_get(config, above, "password"), db=cls.config_get(config, above, "db"), - repos_query=cls.config_get(config, above, "repos-query"), - repo_query=cls.config_get(config, above, "repo-query"), + records_query=cls.config_get(config, above, "records-query"), + record_query=cls.config_get(config, above, "record-query"), update_query=cls.config_get(config, above, "update-query"), model_columns=cls.config_get(config, above, "model-columns"), ca=cls.config_get(config, above, "ca"), diff --git a/source/mysql/tests/test_source.py b/source/mysql/tests/test_source.py index 7f2d06ce75..324768ae99 100644 --- a/source/mysql/tests/test_source.py +++ b/source/mysql/tests/test_source.py @@ -14,8 +14,8 @@ class TestMySQLSource(AsyncTestCase, SourceTest): SQL_SETUP = """ -DROP TABLE IF EXISTS `repo_data`; -CREATE TABLE `repo_data` ( +DROP TABLE IF EXISTS `record_data`; +CREATE TABLE `record_data` ( `key` varchar(100) NOT NULL, `feature_PetalLength` float DEFAULT NULL, `feature_PetalWidth` float DEFAULT NULL, @@ -41,9 +41,9 @@ def setUpClass(cls): user=DOCKER_ENV["MYSQL_USER"], password=DOCKER_ENV["MYSQL_PASSWORD"], db=DOCKER_ENV["MYSQL_DATABASE"], - repo_query="select * from repo_data where `key`=%s", - update_query="""insert into repo_data (`key`,`feature_PetalLength`,`feature_PetalWidth`, `feature_SepalLength`, `feature_SepalWidth`, `target_name_confidence`, `target_name_value`) values (%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE `key`=%s, `feature_PetalLength`=%s, `feature_PetalWidth`=%s, `feature_SepalLength`=%s, `feature_SepalWidth`=%s, `target_name_confidence`=%s, `target_name_value`=%s""", - repos_query="select * from repo_data", + record_query="select * from record_data where `key`=%s", + update_query="""insert into record_data (`key`,`feature_PetalLength`,`feature_PetalWidth`, `feature_SepalLength`, `feature_SepalWidth`, `target_name_confidence`, `target_name_value`) values (%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE `key`=%s, `feature_PetalLength`=%s, `feature_PetalWidth`=%s, `feature_SepalLength`=%s, `feature_SepalWidth`=%s, `target_name_confidence`=%s, `target_name_value`=%s""", + records_query="select * from record_data", model_columns="key 
feature_PetalLength feature_PetalWidth feature_SepalLength feature_SepalWidth target_name_confidence target_name_value", ca=cls.ca, ) diff --git a/tests/integration/test_cli.py b/tests/integration/test_cli.py index c3d88ceccb..8c2de68442 100644 --- a/tests/integration/test_cli.py +++ b/tests/integration/test_cli.py @@ -11,15 +11,15 @@ class TestList(IntegrationCLITestCase): - async def test_repos(self): + async def test_records(self): keys = ["A", "B", "C"] with contextlib.redirect_stdout(self.stdout): await CLI.cli( "list", - "repos", + "records", "-sources", "feed=memory", - "-source-repos", + "-source-records", *keys, ) stdout = self.stdout.getvalue() @@ -37,7 +37,7 @@ async def test_memory_to_json(self): "src=memory", "-source-dest-filename", filename, - "-source-src-repos", + "-source-src-records", *keys, "-source-src-allowempty", "-source-dest-allowempty", @@ -47,7 +47,7 @@ async def test_memory_to_json(self): with contextlib.redirect_stdout(self.stdout): await CLI.cli( "list", - "repos", + "records", "-sources", "tmp=json", "-source-tmp-filename", @@ -66,7 +66,7 @@ async def test_memory_to_csv(self): "src=memory", "-source-dest-filename", filename, - "-source-src-repos", + "-source-src-records", *keys, "-source-src-allowempty", "-source-dest-allowempty", diff --git a/tests/integration/test_dataflow.py b/tests/integration/test_dataflow.py index 9fc9689226..a8e1f01d25 100644 --- a/tests/integration/test_dataflow.py +++ b/tests/integration/test_dataflow.py @@ -90,7 +90,7 @@ async def test_dataflow_usage_example(self): orig = self.mktempfile() + ".json" pathlib.Path(orig).write_text(json.dumps(self.DATAFLOW.export())) # Import from feature/git - transform_to_repo = Operation.load("dffml.mapping.create") + transform_to_record = Operation.load("dffml.mapping.create") lines_of_code_by_language, lines_of_code_to_comments = list( load( "dffml_feature_git.feature.operations:lines_of_code_by_language", @@ -100,7 +100,7 @@ async def test_dataflow_usage_example(self): ) # Create new dataflow override = DataFlow.auto( - transform_to_repo, + transform_to_record, lines_of_code_by_language, lines_of_code_to_comments, ) diff --git a/tests/operation/test_dataflow.py b/tests/operation/test_dataflow.py index c953f5149f..ca3394816c 100644 --- a/tests/operation/test_dataflow.py +++ b/tests/operation/test_dataflow.py @@ -7,7 +7,7 @@ from ..test_df import DATAFLOW, add, mult, parse_line -class TestRunDataFlowOnRepo(AsyncTestCase): +class TestRunDataFlowOnRecord(AsyncTestCase): async def test_run(self): test_dataflow = DataFlow( operations={ diff --git a/tests/source/test_csv.py b/tests/source/test_csv.py index e5b2e8a322..8e9580f9d6 100644 --- a/tests/source/test_csv.py +++ b/tests/source/test_csv.py @@ -8,7 +8,7 @@ from dffml.source.csv import CSVSource, CSVSourceConfig from dffml.util.testing.source import FileSourceTest from dffml.util.asynctestcase import AsyncTestCase -from dffml.repo import Repo +from dffml.record import Record from dffml.util.cli.arg import parse_unknown @@ -29,17 +29,17 @@ async def test_tag(self): async with untagged, tagged: async with untagged() as uctx, tagged() as lctx: await uctx.update( - Repo("0", data={"features": {"feed": 1}}) + Record("0", data={"features": {"feed": 1}}) ) await lctx.update( - Repo("0", data={"features": {"face": 2}}) + Record("0", data={"features": {"face": 2}}) ) # async with untagged, tagged: async with untagged() as uctx, tagged() as lctx: - repo = await uctx.repo("0") - self.assertIn("feed", repo.features()) - repo = await lctx.repo("0") - 
self.assertIn("face", repo.features()) + record = await uctx.record("0") + self.assertIn("feed", record.features()) + record = await lctx.record("0") + self.assertIn("face", record.features()) with open(self.testfile, "r") as fd: dict_reader = csv.DictReader(fd, dialect="strip") rows = {row["tag"]: {row["key"]: row} for row in dict_reader} @@ -95,7 +95,7 @@ async def test_key(self): CSVSourceConfig(filename=fileobj.name, key="KeyHeader") ) as source: async with source() as sctx: - repo_a = await sctx.repo("a") - repo_b = await sctx.repo("b") - self.assertEqual(repo_a.feature("ValueColumn"), 42) - self.assertEqual(repo_b.feature("ValueColumn"), 420) + record_a = await sctx.record("a") + record_b = await sctx.record("b") + self.assertEqual(record_a.feature("ValueColumn"), 42) + self.assertEqual(record_b.feature("ValueColumn"), 420) diff --git a/tests/source/test_file.py b/tests/source/test_file.py index 655c44f8e4..3c0c8da9c1 100644 --- a/tests/source/test_file.py +++ b/tests/source/test_file.py @@ -4,7 +4,7 @@ from contextlib import contextmanager from typing import AsyncIterator -from dffml.repo import Repo +from dffml.record import Record from dffml.source.source import BaseSourceContext from dffml.source.file import FileSource, FileSourceConfig from dffml.util.cli.arg import Arg, parse_unknown @@ -12,13 +12,13 @@ class FakeFileSourceContext(BaseSourceContext): - async def update(self, repo: Repo): + async def update(self, record: Record): pass # pragma: no cover - async def repos(self) -> AsyncIterator[Repo]: - yield Repo("") # pragma: no cover + async def records(self) -> AsyncIterator[Record]: + yield Record("") # pragma: no cover - async def repo(self, key: str): + async def record(self, key: str): pass # pragma: no cover diff --git a/tests/source/test_idx.py b/tests/source/test_idx.py index 2ed95e27bc..c1e8c13c0a 100644 --- a/tests/source/test_idx.py +++ b/tests/source/test_idx.py @@ -34,10 +34,10 @@ async def test_idx1(self, filename): IDX1SourceConfig(filename=filename, feature=feature_name) ) as source: async with source() as sctx: - repos = [repo async for repo in sctx.repos()] - self.assertEqual(len(repos), 60000) - self.assertIn(feature_name, repos[0].features()) - self.assertEqual(repos[0].feature(feature_name), 5) + records = [record async for record in sctx.records()] + self.assertEqual(len(records), 60000) + self.assertIn(feature_name, records[0].features()) + self.assertEqual(records[0].feature(feature_name), 5) @cached_download(*IDX3_FILE, protocol_allowlist=["http://"]) async def test_idx3(self, filename): @@ -46,12 +46,14 @@ async def test_idx3(self, filename): IDX3SourceConfig(filename=filename, feature=feature_name) ) as source: async with source() as sctx: - repos = [repo async for repo in sctx.repos()] - self.assertEqual(len(repos), 60000) - self.assertIn(feature_name, repos[0].features()) + records = [record async for record in sctx.records()] + self.assertEqual(len(records), 60000) + self.assertIn(feature_name, records[0].features()) for i in range(-1, 1): with self.subTest(index=i): is_hash = hashlib.sha384( - json.dumps(repos[i].feature(feature_name)).encode() + json.dumps( + records[i].feature(feature_name) + ).encode() ).hexdigest() self.assertEqual(is_hash, IDX3_FIRST_LAST[i]) diff --git a/tests/test_cli.py b/tests/test_cli.py index 9a14ba9dcf..aa75d17cf5 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -10,7 +10,7 @@ from unittest.mock import patch from typing import List, AsyncIterator -from dffml.repo import Repo +from dffml.record import Record 
from dffml.feature import Feature, Features from dffml.source.source import Sources from dffml.source.file import FileSourceConfig @@ -30,24 +30,24 @@ from .test_df import OPERATIONS, OPIMPS -class ReposTestCase(AsyncExitStackTestCase): +class RecordsTestCase(AsyncExitStackTestCase): async def setUp(self): await super().setUp() - self.repos = [Repo(str(random.random())) for _ in range(0, 10)] + self.records = [Record(str(random.random())) for _ in range(0, 10)] self.temp_filename = self.mktempfile() self.sconfig = FileSourceConfig( filename=self.temp_filename, readwrite=True, allowempty=True ) async with JSONSource(self.sconfig) as source: async with source() as sctx: - for repo in self.repos: - await sctx.update(repo) + for record in self.records: + await sctx.update(record) contents = json.loads(Path(self.sconfig.filename).read_text()) - # Ensure there are repos in the file + # Ensure there are records in the file self.assertEqual( len(contents.get(self.sconfig.tag)), - len(self.repos), - "ReposTestCase JSON file erroneously initialized as empty", + len(self.records), + "RecordsTestCase JSON file erroneously initialized as empty", ) # TODO(p3) For some reason patching Model.load doesn't work # self._stack.enter_context(patch("dffml.model.model.Model.load", @@ -97,11 +97,13 @@ async def train(self, sources: Sources): async def accuracy(self, sources: Sources) -> AccuracyType: return AccuracyType(0.42) - async def predict(self, repos: AsyncIterator[Repo]) -> AsyncIterator[Repo]: + async def predict( + self, records: AsyncIterator[Record] + ) -> AsyncIterator[Record]: target = self.parent.config.predict.NAME - async for repo in repos: - repo.predicted(target, random.random(), float(repo.key)) - yield repo + async for record in records: + record.predicted(target, random.random(), float(record.key)) + yield record @entrypoint("fake") @@ -133,7 +135,7 @@ def opimp_load(loading=None): return OPIMPS -class TestMerge(ReposTestCase): +class TestMerge(RecordsTestCase): async def test_json_tag(self): await Merge.cli( "dest=json", @@ -155,16 +157,16 @@ async def test_json_tag(self): FileSourceConfig(filename=self.temp_filename) ) as source: async with source() as sctx: - repos = [repo async for repo in sctx.repos()] - self.assertEqual(len(repos), len(self.repos)) + records = [record async for record in sctx.records()] + self.assertEqual(len(records), len(self.records)) # Check the tagged source with self.subTest(tagged="sometag"): async with JSONSource( FileSourceConfig(filename=self.temp_filename, tag="sometag") ) as source: async with source() as sctx: - repos = [repo async for repo in sctx.repos()] - self.assertEqual(len(repos), len(self.repos)) + records = [record async for record in sctx.records()] + self.assertEqual(len(records), len(self.records)) async def test_json_to_csv(self): with non_existant_tempfile() as csv_tempfile: @@ -186,7 +188,9 @@ async def test_json_to_csv(self): self.assertEqual( contents, "key,tag\n" - + "\n".join([f"{repo.key},untagged" for repo in self.repos]) + + "\n".join( + [f"{record.key},untagged" for record in self.records] + ) + "\n", "Incorrect data in csv file", ) @@ -232,8 +236,8 @@ async def test_csv_tag(self): CSVSourceConfig(filename=csv_tempfile) ) as source: async with source() as sctx: - repos = [repo async for repo in sctx.repos()] - self.assertEqual(len(repos), len(self.repos)) + records = [record async for record in sctx.records()] + self.assertEqual(len(records), len(self.records)) contents = Path(csv_tempfile).read_text() self.assertIn("sometag", 
contents) self.assertIn("untagged", contents) @@ -243,19 +247,19 @@ async def test_csv_tag(self): CSVSourceConfig(filename=csv_tempfile, tag="sometag") ) as source: async with source() as sctx: - repos = [repo async for repo in sctx.repos()] - self.assertEqual(len(repos), len(self.repos)) + records = [record async for record in sctx.records()] + self.assertEqual(len(records), len(self.records)) contents = Path(csv_tempfile).read_text() self.assertIn("sometag", contents) self.assertIn("untagged", contents) -class TestListRepos(ReposTestCase): +class TestListRecords(RecordsTestCase): async def test_run(self): stdout = io.StringIO() with contextlib.redirect_stdout(stdout): result = await List.cli( - "repos", + "records", "-sources", "primary=json", "-source-primary-filename", @@ -263,19 +267,19 @@ async def test_run(self): "-source-primary-readwrite", "true", ) - for repo in self.repos: - self.assertIn(repo.key, stdout.getvalue()) + for record in self.records: + self.assertIn(record.key, stdout.getvalue()) -class TestDataflowRunAllRepos(ReposTestCase): +class TestDataflowRunAllRecords(RecordsTestCase): async def test_run(self): - self.repo_keys = {"add 40 and 2": 42, "multiply 42 and 10": 420} - self.repos = list(map(Repo, self.repo_keys.keys())) + self.record_keys = {"add 40 and 2": 42, "multiply 42 and 10": 420} + self.records = list(map(Record, self.record_keys.keys())) os.unlink(self.temp_filename) async with JSONSource(self.sconfig) as source: async with source() as sctx: - for repo in self.repos: - await sctx.update(repo) + for record in self.records: + await sctx.update(record) with tempfile.NamedTemporaryFile(suffix=".json") as dataflow_file: dataflow = io.StringIO() with contextlib.redirect_stdout(dataflow): @@ -289,7 +293,7 @@ async def test_run(self): dataflow_file.seek(0) results = await Dataflow.cli( "run", - "repos", + "records", "all", "-dataflow", dataflow_file.name, @@ -298,7 +302,7 @@ async def test_run(self): "primary=json", "-source-filename", self.temp_filename, - "-repo-def", + "-record-def", "calc_string", "-inputs", '["result"]=get_single_spec', @@ -306,21 +310,23 @@ async def test_run(self): results = { result.key: result.feature("result") for result in results } - for repo in self.repos: - self.assertIn(repo.key, results) - self.assertEqual(self.repo_keys[repo.key], results[repo.key]) + for record in self.records: + self.assertIn(record.key, results) + self.assertEqual( + self.record_keys[record.key], results[record.key] + ) -class TestDataflowRunRepoSet(ReposTestCase): +class TestDataflowRunRecordSet(RecordsTestCase): async def test_run(self): test_key = "multiply 42 and 10" - self.repo_keys = {"add 40 and 2": 42, "multiply 42 and 10": 420} - self.repos = list(map(Repo, self.repo_keys.keys())) + self.record_keys = {"add 40 and 2": 42, "multiply 42 and 10": 420} + self.records = list(map(Record, self.record_keys.keys())) os.unlink(self.temp_filename) async with JSONSource(self.sconfig) as source: async with source() as sctx: - for repo in self.repos: - await sctx.update(repo) + for record in self.records: + await sctx.update(record) with tempfile.NamedTemporaryFile(suffix=".json") as dataflow_file: dataflow = io.StringIO() with contextlib.redirect_stdout(dataflow): @@ -334,7 +340,7 @@ async def test_run(self): dataflow_file.seek(0) results = await Dataflow.cli( "run", - "repos", + "records", "set", "-keys", test_key, @@ -345,18 +351,18 @@ async def test_run(self): "primary=json", "-source-filename", self.temp_filename, - "-repo-def", + "-record-def", "calc_string", 
"-inputs", '["result"]=get_single_spec', ) self.assertEqual(len(results), 1) self.assertEqual( - self.repo_keys[test_key], results[0].feature("result") + self.record_keys[test_key], results[0].feature("result") ) -class TestTrain(ReposTestCase): +class TestTrain(RecordsTestCase): async def test_run(self): await Train.cli( "-sources", @@ -372,7 +378,7 @@ async def test_run(self): ) -class TestAccuracy(ReposTestCase): +class TestAccuracy(RecordsTestCase): async def test_run(self): result = await Accuracy.cli( "-sources", @@ -389,7 +395,7 @@ async def test_run(self): self.assertEqual(result, 0.42) -class TestPredict(ReposTestCase): +class TestPredict(RecordsTestCase): async def test_all(self): results = await Predict.cli( "all", @@ -405,16 +411,17 @@ async def test_all(self): "fake", ) results = { - repo.key: repo.prediction("fake").confidence for repo in results + record.key: record.prediction("fake").confidence + for record in results } - for repo in self.repos: - self.assertEqual(float(repo.key), results[repo.key]) + for record in self.records: + self.assertEqual(float(record.key), results[record.key]) - async def test_repo(self): - subset = self.repos[: (int(len(self.repos) / 2))] - subset_urls = list(map(lambda repo: repo.key, subset)) + async def test_record(self): + subset = self.records[: (int(len(self.records) / 2))] + subset_urls = list(map(lambda record: record.key, subset)) results = await Predict.cli( - "repo", + "record", "-sources", "primary=json", "-source-filename", @@ -430,7 +437,8 @@ async def test_repo(self): ) self.assertEqual(len(results), len(subset)) results = { - repo.key: repo.prediction("fake").confidence for repo in results + record.key: record.prediction("fake").confidence + for record in results } - for repo in subset: - self.assertEqual(float(repo.key), results[repo.key]) + for record in subset: + self.assertEqual(float(record.key), results[record.key]) diff --git a/tests/test_high_level.py b/tests/test_high_level.py index f665fd59eb..8c88f9aeb0 100644 --- a/tests/test_high_level.py +++ b/tests/test_high_level.py @@ -3,7 +3,7 @@ """ import importlib -from dffml.repo import Repo +from dffml.record import Record from dffml import train, accuracy, predict from dffml.source.csv import CSVSource from dffml.feature.feature import Features, DefFeature @@ -13,13 +13,13 @@ class TestML(IntegrationCLITestCase): - async def populate_source(self, source_cls, *repos, **kwargs): + async def populate_source(self, source_cls, *records, **kwargs): kwargs.setdefault("allowempty", True) kwargs.setdefault("readwrite", True) async with source_cls(**kwargs) as source: async with source() as sctx: - for repo in repos: - await sctx.update(repo) + for record in records: + await sctx.update(record) async def setUp(self): await super().setUp() @@ -32,14 +32,16 @@ async def setUp(self): self.test_data = [[4, 9, 1.0, 50], [5, 11, 1.2, 60]] self.predict_data = [[6, 13, 1.4], [7, 15, 1.6]] for use in ["train", "test", "predict"]: - repos = [ - Repo(i, data={"features": dict(zip(FEATURE_NAMES, features))}) + records = [ + Record( + i, data={"features": dict(zip(FEATURE_NAMES, features))} + ) for i, features in enumerate(getattr(self, f"{use}_data")) ] - setattr(self, f"{use}_repos", repos) + setattr(self, f"{use}_records", records) filename = self.mktempfile() + ".csv" setattr(self, f"{use}_filename", filename) - await self.populate_source(CSVSource, *repos, filename=filename) + await self.populate_source(CSVSource, *records, filename=filename) async def test_predict(self): 
self.required_plugins("dffml-model-scikit") diff --git a/tests/test_repo.py b/tests/test_record.py similarity index 86% rename from tests/test_repo.py rename to tests/test_record.py index ac575ba0f7..aa5579a5a3 100644 --- a/tests/test_repo.py +++ b/tests/test_record.py @@ -2,17 +2,17 @@ # Copyright (c) 2019 Intel Corporation import unittest -from dffml.repo import RepoPrediction, RepoData, Repo +from dffml.record import RecordPrediction, RecordData, Record -class TestRepoPrediction(unittest.TestCase): +class TestRecordPrediction(unittest.TestCase): def setUp(self): self.value = "good" self.confidence = 0.42 - self.full = RepoPrediction( + self.full = RecordPrediction( confidence=self.confidence, value=self.value ) - self.null = RepoPrediction() + self.null = RecordPrediction() def test_full_property_confidence(self): self.assertEqual(self.confidence, self.full["confidence"]) @@ -41,21 +41,21 @@ def test_null_bool_false(self): self.assertFalse(self.null) -class TestRepoData(unittest.TestCase): +class TestRecordData(unittest.TestCase): def setUp(self): - self.full = RepoData( + self.full = RecordData( key=None, features=None, prediction=None, last_updated=None ) - self.null = RepoData() + self.null = RecordData() def test_null_dict_no_prediction(self): self.assertNotIn("prediction", self.null.dict()) -class TestRepo(unittest.TestCase): +class TestRecord(unittest.TestCase): def setUp(self): - self.null = Repo("null") - self.full = Repo( + self.null = Record("null") + self.full = Record( "full", data=dict( features=dict(dead="beef"), @@ -72,10 +72,10 @@ def test_repr(self): repr(self.full) def test_str(self): - self.full.prediction = RepoPrediction() + self.full.prediction = RecordPrediction() self.assertIn("Undetermined", str(self.full)) self.full.data.prediction = { - "Prediction": RepoPrediction(value="Good") + "Prediction": RecordPrediction(value="Good") } self.assertIn("Good", str(self.full)) self.full.extra.update(dict(hi=5)) @@ -84,7 +84,7 @@ def test_str(self): self.assertNotIn("5", str(self.full)) def test_merge(self): - null = Repo("null") + null = Record("null") null.merge(self.full) self.assertIn("half", null.extra) self.assertTrue(null.extra["half"]) diff --git a/tests/util/test_cli.py b/tests/util/test_cli.py index e7dbc9d96c..7f29886bf8 100644 --- a/tests/util/test_cli.py +++ b/tests/util/test_cli.py @@ -7,7 +7,7 @@ import unittest from unittest.mock import patch -from dffml.repo import Repo +from dffml.record import Record from dffml.feature import Feature, Features from dffml.util.cli.arg import Arg, parse_unknown @@ -72,8 +72,8 @@ class UnregisteredObject(object): json.dumps(UnregisteredObject, cls=JSONEncoder), ) - def test_repo(self): - self.assertIn("face", json.dumps(Repo("face"), cls=JSONEncoder)) + def test_record(self): + self.assertIn("face", json.dumps(Record("face"), cls=JSONEncoder)) def test_feature(self): class FaceFeature(Feature):
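For readers following the rename, the `Record` API that the tests above exercise can also be used directly. The snippet below is an illustrative sketch, not part of this patch: the key, feature names, and values are invented, and it only uses calls that appear in the diff (`feature()`, `features()`, `predicted()`, `prediction()`, `merge()`, `export()`).

```python
# Illustrative sketch of the renamed Record API (not part of this patch).
# The key, feature names, and values below are made up.
from dffml.record import Record

# A record is keyed by a string and carries a mapping of feature data
record = Record("42", data={"features": {"by_ten": 420}})

# Features can be read back individually or as a whole
assert record.feature("by_ten") == 420
assert "by_ten" in record.features()

# Models attach predictions as (target, value, confidence)
record.predicted("Salary", record.feature("by_ten") * 10, 0.9)
print(record.prediction("Salary").value)       # 4200
print(record.prediction("Salary").confidence)  # 0.9

# export() yields the JSON-serializable form the HTTP service sends and
# receives; merge() folds data from another record into this one
print(record.export())
record.merge(Record("42", data={"features": {"another": 1}}))
print(record.features())
```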
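The renamed record endpoints documented in `service/http/docs/api.rst` above keep the same chunked iteration protocol. As a rough sketch (not part of this patch), a Python client could follow `iterkey` until the service returns `null`; the base URL and source label here are assumptions, and `aiohttp` is used purely for illustration.

```python
# Sketch of a client for the records iteration endpoints (not part of this
# patch). BASE and LABEL are assumptions; adjust them to however the HTTP
# service was started.
import asyncio
import aiohttp

BASE = "http://127.0.0.1:8080"
LABEL = "mydataset"


async def iter_records(session, chunk_size=10):
    """Yield (key, record) pairs, following iterkey until the server sends null."""
    url = f"{BASE}/source/{LABEL}/records/{chunk_size}"
    while url is not None:
        async with session.get(url) as resp:
            body = await resp.json()
        for key, record in body["records"].items():
            yield key, record
        iterkey = body["iterkey"]
        # A null iterkey means there are no more records to fetch
        url = (
            f"{BASE}/source/{LABEL}/records/{iterkey}/{chunk_size}"
            if iterkey is not None
            else None
        )


async def main():
    async with aiohttp.ClientSession() as session:
        async for key, record in iter_records(session):
            print(key, record["features"])


if __name__ == "__main__":
    asyncio.run(main())
```

A prediction request is simpler: `POST` the same record JSON to `/model/{ctx_label}/predict/0` and read the returned `records` mapping directly, since that endpoint does not yet support chunked iteration.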