diff --git a/build.cmd b/build.cmd index 95fc02d3..1cf885aa 100644 --- a/build.cmd +++ b/build.cmd @@ -399,7 +399,7 @@ if "%InstallPythonPackages%" == "True" ( call "%PythonExe%" -m pip install --upgrade pyzmq ) else ( call "%PythonExe%" -m pip install --upgrade "azureml-dataprep>=1.1.33" - call "%PythonExe%" -m pip install --upgrade onnxruntime + call "%PythonExe%" -m pip install --upgrade --extra-index-url https://test.pypi.org/simple/ ort-nightly-featurizer ) call "%PythonExe%" -m pip install --upgrade "%__currentScriptDir%target\%WheelFile%" diff --git a/build.sh b/build.sh index 6ec53a75..97764b5a 100755 --- a/build.sh +++ b/build.sh @@ -283,6 +283,7 @@ then echo "#################################" echo "Installing Python packages ... " echo "#################################" + Wheel=${__currentScriptDir}/target/nimbusml-${ProductVersion}-${PythonTag}-none-${PlatName}.whl if [ ! -f ${Wheel} ] then @@ -301,7 +302,7 @@ then fi "${PythonExe}" -m pip install --upgrade "azureml-dataprep>=1.1.33" - "${PythonExe}" -m pip install --upgrade onnxruntime + "${PythonExe}" -m pip install --upgrade --extra-index-url https://test.pypi.org/simple/ ort-nightly-featurizer fi "${PythonExe}" -m pip install --upgrade "${Wheel}" "${PythonExe}" -m pip install "scikit-learn==0.19.2" diff --git a/nuget.config b/nuget.config index 9265d7b5..63e7a3be 100644 --- a/nuget.config +++ b/nuget.config @@ -5,5 +5,6 @@ + diff --git a/src/DotNetBridge/DotNetBridge.csproj b/src/DotNetBridge/DotNetBridge.csproj index 31c27043..21121c02 100644 --- a/src/DotNetBridge/DotNetBridge.csproj +++ b/src/DotNetBridge/DotNetBridge.csproj @@ -32,21 +32,21 @@ all runtime; build; native; contentfiles; analyzers - - - - - - - - + + + + + + + + - - - - - - + + + + + + diff --git a/src/Platforms/build.csproj b/src/Platforms/build.csproj index 8f58c7f0..d4c3dcb1 100644 --- a/src/Platforms/build.csproj +++ b/src/Platforms/build.csproj @@ -21,21 +21,21 @@ - - - - - - - + + + + + + + - - - - - - - + + + + + + + diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj index 2deae4ab..3433a902 100644 --- a/src/python/nimbusml.pyproj +++ b/src/python/nimbusml.pyproj @@ -92,6 +92,10 @@ + + + + @@ -332,8 +336,12 @@ + + + + @@ -422,6 +430,8 @@ + + @@ -468,6 +478,7 @@ + @@ -476,6 +487,7 @@ + @@ -741,14 +753,22 @@ + + + + + + + + @@ -756,6 +776,10 @@ + + + + diff --git a/src/python/nimbusml/examples/ForecastingPivot.py b/src/python/nimbusml/examples/ForecastingPivot.py new file mode 100644 index 00000000..b1592fbf --- /dev/null +++ b/src/python/nimbusml/examples/ForecastingPivot.py @@ -0,0 +1,31 @@ +############################################################################### +# DateTimeSplitter +import pandas as pd +import numpy as np +from nimbusml import FileDataStream, Pipeline +from nimbusml.datasets import get_dataset +from nimbusml.timeseries import ForecastingPivot, RollingWindow + +# data input (as a FileDataStream) +path = get_dataset('infert').as_filepath() +data = FileDataStream.read_csv(path, sep=',', numeric_dtype=np.double) + +# transform usage +xf = RollingWindow(columns={'age_1': 'age'}, + grain_columns=['education'], + window_calculation='Mean', + max_window_size=1, + horizon=1) + +xf1 = ForecastingPivot(columns_to_pivot=['age_1']) + +pipe = Pipeline([xf, xf1]) + +# fit and transform +features = pipe.fit_transform(data) + +features = features.drop(['row_num', 'education', 'parity', 'induced', + 'case', 'spontaneous', 'stratum', 'pooled.stratum'], axis=1) + +# print features +print(features.head(100)) diff --git 
a/src/python/nimbusml/examples/LagLeadOperator.py b/src/python/nimbusml/examples/LagLeadOperator.py
new file mode 100644
index 00000000..9424b2aa
--- /dev/null
+++ b/src/python/nimbusml/examples/LagLeadOperator.py
@@ -0,0 +1,26 @@
+###############################################################################
+# LagLeadOperator
+import pandas as pd
+import numpy as np
+from nimbusml import FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.timeseries import LagLeadOperator
+
+# data input (as a FileDataStream)
+path = get_dataset('infert').as_filepath()
+data = FileDataStream.read_csv(path, sep=',', numeric_dtype=np.double)
+
+# transform usage
+xf = LagLeadOperator(columns={'age_1': 'age'},
+                     grain_columns=['education'],
+                     offsets=[-3, 1],
+                     horizon=1)
+
+# fit and transform
+features = xf.fit_transform(data)
+
+features = features.drop(['row_num', 'education', 'parity', 'induced',
+                          'case', 'spontaneous', 'stratum', 'pooled.stratum'], axis=1)
+
+# print features
+print(features.head(100))
diff --git a/src/python/nimbusml/examples/RollingWindow.py b/src/python/nimbusml/examples/RollingWindow.py
new file mode 100644
index 00000000..8fe9100e
--- /dev/null
+++ b/src/python/nimbusml/examples/RollingWindow.py
@@ -0,0 +1,27 @@
+###############################################################################
+# RollingWindow
+import pandas as pd
+import numpy as np
+from nimbusml import FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.timeseries import RollingWindow
+
+# data input (as a FileDataStream)
+path = get_dataset('infert').as_filepath()
+data = FileDataStream.read_csv(path, sep=',', numeric_dtype=np.double)
+
+# transform usage
+xf = RollingWindow(columns={'age_1': 'age'},
+                   grain_columns=['education'],
+                   window_calculation='Mean',
+                   max_window_size=2,
+                   horizon=2)
+
+# fit and transform
+features = xf.fit_transform(data)
+
+features = features.drop(['row_num', 'education', 'parity', 'induced',
+                          'case', 'spontaneous', 'stratum', 'pooled.stratum'], axis=1)
+
+# print features
+print(features.head(100))
diff --git a/src/python/nimbusml/examples/ShortDrop.py b/src/python/nimbusml/examples/ShortDrop.py
new file mode 100644
index 00000000..dd8882ab
--- /dev/null
+++ b/src/python/nimbusml/examples/ShortDrop.py
@@ -0,0 +1,23 @@
+###############################################################################
+# ShortDrop
+import pandas as pd
+import numpy as np
+from nimbusml import FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.timeseries import ShortDrop
+
+# data input (as a FileDataStream)
+path = get_dataset('infert').as_filepath()
+data = FileDataStream.read_csv(path, sep=',', numeric_dtype=np.double)
+
+# transform usage
+xf = ShortDrop(grain_columns=['education'], min_rows=4294967294) << 'age'
+
+# fit and transform
+features = xf.fit_transform(data)
+
+features = features.drop(['row_num', 'education', 'parity', 'induced',
+                          'case', 'spontaneous', 'stratum', 'pooled.stratum'], axis=1)
+
+# print features
+print(features.head(100))
diff --git a/src/python/nimbusml/internal/core/timeseries/forecastingpivot.py b/src/python/nimbusml/internal/core/timeseries/forecastingpivot.py
new file mode 100644
index 00000000..c306f5bd
--- /dev/null
+++ b/src/python/nimbusml/internal/core/timeseries/forecastingpivot.py
@@ -0,0 +1,55 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+ForecastingPivot
+"""
+
+__all__ = ["ForecastingPivot"]
+
+
+from ...entrypoints.transforms_forecastingpivot import \
+    transforms_forecastingpivot
+from ...utils.utils import trace
+from ..base_pipeline_item import BasePipelineItem, DefaultSignature
+
+
+class ForecastingPivot(BasePipelineItem, DefaultSignature):
+    """
+    **Description**
+        Pivots the input columns and drops any rows with N/A
+
+    :param columns_to_pivot: List of columns to pivot.
+
+    :param horizon_column_name: Name of the horizon column generated.
+
+    :param params: Additional arguments sent to compute engine.
+
+    """
+
+    @trace
+    def __init__(
+            self,
+            columns_to_pivot,
+            horizon_column_name='Horizon',
+            **params):
+        BasePipelineItem.__init__(
+            self, type='transform', **params)
+
+        self.columns_to_pivot = columns_to_pivot
+        self.horizon_column_name = horizon_column_name
+
+    @property
+    def _entrypoint(self):
+        return transforms_forecastingpivot
+
+    @trace
+    def _get_node(self, **all_args):
+        algo_args = dict(
+            columns_to_pivot=self.columns_to_pivot,
+            horizon_column_name=self.horizon_column_name)
+
+        all_args.update(algo_args)
+        return self._entrypoint(**all_args)
diff --git a/src/python/nimbusml/internal/core/timeseries/lagleadoperator.py b/src/python/nimbusml/internal/core/timeseries/lagleadoperator.py
new file mode 100644
index 00000000..991963ef
--- /dev/null
+++ b/src/python/nimbusml/internal/core/timeseries/lagleadoperator.py
@@ -0,0 +1,100 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+LagLeadOperator
+"""
+
+__all__ = ["LagLeadOperator"]
+
+
+from ...entrypoints.transforms_lagleadoperator import \
+    transforms_lagleadoperator
+from ...utils.utils import trace
+from ..base_pipeline_item import BasePipelineItem, DefaultSignature
+
+
+class LagLeadOperator(BasePipelineItem, DefaultSignature):
+    """
+    **Description**
+        Uses the offset list with the horizon to create lags and leads
+
+    :param grain_columns: List of grain columns.
+
+    :param horizon: Maximum horizon value.
+
+    :param offsets: Lag and lead offsets to use. A negative number is a lag,
+        positive is a lead.
+
+    :param params: Additional arguments sent to compute engine.
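+
+    Example (a minimal sketch for illustration; it assumes the public
+    ``nimbusml.timeseries.LagLeadOperator`` wrapper added later in this
+    diff, and that a single offset with ``horizon=1`` keeps the plain
+    output column name, as in ``test_no_lag`` in this PR's tests)::
+
+        import pandas as pd
+        from nimbusml.timeseries import LagLeadOperator
+
+        df = pd.DataFrame(dict(ts=[1.0, 3.0, 5.0, 7.0],
+                               grain=['a', 'a', 'a', 'a']))
+        # offset -1 is a one-step lag of 'ts' within each grain
+        xf = LagLeadOperator(columns={'ts_lag': 'ts'},
+                             grain_columns=['grain'],
+                             offsets=[-1],
+                             horizon=1)
+        print(xf.fit_transform(df))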
+ + """ + + @trace + def __init__( + self, + grain_columns, + offsets, + horizon=0, + **params): + BasePipelineItem.__init__( + self, type='transform', **params) + + self.grain_columns = grain_columns + self.offsets = offsets + self.horizon = horizon + + @property + def _entrypoint(self): + return transforms_lagleadoperator + + @trace + def _get_node(self, **all_args): + + input_columns = self.input + if input_columns is None and 'input' in all_args: + input_columns = all_args['input'] + if 'input' in all_args: + all_args.pop('input') + + output_columns = self.output + if output_columns is None and 'output' in all_args: + output_columns = all_args['output'] + if 'output' in all_args: + all_args.pop('output') + + # validate input + if input_columns is None: + raise ValueError( + "'None' input passed when it cannot be none.") + + if not isinstance(input_columns, list): + raise ValueError( + "input has to be a list of strings, instead got %s" % + type(input_columns)) + + # validate output + if output_columns is None: + output_columns = input_columns + + if not isinstance(output_columns, list): + raise ValueError( + "output has to be a list of strings, instead got %s" % + type(output_columns)) + + algo_args = dict( + column=[ + dict( + Source=i, + Name=o) for i, + o in zip( + input_columns, + output_columns)] if input_columns else None, + grain_columns=self.grain_columns, + offsets=self.offsets, + horizon=self.horizon) + + all_args.update(algo_args) + return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/core/timeseries/rollingwindow.py b/src/python/nimbusml/internal/core/timeseries/rollingwindow.py new file mode 100644 index 00000000..3240bf6a --- /dev/null +++ b/src/python/nimbusml/internal/core/timeseries/rollingwindow.py @@ -0,0 +1,108 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RollingWindow +""" + +__all__ = ["RollingWindow"] + + +from ...entrypoints.transforms_rollingwindow import transforms_rollingwindow +from ...utils.utils import trace +from ..base_pipeline_item import BasePipelineItem, DefaultSignature + + +class RollingWindow(BasePipelineItem, DefaultSignature): + """ + **Description** + Performs a calculation over a rolling timeseries window + + :param grain_columns: List of grain columns. + + :param horizon: Maximum horizon value. + + :param max_window_size: Maximum window size. + + :param min_window_size: Minimum window size. + + :param window_calculation: What window calculation to use. + + :param params: Additional arguments sent to compute engine. 
+ + """ + + @trace + def __init__( + self, + grain_columns, + horizon=0, + max_window_size=0, + min_window_size=1, + window_calculation='0', + **params): + BasePipelineItem.__init__( + self, type='transform', **params) + + self.grain_columns = grain_columns + self.horizon = horizon + self.max_window_size = max_window_size + self.min_window_size = min_window_size + self.window_calculation = window_calculation + + @property + def _entrypoint(self): + return transforms_rollingwindow + + @trace + def _get_node(self, **all_args): + + input_columns = self.input + if input_columns is None and 'input' in all_args: + input_columns = all_args['input'] + if 'input' in all_args: + all_args.pop('input') + + output_columns = self.output + if output_columns is None and 'output' in all_args: + output_columns = all_args['output'] + if 'output' in all_args: + all_args.pop('output') + + # validate input + if input_columns is None: + raise ValueError( + "'None' input passed when it cannot be none.") + + if not isinstance(input_columns, list): + raise ValueError( + "input has to be a list of strings, instead got %s" % + type(input_columns)) + + # validate output + if output_columns is None: + output_columns = input_columns + + if not isinstance(output_columns, list): + raise ValueError( + "output has to be a list of strings, instead got %s" % + type(output_columns)) + + algo_args = dict( + column=[ + dict( + Source=i, + Name=o) for i, + o in zip( + input_columns, + output_columns)] if input_columns else None, + grain_columns=self.grain_columns, + horizon=self.horizon, + max_window_size=self.max_window_size, + min_window_size=self.min_window_size, + window_calculation=self.window_calculation) + + all_args.update(algo_args) + return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/core/timeseries/shortdrop.py b/src/python/nimbusml/internal/core/timeseries/shortdrop.py new file mode 100644 index 00000000..30313d7d --- /dev/null +++ b/src/python/nimbusml/internal/core/timeseries/shortdrop.py @@ -0,0 +1,54 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ShortDrop +""" + +__all__ = ["ShortDrop"] + + +from ...entrypoints.transforms_shortdrop import transforms_shortdrop +from ...utils.utils import trace +from ..base_pipeline_item import BasePipelineItem, DefaultSignature + + +class ShortDrop(BasePipelineItem, DefaultSignature): + """ + **Description** + Drops rows if there aren't enough values per grain. + + :param grain_columns: List of grain columns. + + :param min_rows: Minimum number of values required. + + :param params: Additional arguments sent to compute engine. 
+ + """ + + @trace + def __init__( + self, + grain_columns, + min_rows=0, + **params): + BasePipelineItem.__init__( + self, type='transform', **params) + + self.grain_columns = grain_columns + self.min_rows = min_rows + + @property + def _entrypoint(self): + return transforms_shortdrop + + @trace + def _get_node(self, **all_args): + algo_args = dict( + grain_columns=self.grain_columns, + min_rows=self.min_rows) + + all_args.update(algo_args) + return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/entrypoints/transforms_forecastingpivot.py b/src/python/nimbusml/internal/entrypoints/transforms_forecastingpivot.py new file mode 100644 index 00000000..fee7b30d --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/transforms_forecastingpivot.py @@ -0,0 +1,73 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +Transforms.ForecastingPivot +""" + + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist + + +def transforms_forecastingpivot( + columns_to_pivot, + data, + output_data=None, + model=None, + horizon_column_name='Horizon', + **params): + """ + **Description** + Pivots the input colums and drops any rows with N/A + + :param columns_to_pivot: List of columns to pivot (inputs). + :param horizon_column_name: Name of the horizon column generated. + (inputs). + :param data: Input dataset (inputs). + :param output_data: Transformed dataset (outputs). + :param model: Transform model (outputs). + """ + + entrypoint_name = 'Transforms.ForecastingPivot' + inputs = {} + outputs = {} + + if columns_to_pivot is not None: + inputs['ColumnsToPivot'] = try_set( + obj=columns_to_pivot, + none_acceptable=False, + is_of_type=list, + is_column=True) + if horizon_column_name is not None: + inputs['HorizonColumnName'] = try_set( + obj=horizon_column_name, + none_acceptable=True, + is_of_type=str, + is_column=True) + if data is not None: + inputs['Data'] = try_set( + obj=data, + none_acceptable=False, + is_of_type=str) + if output_data is not None: + outputs['OutputData'] = try_set( + obj=output_data, + none_acceptable=False, + is_of_type=str) + if model is not None: + outputs['Model'] = try_set( + obj=model, + none_acceptable=False, + is_of_type=str) + + input_variables = { + x for x in unlist(inputs.values()) + if isinstance(x, str) and x.startswith("$")} + output_variables = { + x for x in unlist(outputs.values()) + if isinstance(x, str) and x.startswith("$")} + + entrypoint = EntryPoint( + name=entrypoint_name, inputs=inputs, outputs=outputs, + input_variables=input_variables, + output_variables=output_variables) + return entrypoint diff --git a/src/python/nimbusml/internal/entrypoints/transforms_lagleadoperator.py b/src/python/nimbusml/internal/entrypoints/transforms_lagleadoperator.py new file mode 100644 index 00000000..88f63edd --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/transforms_lagleadoperator.py @@ -0,0 +1,89 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +Transforms.LagLeadOperator +""" + +import numbers + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist + + +def transforms_lagleadoperator( + grain_columns, + column, + data, + offsets, + output_data=None, + model=None, + horizon=0, + **params): + """ + **Description** + Uses the offset list with the horizon to create lags and leads + + :param grain_columns: List of grain columns (inputs). + :param column: New column definition (optional form: name:src) + (inputs). 
+ :param data: Input dataset (inputs). + :param horizon: Maximum horizon value (inputs). + :param offsets: Lag and Lead offset to use. A negative number is + a lag, positive is a lead (inputs). + :param output_data: Transformed dataset (outputs). + :param model: Transform model (outputs). + """ + + entrypoint_name = 'Transforms.LagLeadOperator' + inputs = {} + outputs = {} + + if grain_columns is not None: + inputs['GrainColumns'] = try_set( + obj=grain_columns, + none_acceptable=False, + is_of_type=list, + is_column=True) + if column is not None: + inputs['Column'] = try_set( + obj=column, + none_acceptable=False, + is_of_type=list, + is_column=True) + if data is not None: + inputs['Data'] = try_set( + obj=data, + none_acceptable=False, + is_of_type=str) + if horizon is not None: + inputs['Horizon'] = try_set( + obj=horizon, + none_acceptable=False, + is_of_type=numbers.Real) + if offsets is not None: + inputs['Offsets'] = try_set( + obj=offsets, + none_acceptable=False, + is_of_type=list) + if output_data is not None: + outputs['OutputData'] = try_set( + obj=output_data, + none_acceptable=False, + is_of_type=str) + if model is not None: + outputs['Model'] = try_set( + obj=model, + none_acceptable=False, + is_of_type=str) + + input_variables = { + x for x in unlist(inputs.values()) + if isinstance(x, str) and x.startswith("$")} + output_variables = { + x for x in unlist(outputs.values()) + if isinstance(x, str) and x.startswith("$")} + + entrypoint = EntryPoint( + name=entrypoint_name, inputs=inputs, outputs=outputs, + input_variables=input_variables, + output_variables=output_variables) + return entrypoint diff --git a/src/python/nimbusml/internal/entrypoints/transforms_rollingwindow.py b/src/python/nimbusml/internal/entrypoints/transforms_rollingwindow.py new file mode 100644 index 00000000..fb1eec3a --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/transforms_rollingwindow.py @@ -0,0 +1,107 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +Transforms.RollingWindow +""" + +import numbers + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist + + +def transforms_rollingwindow( + grain_columns, + column, + data, + output_data=None, + model=None, + horizon=0, + max_window_size=0, + min_window_size=1, + window_calculation='0', + **params): + """ + **Description** + Performs a calculation over a rolling timeseries window + + :param grain_columns: List of grain columns (inputs). + :param column: New column definition (optional form: name:src) + (inputs). + :param data: Input dataset (inputs). + :param horizon: Maximum horizon value (inputs). + :param max_window_size: Maximum window size (inputs). + :param min_window_size: Minimum window size (inputs). + :param window_calculation: What window calculation to use + (inputs). + :param output_data: Transformed dataset (outputs). + :param model: Transform model (outputs). 
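+
+    Example (a minimal sketch; the ``$``-prefixed names are illustrative
+    graph variables, not values required by the entrypoint)::
+
+        node = transforms_rollingwindow(
+            grain_columns=['grain'],
+            column=[dict(Source='ts', Name='ts_mean')],
+            data='$data',
+            output_data='$output_data',
+            model='$model',
+            window_calculation='Mean',
+            max_window_size=2,
+            horizon=2)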
+ """ + + entrypoint_name = 'Transforms.RollingWindow' + inputs = {} + outputs = {} + + if grain_columns is not None: + inputs['GrainColumns'] = try_set( + obj=grain_columns, + none_acceptable=False, + is_of_type=list, + is_column=True) + if column is not None: + inputs['Column'] = try_set( + obj=column, + none_acceptable=False, + is_of_type=list, + is_column=True) + if data is not None: + inputs['Data'] = try_set( + obj=data, + none_acceptable=False, + is_of_type=str) + if horizon is not None: + inputs['Horizon'] = try_set( + obj=horizon, + none_acceptable=False, + is_of_type=numbers.Real) + if max_window_size is not None: + inputs['MaxWindowSize'] = try_set( + obj=max_window_size, + none_acceptable=False, + is_of_type=numbers.Real) + if min_window_size is not None: + inputs['MinWindowSize'] = try_set( + obj=min_window_size, + none_acceptable=False, + is_of_type=numbers.Real) + if window_calculation is not None: + inputs['WindowCalculation'] = try_set( + obj=window_calculation, + none_acceptable=False, + is_of_type=str, + values=[ + 'Mean', + 'Min', + 'Max']) + if output_data is not None: + outputs['OutputData'] = try_set( + obj=output_data, + none_acceptable=False, + is_of_type=str) + if model is not None: + outputs['Model'] = try_set( + obj=model, + none_acceptable=False, + is_of_type=str) + + input_variables = { + x for x in unlist(inputs.values()) + if isinstance(x, str) and x.startswith("$")} + output_variables = { + x for x in unlist(outputs.values()) + if isinstance(x, str) and x.startswith("$")} + + entrypoint = EntryPoint( + name=entrypoint_name, inputs=inputs, outputs=outputs, + input_variables=input_variables, + output_variables=output_variables) + return entrypoint diff --git a/src/python/nimbusml/internal/entrypoints/transforms_shortdrop.py b/src/python/nimbusml/internal/entrypoints/transforms_shortdrop.py new file mode 100644 index 00000000..8be2424d --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/transforms_shortdrop.py @@ -0,0 +1,72 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +Transforms.ShortDrop +""" + +import numbers + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist + + +def transforms_shortdrop( + grain_columns, + data, + output_data=None, + model=None, + min_rows=0, + **params): + """ + **Description** + Drops rows if there aren't enough values per grain. + + :param grain_columns: List of grain columns (inputs). + :param min_rows: Minimum number of values required (inputs). + :param data: Input dataset (inputs). + :param output_data: Transformed dataset (outputs). + :param model: Transform model (outputs). 
+ """ + + entrypoint_name = 'Transforms.ShortDrop' + inputs = {} + outputs = {} + + if grain_columns is not None: + inputs['GrainColumns'] = try_set( + obj=grain_columns, + none_acceptable=False, + is_of_type=list, + is_column=True) + if min_rows is not None: + inputs['MinRows'] = try_set( + obj=min_rows, + none_acceptable=False, + is_of_type=numbers.Real) + if data is not None: + inputs['Data'] = try_set( + obj=data, + none_acceptable=False, + is_of_type=str) + if output_data is not None: + outputs['OutputData'] = try_set( + obj=output_data, + none_acceptable=False, + is_of_type=str) + if model is not None: + outputs['Model'] = try_set( + obj=model, + none_acceptable=False, + is_of_type=str) + + input_variables = { + x for x in unlist(inputs.values()) + if isinstance(x, str) and x.startswith("$")} + output_variables = { + x for x in unlist(outputs.values()) + if isinstance(x, str) and x.startswith("$")} + + entrypoint = EntryPoint( + name=entrypoint_name, inputs=inputs, outputs=outputs, + input_variables=input_variables, + output_variables=output_variables) + return entrypoint diff --git a/src/python/nimbusml/tests/timeseries/test_forecastingpivot.py b/src/python/nimbusml/tests/timeseries/test_forecastingpivot.py new file mode 100644 index 00000000..cde9f3ce --- /dev/null +++ b/src/python/nimbusml/tests/timeseries/test_forecastingpivot.py @@ -0,0 +1,40 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- + +import platform +import unittest + +import numpy as np +import pandas as pd +from nimbusml import Pipeline +from nimbusml.timeseries import ForecastingPivot, RollingWindow + +# BUGS +# Removes NaN values? Record 0 is removed +@unittest.skipIf('centos' in platform.linux_distribution()[0].lower(), "centos is not supported") +class TestForecastingPivot(unittest.TestCase): + + def test_simple_pivot(self): + + df = pd.DataFrame(data=dict( + ts=[1.0, 3.0, 5.0, 7.0], + grain=['1970', '1970', '1970', '1970'], + )) + + rw = RollingWindow(columns={'ts_r': 'ts'}, + grain_columns=['grain'], + window_calculation='Mean', + max_window_size=1, + horizon=1) + + xf1 = ForecastingPivot(columns_to_pivot=['ts_r']) + + pipe = Pipeline([rw, xf1]) + + result = pipe.fit_transform(df) + + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/tests/timeseries/test_lagleadoperator.py b/src/python/nimbusml/tests/timeseries/test_lagleadoperator.py new file mode 100644 index 00000000..f0609c1b --- /dev/null +++ b/src/python/nimbusml/tests/timeseries/test_lagleadoperator.py @@ -0,0 +1,86 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- + +import platform +import unittest + +import math +import numpy as np +import pandas as pd +from nimbusml.timeseries import LagLeadOperator + + +@unittest.skipIf('centos' in platform.linux_distribution()[0].lower(), "centos is not supported") +class TestLagLeadOperator(unittest.TestCase): + + def test_no_lag(self): + + df = pd.DataFrame(data=dict( + ts=[1.0, 3.0, 5.0, 7.0], + grain=['1970', '1970', '1970', '1970'], + )) + + ll = LagLeadOperator(columns={'ts_r': 'ts'}, + grain_columns=['grain'], + offsets=[0], + horizon=1) + + result = ll.fit_transform(df) + + self.assertEqual(result.loc[0, 'ts_r'], 1) + self.assertEqual(result.loc[1, 'ts_r'], 3) + self.assertEqual(result.loc[2, 'ts_r'], 5) + self.assertEqual(result.loc[3, 'ts_r'], 7) + + def test_simple_horizon(self): + + df = pd.DataFrame(data=dict( + ts=[1.0, 3.0, 5.0, 7.0], + grain=['1970', '1970', '1970', '1970'], + )) + + ll = LagLeadOperator(columns={'ts_r': 'ts'}, + grain_columns=['grain'], + offsets=[0], + horizon=2) + + result = ll.fit_transform(df) + + self.assertTrue(math.isnan(result.loc[0, 'ts_r.0'])) + self.assertEqual(result.loc[1, 'ts_r.0'], 1) + self.assertEqual(result.loc[2, 'ts_r.0'], 3) + self.assertEqual(result.loc[3, 'ts_r.0'], 5) + + self.assertEqual(result.loc[0, 'ts_r.1'], 1) + self.assertEqual(result.loc[1, 'ts_r.1'], 3) + self.assertEqual(result.loc[2, 'ts_r.1'], 5) + self.assertEqual(result.loc[3, 'ts_r.1'], 7) + + def test_simple_lag(self): + + df = pd.DataFrame(data=dict( + ts=[1.0, 3.0, 5.0, 7.0], + grain=['1970', '1970', '1970', '1970'], + )) + + ll = LagLeadOperator(columns={'ts_r': 'ts'}, + grain_columns=['grain'], + offsets=[-1, 1], + horizon=1) + + result = ll.fit_transform(df) + + self.assertTrue(math.isnan(result.loc[0, 'ts_r.0'])) + self.assertEqual(result.loc[1, 'ts_r.0'], 1) + self.assertEqual(result.loc[2, 'ts_r.0'], 3) + self.assertEqual(result.loc[3, 'ts_r.0'], 5) + + self.assertEqual(result.loc[0, 'ts_r.1'], 3) + self.assertEqual(result.loc[1, 'ts_r.1'], 5) + self.assertEqual(result.loc[2, 'ts_r.1'], 7) + self.assertTrue(math.isnan(result.loc[3, 'ts_r.1'])) + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/tests/timeseries/test_rollingwindow.py b/src/python/nimbusml/tests/timeseries/test_rollingwindow.py new file mode 100644 index 00000000..c64962f1 --- /dev/null +++ b/src/python/nimbusml/tests/timeseries/test_rollingwindow.py @@ -0,0 +1,65 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- + +import platform +import unittest + +import math +import numpy as np +import pandas as pd +from nimbusml.timeseries import RollingWindow + + +# BUGS +# Grain is only string? 
Fix the error message as in ShortDrop
+# Horizon predictions are not in correct order, see above
+@unittest.skipIf('centos' in platform.linux_distribution()[0].lower(), "centos is not supported")
+class TestRollingWindow(unittest.TestCase):
+
+    def test_simple_rolling_window(self):
+
+        df = pd.DataFrame(data=dict(
+            ts=[1.0, 3.0, 5.0, 7.0],
+            grain=['1970', '1970', '1970', '1970'],
+        ))
+
+        rw = RollingWindow(columns={'ts_r': 'ts'},
+                           grain_columns=['grain'],
+                           window_calculation='Mean',
+                           max_window_size=1,
+                           horizon=2)
+        result = rw.fit_transform(df)
+
+        self.assertTrue(math.isnan(result.loc[0, 'ts_r.1']))
+        self.assertEqual(result.loc[1, 'ts_r.1'], 1)
+        self.assertEqual(result.loc[2, 'ts_r.1'], 3)
+        self.assertEqual(result.loc[3, 'ts_r.1'], 5)
+
+    def test_simple_rolling_window2(self):
+
+        df = pd.DataFrame(data=dict(
+            ts=[1.0, 3.0, 5.0, 7.0],
+            grain=['1970', '1970', '1970', '1970'],
+        ))
+
+        rw = RollingWindow(columns={'ts_r': 'ts'},
+                           grain_columns=['grain'],
+                           window_calculation='Mean',
+                           max_window_size=2,
+                           horizon=2)
+        result = rw.fit_transform(df)
+
+        self.assertTrue(math.isnan(result.loc[0, 'ts_r.0']))
+        self.assertTrue(math.isnan(result.loc[1, 'ts_r.0']))
+        self.assertEqual(result.loc[2, 'ts_r.0'], 1)
+        self.assertEqual(result.loc[3, 'ts_r.0'], 2)
+
+        self.assertTrue(math.isnan(result.loc[0, 'ts_r.1']))
+        self.assertEqual(result.loc[1, 'ts_r.1'], 1)
+        self.assertEqual(result.loc[2, 'ts_r.1'], 2)
+        self.assertEqual(result.loc[3, 'ts_r.1'], 4)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/src/python/nimbusml/tests/timeseries/test_shortdrop.py b/src/python/nimbusml/tests/timeseries/test_shortdrop.py
new file mode 100644
index 00000000..85fed23b
--- /dev/null
+++ b/src/python/nimbusml/tests/timeseries/test_shortdrop.py
@@ -0,0 +1,40 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+
+import platform
+import unittest
+
+import numpy as np
+import pandas as pd
+from nimbusml.timeseries import ShortDrop
+
+
+@unittest.skipIf('centos' in platform.linux_distribution()[0].lower(), "centos is not supported")
+class TestShortDrop(unittest.TestCase):
+
+    def test_no_drops(self):
+
+        df = pd.DataFrame(data=dict(
+            ts=[1.0, 3.0, 5.0, 7.0],
+            grain=['1970', '1970', '1970', '1970'],
+        ))
+
+        sd = ShortDrop(grain_columns=['grain'], min_rows=4) << 'ts'
+        result = sd.fit_transform(df)
+        pd.testing.assert_frame_equal(result, df)
+
+    def test_drop_all(self):
+
+        df = pd.DataFrame(data=dict(
+            ts=[1.0, 3.0, 5.0, 7.0],
+            grain=['1970', '1970', '1970', '1970'],
+        ))
+
+        sd = ShortDrop(grain_columns=['grain'], min_rows=100) << 'ts'
+        result = sd.fit_transform(df)
+        self.assertEqual(len(result), 0)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/src/python/nimbusml/timeseries/__init__.py b/src/python/nimbusml/timeseries/__init__.py
index 05dbfa3c..6476cbc2 100644
--- a/src/python/nimbusml/timeseries/__init__.py
+++ b/src/python/nimbusml/timeseries/__init__.py
@@ -4,6 +4,10 @@
 from .ssachangepointdetector import SsaChangePointDetector
 from .ssaforecaster import SsaForecaster
 from .timeseriesimputer import TimeSeriesImputer
+from .rollingwindow import RollingWindow
+from .shortdrop import ShortDrop
+from .lagleadoperator import LagLeadOperator
+from .forecastingpivot import ForecastingPivot
 
 __all__ = [
     'IidSpikeDetector',
@@ -11,5 +15,9 @@
     'SsaSpikeDetector',
     'SsaChangePointDetector',
     'SsaForecaster',
-    'TimeSeriesImputer'
+    'TimeSeriesImputer',
+    'RollingWindow',
+    'ShortDrop',
+    'LagLeadOperator',
+    'ForecastingPivot',
 ]
diff --git a/src/python/nimbusml/timeseries/forecastingpivot.py b/src/python/nimbusml/timeseries/forecastingpivot.py
new file mode 100644
index 00000000..c01cddba
--- /dev/null
+++ b/src/python/nimbusml/timeseries/forecastingpivot.py
@@ -0,0 +1,58 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+ForecastingPivot
+"""
+
+__all__ = ["ForecastingPivot"]
+
+
+from sklearn.base import TransformerMixin
+
+from ..base_transform import BaseTransform
+from ..internal.core.timeseries.forecastingpivot import \
+    ForecastingPivot as core
+from ..internal.utils.utils import trace
+
+
+class ForecastingPivot(core, BaseTransform, TransformerMixin):
+    """
+    **Description**
+        Pivots the input columns and drops any rows with N/A
+
+    :param columns: see `Columns `_.
+
+    :param columns_to_pivot: List of columns to pivot.
+
+    :param horizon_column_name: Name of the horizon column generated.
+
+    :param params: Additional arguments sent to compute engine.
+
+    """
+
+    @trace
+    def __init__(
+            self,
+            columns_to_pivot,
+            horizon_column_name='Horizon',
+            columns=None,
+            **params):
+
+        if columns:
+            params['columns'] = columns
+        BaseTransform.__init__(self, **params)
+        core.__init__(
+            self,
+            columns_to_pivot=columns_to_pivot,
+            horizon_column_name=horizon_column_name,
+            **params)
+        self._columns = columns
+
+    def get_params(self, deep=False):
+        """
+        Get the parameters for this operator.
+ """ + return core.get_params(self) diff --git a/src/python/nimbusml/timeseries/lagleadoperator.py b/src/python/nimbusml/timeseries/lagleadoperator.py new file mode 100644 index 00000000..6a524114 --- /dev/null +++ b/src/python/nimbusml/timeseries/lagleadoperator.py @@ -0,0 +1,62 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +LagLeadOperator +""" + +__all__ = ["LagLeadOperator"] + + +from sklearn.base import TransformerMixin + +from ..base_transform import BaseTransform +from ..internal.core.timeseries.lagleadoperator import LagLeadOperator as core +from ..internal.utils.utils import trace + + +class LagLeadOperator(core, BaseTransform, TransformerMixin): + """ + **Description** + Uses the offset list with the horizon to create lags and leads + + :param columns: see `Columns `_. + + :param grain_columns: List of grain columns. + + :param horizon: Maximum horizon value. + + :param offsets: Lag and Lead offset to use. A negative number is a lag, + positive is a lead. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + grain_columns, + offsets, + horizon=0, + columns=None, + **params): + + if columns: + params['columns'] = columns + BaseTransform.__init__(self, **params) + core.__init__( + self, + grain_columns=grain_columns, + offsets=offsets, + horizon=horizon, + **params) + self._columns = columns + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/timeseries/rollingwindow.py b/src/python/nimbusml/timeseries/rollingwindow.py new file mode 100644 index 00000000..d1f228f6 --- /dev/null +++ b/src/python/nimbusml/timeseries/rollingwindow.py @@ -0,0 +1,69 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RollingWindow +""" + +__all__ = ["RollingWindow"] + + +from sklearn.base import TransformerMixin + +from ..base_transform import BaseTransform +from ..internal.core.timeseries.rollingwindow import RollingWindow as core +from ..internal.utils.utils import trace + + +class RollingWindow(core, BaseTransform, TransformerMixin): + """ + **Description** + Performs a calculation over a rolling timeseries window + + :param columns: see `Columns `_. + + :param grain_columns: List of grain columns. + + :param horizon: Maximum horizon value. + + :param max_window_size: Maximum window size. + + :param min_window_size: Minimum window size. + + :param window_calculation: What window calculation to use. + + :param params: Additional arguments sent to compute engine. 
+ + """ + + @trace + def __init__( + self, + grain_columns, + horizon=0, + max_window_size=0, + min_window_size=1, + window_calculation='0', + columns=None, + **params): + + if columns: + params['columns'] = columns + BaseTransform.__init__(self, **params) + core.__init__( + self, + grain_columns=grain_columns, + horizon=horizon, + max_window_size=max_window_size, + min_window_size=min_window_size, + window_calculation=window_calculation, + **params) + self._columns = columns + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/timeseries/shortdrop.py b/src/python/nimbusml/timeseries/shortdrop.py new file mode 100644 index 00000000..c605ac0c --- /dev/null +++ b/src/python/nimbusml/timeseries/shortdrop.py @@ -0,0 +1,57 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ShortDrop +""" + +__all__ = ["ShortDrop"] + + +from sklearn.base import TransformerMixin + +from ..base_transform import BaseTransform +from ..internal.core.timeseries.shortdrop import ShortDrop as core +from ..internal.utils.utils import trace + + +class ShortDrop(core, BaseTransform, TransformerMixin): + """ + **Description** + Drops rows if there aren't enough values per grain. + + :param columns: see `Columns `_. + + :param grain_columns: List of grain columns. + + :param min_rows: Minimum number of values required. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + grain_columns, + min_rows=0, + columns=None, + **params): + + if columns: + params['columns'] = columns + BaseTransform.__init__(self, **params) + core.__init__( + self, + grain_columns=grain_columns, + min_rows=min_rows, + **params) + self._columns = columns + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/tests/test_estimator_checks.py b/src/python/tests/test_estimator_checks.py index a9372ad3..3b0070af 100644 --- a/src/python/tests/test_estimator_checks.py +++ b/src/python/tests/test_estimator_checks.py @@ -201,6 +201,12 @@ 'check_estimators_pickle') OMITTED_CHECKS_ALWAYS = 'check_estimators_nan_inf' +OMITTED_CHECKS_CLASS_ALWAYS = [ + 'RobustScaler', + 'LagLeadOperator', + 'ForecastingPivot', + 'RollingWindow', + 'ShortDrop'] NOBINARY_CHECKS = [ 'check_estimator_sparse_data', @@ -323,8 +329,10 @@ def method(self): failed_checks = set() passed_checks = set() class_name = epoint[1] - print("\n======== now Estimator is %s =========== " % class_name) + if class_name in OMITTED_CHECKS_CLASS_ALWAYS: + return + print("\n======== now Estimator is %s =========== " % class_name) mod = __import__('nimbusml.' + epoint[0], fromlist=[str(class_name)]) the_class = getattr(mod, class_name) if class_name in INSTANCES: diff --git a/src/python/tests_extended/test_dft_based.py b/src/python/tests_extended/test_dft_based.py new file mode 100644 index 00000000..5cc5d59d --- /dev/null +++ b/src/python/tests_extended/test_dft_based.py @@ -0,0 +1,395 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. 
+# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- + +import os +import sys +import io +import platform +import tempfile +import unittest + +import numpy as np +import pandas as pd +from nimbusml import Pipeline +from nimbusml.preprocessing.schema import ColumnSelector +from nimbusml.preprocessing import ToString, ToKeyImputer, DateTimeSplitter +from scipy.sparse import csr_matrix +from nimbusml.timeseries import TimeSeriesImputer, LagLeadOperator, RollingWindow, ForecastingPivot, ShortDrop +from nimbusml.preprocessing import (TensorFlowScorer, FromKey, ToKey, + DateTimeSplitter, OnnxRunner) +import onnxruntime as rt +from data_frame_tool import DataFrameTool as DFT + +TEST_CASES = { + 'DateTimeSplitter_Simple', + 'DateTimeSplitter_Complex', + 'DateTimeSplitter_Canada_1day_before_christmas', + 'DateTimeSplitter_Czech_non_english_holiday', + 'ToKey_SimpleFloat', + 'ToKey_SimpleDouble', + 'ToKey_SimpleString', + 'ToKey_2col_Double', + 'ToKey_2col_Double_String', + 'ToString_Numbers', + 'ToString_Other_Types', + 'TimeSeriesImputer_1grain_2gap', + 'TimeSeriesImputer_1grain_2gap_backfill', + 'TimeSeriesImputer_1grain_2gap_medianfill', + 'TimeSeriesImputer_1grain_2gap', + 'TimeSeriesImputer_1grain_1gap_2filtercolumn', + 'TimeSeriesImputer_2grain_nogap', + 'ShortGrainDropper', + 'RollingWin_Pivot_Integration', + 'Laglead_Pivot_Integration', + 'Big_Test1', + 'Big_Test2' +} + +INSTANCES = { + 'DateTimeSplitter_Simple': Pipeline([ + DateTimeSplitter(prefix='dt') << 'tokens1', + ColumnSelector(drop_columns=[ + 'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter', + 'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear', + 'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel', + 'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff','dtHolidayName' + ]) + ]), + 'DateTimeSplitter_Complex' : Pipeline([ + DateTimeSplitter(prefix='dt') << 'tokens1', + ColumnSelector(drop_columns=[ + 'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter', + 'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear', + 'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel', + 'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff','dtHolidayName' + ]) + ]), + 'DateTimeSplitter_Canada_1day_before_christmas' : DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1', + 'DateTimeSplitter_Czech_non_english_holiday' : DateTimeSplitter(prefix='dt', country='Czech') << 'tokens1', + 'ToKey_SimpleFloat': ToKeyImputer(), + 'ToKey_SimpleDouble': ToKeyImputer(), + 'ToKey_SimpleString': ToKeyImputer(), + 'ToKey_2col_Double': ToKeyImputer(), + 'ToKey_2col_Double_String': ToKeyImputer(), + 'ToString_Numbers': ToString(columns={'f0.out': 'f0', + 'f1.out': 'f1', + 'f2.out': 'f2'}), + 'ToString_Other_Types': ToString(columns={'f0.out': 'f0', + 'f1.out': 'f1'}), + 'TimeSeriesImputer_1grain_2gap': TimeSeriesImputer(time_series_column='ts', + filter_columns=['c'], + grain_columns=['grain'], + impute_mode='ForwardFill', + filter_mode='Include'), + 'TimeSeriesImputer_1grain_2gap_backfill': TimeSeriesImputer(time_series_column='ts', + filter_columns=['c'], + grain_columns=['grain'], + impute_mode='BackFill', + filter_mode='Include'), + 'TimeSeriesImputer_1grain_2gap_medianfill': TimeSeriesImputer(time_series_column='ts', + filter_columns=['c'], + grain_columns=['grain'], + impute_mode='Median', + filter_mode='Include'), + 'TimeSeriesImputer_1grain_1gap_2filtercolumn': TimeSeriesImputer(time_series_column='ts', + grain_columns=['grain'], + filter_columns=['c3', 'c4'], + impute_mode='ForwardFill', + 
filter_mode='Include'), + 'TimeSeriesImputer_2grain_nogap': TimeSeriesImputer(time_series_column='ts', + filter_columns=['c'], + grain_columns=['grain'], + impute_mode='ForwardFill', + filter_mode='Include'), + 'ShortGrainDropper': ShortDrop(columns={'colA1': 'colA'}, + grain_columns=['grainA'], + min_rows=2), + # For RollingWindow with horizon as 2, max_window_size as 2 and we are calculating Mean, assume time frequency is 1 day + # output for each row would be [[mean of (2DaysBeforeYesterday and DayBeforeYesterday), mean of (DayBeforeYesterday and Yesterday)]] + # forecasting pivot will spread this 2d vector out and drop rows that have NaNs in it + 'RollingWin_Pivot_Integration': Pipeline([ + RollingWindow(columns={'colA1': 'colA'}, + grain_columns=['grainA'], + window_calculation='Mean', + max_window_size=2, + horizon=2), + ForecastingPivot(columns_to_pivot=['colA1']) + ]), + # For LagLeadOperator with horizon as 2 and offsets as [-2, -1], assume time frequency is 1 day + # output for each row would be [[2DaysBeforeYesterday, DayBeforeYesterday], + # [DayBeforeYesterday, Yesterday]] + # forecasting pivot will spread this 2d vector out and drop rows that have NaNs in it + 'Laglead_Pivot_Integration': Pipeline([ + LagLeadOperator(columns={'colA1': 'colA'}, + grain_columns=['grainA'], + offsets=[-2, -1], + horizon=2), + ForecastingPivot(columns_to_pivot=['colA1']) + ]), + 'Big_Test1': Pipeline([ + TimeSeriesImputer(time_series_column='ts', + filter_columns=['c', 'grain'], + grain_columns=['grain'], + impute_mode='ForwardFill', + filter_mode='Include'), + DateTimeSplitter(prefix='dt') << 'ts', + LagLeadOperator(columns={'c1': 'c'}, + grain_columns=['dtMonthLabel'], + offsets=[-2, -1], + horizon=1), + ForecastingPivot(columns_to_pivot=['c1']), + ColumnSelector(drop_columns=['dtHolidayName']) + ]), + 'Big_Test2': Pipeline([ + TimeSeriesImputer(time_series_column='ts', + filter_columns=['c', 'grain'], + grain_columns=['grain'], + impute_mode='ForwardFill', + filter_mode='Include'), + DateTimeSplitter(prefix='dt', country = 'Canada') << 'ts', + RollingWindow(columns={'c1': 'c'}, + grain_columns=['grain'], + window_calculation='Mean', + max_window_size=2, + horizon=2), + ForecastingPivot(columns_to_pivot=['c1']) + ]) +} + +DATASETS = { + 'DateTimeSplitter_Simple': pd.DataFrame(data=dict( + tokens1=[1, 2, 3, 157161600] + )), + 'DateTimeSplitter_Complex': pd.DataFrame(data=dict( + tokens1=[217081624, 1751241600, 217081625, 32445842582] + )), + 'DateTimeSplitter_Canada_1day_before_christmas': pd.DataFrame(data=dict( + tokens1=[157161599] + )), + 'DateTimeSplitter_Czech_non_english_holiday': pd.DataFrame(data=dict( + tokens1=[3911760000, 3834432000, 3985200000] + )), + 'ToKey_SimpleFloat': pd.DataFrame(data=dict( + target=[1.0, 1.0, 1.0, 2.0] + )).astype({'target': np.float64}), + 'ToKey_SimpleDouble': pd.DataFrame(data=dict( + target=[1.0, 1.0, 1.0, 2.0] + )).astype({'target': np.double}), + 'ToKey_SimpleString': pd.DataFrame(data=dict( + target=["one", "one", "one", "two"] + )), + 'ToKey_2col_Double': pd.DataFrame(data=dict( + data1=[1.0, 1.0, 1.0, 2.0], + data2=[2.0, 2.0, 2.0, 3.0], + )).astype({'data1': np.double, + 'data1': np.double}), + 'ToKey_2col_Double_String': pd.DataFrame(data=dict( + data1=[1.0, 1.0, 1.0, 2.0], + data2=["two", "two", "three", "two"], + )), + 'ToString_Numbers': pd.DataFrame(data=dict( + f0= [4, 4, -1, 9], + f1= [5, 5, 3.1, -0.23], + f2= [6, 6.7, np.nan, np.nan] + )), + 'ToString_Other_Types': pd.DataFrame(data=dict( + f0= [True, False], + f1= [123.45, 
135453984983490.5473] + )).astype({'f0': bool, + 'f1': np.double}), + 'TimeSeriesImputer_1grain_2gap': pd.DataFrame(data=dict( + ts=[1, 2, 3, 5, 7], + grain=[1970, 1970, 1970, 1970, 1970], + c=[10, 11, 12, 13, 14] + )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.int32}), + 'TimeSeriesImputer_1grain_2gap_backfill': pd.DataFrame(data=dict( + ts=[1, 2, 3, 5, 7], + grain=[1970, 1970, 1970, 1970, 1970], + c=[10, 11, 12, 13, 14] + )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.int32}), + 'TimeSeriesImputer_1grain_2gap_medianfill': pd.DataFrame(data=dict( + ts=[1, 2, 3, 5, 7], + grain=[1970, 1970, 1970, 1970, 1970], + c=[10, 11, 12, 13, 14] + )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.double}), + 'TimeSeriesImputer_1grain_1gap_2filtercolumn': pd.DataFrame(data=dict( + ts=[1, 2, 3, 5], + grain=[1970, 1970, 1970, 1970], + c3=[10, 13, 15, 20], + c4=[19, 12, 16, 19] + )).astype({'ts': np.int64, 'grain': np.int32, 'c3': np.int32, 'c4': np.int32}), + 'TimeSeriesImputer_2grain_nogap': pd.DataFrame(data=dict( + ts=[1, 5, 2, 6], + grain=[1970, 1971, 1970, 1971], + c=[10, 11, 12, 13] + )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.int32}), + 'ShortGrainDropper': pd.DataFrame(data=dict( + colA=[1.0, 2.0, 3.0, 4.0], + grainA=["one", "one", "one", "two"] + )), + 'RollingWin_Pivot_Integration': pd.DataFrame(data=dict( + colA=[1.0, 2.0, 3.0, 4.0], + grainA=["one", "one", "one", "one"] + )), + 'Laglead_Pivot_Integration': pd.DataFrame(data=dict( + colA=[1.0, 2.0, 3.0, 4.0], + grainA=["one", "one", "one", "one"] + )), + 'Big_Test1': pd.DataFrame(data=dict( + ts=[217081624, 217081625, 217081627, 217081629], + grain=[1970, 1970, 1970, 1970], + c=[10, 11, 12, 13] + )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.double}), + 'Big_Test2': pd.DataFrame(data=dict( + ts=[0, 86400, 172800], + grain=[1970, 1970, 1970], + c=[10, 11, 12] + )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.double}) +} + +def get_file_size(file_path): + file_size = 0 + try: + file_size = os.path.getsize(file_path) + except: + pass + return file_size + +def get_tmp_file(suffix=None): + fd, file_name = tempfile.mkstemp(suffix=suffix) + fl = os.fdopen(fd, 'w') + fl.close() + return file_name + +class CaptureOutputContext(): + """ + Context which can be used for + capturing stdout and stderr. 
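+    Usage (a minimal sketch)::
+
+        with CaptureOutputContext() as output:
+            print('hello')
+        # after the block, the captured text is on output.stdout
+        assert 'hello' in output.stdout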
+ """ + def __enter__(self): + self.orig_stdout = sys.stdout + self.orig_stderr = sys.stderr + self.stdout_capturer = io.StringIO() + self.stderr_capturer = io.StringIO() + sys.stdout = self.stdout_capturer + sys.stderr = self.stderr_capturer + return self + + def __exit__(self, *args): + sys.stdout = self.orig_stdout + sys.stderr = self.orig_stderr + self.stdout = self.stdout_capturer.getvalue() + self.stderr = self.stderr_capturer.getvalue() + + if self.stdout: + print(self.stdout) + + if self.stderr: + print(self.stderr) + + # free up some memory + del self.stdout_capturer + del self.stderr_capturer + +def validate_results(result_mlnet, result_ort): + + if len(result_ort.columns) != len(result_mlnet.columns): + raise RuntimeError("ERROR: The ORT output does not contain the same number of columns as ML.NET.") + col_tuples = list(zip(result_mlnet.columns[0:], + result_ort.columns[0:])) + for col_tuple in col_tuples: + try: + col_mlnet = result_mlnet.loc[:, col_tuple[0]] + col_ort = result_ort.loc[:, col_tuple[1]] + check_kwargs = { + 'check_names': False, + 'check_exact': True, + 'check_dtype': True, + 'check_less_precise': True + } + pd.testing.assert_series_equal(col_mlnet, col_ort, **check_kwargs) + except Exception as e: + print(e) + raise RuntimeError("ERROR: OnnxRunner result does not match ML.NET result.") + return True + +def export_to_onnx(estimator, test_case): + """ + Fit and test an estimator and determine + if it supports exporting to the ONNX format. + """ + onnx_path = get_tmp_file('.onnx') + onnx_json_path = get_tmp_file('.onnx.json') + + output = None + exported = False + export_valid = False + + try: + dataset = DATASETS.get(test_case) + + result_mlnet = estimator.fit_transform(dataset) + + with CaptureOutputContext() as output: + estimator.export_to_onnx(onnx_path, + 'com.microsoft.ml', + dst_json=onnx_json_path, + onnx_version='Stable') + except Exception as e: + print(e) + + onnx_file_size = get_file_size(onnx_path) + onnx_json_file_size = get_file_size(onnx_json_path) + + if (output and + (onnx_file_size != 0) and + (onnx_json_file_size != 0) and + (not 'cannot save itself as ONNX' in output.stdout) and + (not 'Warning: We do not know how to save the predictor as ONNX' in output.stdout)): + + exported = True + + try: + df_tool = DFT(onnx_path) + result_ort = df_tool.execute(dataset, []) + + export_valid = validate_results(result_mlnet, result_ort) + except Exception as e: + print(e) + + os.remove(onnx_path) + os.remove(onnx_json_path) + return {'exported': exported, 'export_valid': export_valid} + +class TestOnnxExport(unittest.TestCase): + + # This method is a static method of the class + # because there were pytest fixture related + # issues when the method was in the global scope. + @staticmethod + def generate_test_method(test_case): + def method(self): + estimator = INSTANCES[test_case] + + result = export_to_onnx(estimator, test_case) + assert result['exported'] + assert result['export_valid'] + + return method + +for test_case in TEST_CASES: + test_name = 'test_%s' % test_case.replace('(', '_').replace(')', '').lower() + + # The following test for far future time point. On Windows it's treated correctly as expected + # but for other OS, system_clock::time_point is defined as nanoseconds (64-bit), + # which rolls over somewhere around 2260. 
+    if test_case == 'DateTimeSplitter_Complex' and (platform.system() == "Darwin" or platform.system() == "Linux"):
+        continue
+
+    method = TestOnnxExport.generate_test_method(test_case)
+    setattr(TestOnnxExport, test_name, method)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/src/python/tests_extended/test_export_to_onnx.py b/src/python/tests_extended/test_export_to_onnx.py
index c96e6988..54967fe2 100644
--- a/src/python/tests_extended/test_export_to_onnx.py
+++ b/src/python/tests_extended/test_export_to_onnx.py
@@ -140,6 +140,10 @@
     'RangeFilter',
     'Resizer',
     'RobustScaler',
+    'RollingWindow',
+    'ForecastingPivot',
+    'LagLeadOperator',
+    'ShortDrop',
     'SkipFilter',
     'SsaChangePointDetector',
     'SsaForecaster',
@@ -361,8 +365,22 @@
     'LogisticRegressionBinaryClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]},
     'LogisticRegressionClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]},
     'LpScaler': {'num_cols': 10, 'cols': 0},
-    'MeanVarianceScaler': {'num_cols': 5, 'cols': 0},
-    'MinMaxScaler': {'num_cols': 5, 'cols': 0},
+    'MeanVarianceScaler': {
+        'num_cols': 5,
+        'cols': [('Petal_Length', 'Petal_Length', 'Petal_Length.output'),
+                 ('Petal_Width', 'Petal_Width', 'Petal_Width.output'),
+                 ('Sepal_Length', 'Sepal_Length', 'Sepal_Length.output'),
+                 ('Sepal_Width', 'Sepal_Width', 'Sepal_Width.output'),
+                 ('Setosa', 'Setosa', 'Setosa.output')]
+    },
+    'MinMaxScaler': {
+        'num_cols': 5,
+        'cols': [('Petal_Length', 'Petal_Length', 'Petal_Length.output'),
+                 ('Petal_Width', 'Petal_Width', 'Petal_Width.output'),
+                 ('Sepal_Length', 'Sepal_Length', 'Sepal_Length.output'),
+                 ('Sepal_Width', 'Sepal_Width', 'Sepal_Width.output'),
+                 ('Setosa', 'Setosa', 'Setosa.output')]
+    },
     'MutualInformationSelector': {'num_cols': 8, 'cols': 0},
     'NGramFeaturizer': {'num_cols': 273, 'cols': 0},
     'NaiveBayesClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]},
diff --git a/src/python/tests_extended/test_tensor_based.py b/src/python/tests_extended/test_tensor_based.py
new file mode 100644
index 00000000..3300a7e8
--- /dev/null
+++ b/src/python/tests_extended/test_tensor_based.py
@@ -0,0 +1,846 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# -------------------------------------------------------------------------------------------- + +import os +import platform +import tempfile +import unittest + +import numpy as np +import pandas as pd +from nimbusml import Pipeline +from nimbusml.preprocessing.schema import ColumnSelector +from nimbusml.preprocessing import ToString, ToKeyImputer, DateTimeSplitter +from nimbusml.timeseries import TimeSeriesImputer, LagLeadOperator, RollingWindow, ForecastingPivot, ShortDrop +from nimbusml.preprocessing import (TensorFlowScorer, FromKey, ToKey, + DateTimeSplitter, OnnxRunner) +import onnxruntime as rt +from data_frame_tool import DataFrameTool as DFT + +def get_tmp_file(suffix=None): + fd, file_name = tempfile.mkstemp(suffix=suffix) + fl = os.fdopen(fd, 'w') + fl.close() + return file_name + +def set_up_onnx_model(estimator, training_data): + estimator.fit(training_data) + onnx_path = get_tmp_file('.onnx') + onnx_json_path = get_tmp_file('.onnx.json') + estimator.export_to_onnx(onnx_path, + 'com.microsoft.ml', + dst_json=onnx_json_path, + onnx_version='Stable') + return rt.InferenceSession(onnx_path) + +class TestAutoMLTransforms(unittest.TestCase): + def test_datetimesplitter(self): + training_data = pd.DataFrame(data=dict( + tokens1=[1, 2, 3, 157161600] + )) + + cols_to_drop = [ + 'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter', + 'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear', + 'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel', + 'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff' + ] + + dts = DateTimeSplitter(prefix='dt') << 'tokens1' + xf = Pipeline([dts, ColumnSelector(drop_columns=cols_to_drop)]) + + sess = set_up_onnx_model(xf, training_data) + + inferencing_data = np.array([1, 2, 3, 157161600]).astype(np.int64).reshape(4,1) + result = sess.run(None, {"tokens1":inferencing_data}) + + expected_years = np.array([1970, 1970, 1970, 1974]).reshape(4, 1) + expected_month = np.array([1, 1, 1, 12]).reshape(4, 1) + expected_day = np.array([1, 1, 1, 25]).reshape(4, 1) + expected_hour = np.array([0, 0, 0, 0]).reshape(4, 1) + expected_minute = np.array([0, 0, 0, 0]).reshape(4, 1) + expected_second = np.array([1, 2, 3, 0]).reshape(4, 1) + expected_ampm = np.array([0, 0, 0, 0]).reshape(4, 1) + expected_holidayname = np.array(["", "", "", ""]).reshape(4, 1) + + np.testing.assert_array_equal(result[1],expected_years) + np.testing.assert_array_equal(result[2],expected_month) + np.testing.assert_array_equal(result[3],expected_day) + np.testing.assert_array_equal(result[4],expected_hour) + np.testing.assert_array_equal(result[5],expected_minute) + np.testing.assert_array_equal(result[6],expected_second) + np.testing.assert_array_equal(result[7],expected_ampm) + np.testing.assert_array_equal(result[8],expected_holidayname) + + def test_datetimesplitter_complex(self): + training_data = pd.DataFrame(data=dict( + tokens1=[217081624, 1751241600, 217081625] + )) + + cols_to_drop = [ + 'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter', + 'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear', + 'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel', + 'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff' + ] + + dts = DateTimeSplitter(prefix='dt') << 'tokens1' + xf = Pipeline([dts, ColumnSelector(drop_columns=cols_to_drop)]) + + sess = set_up_onnx_model(xf, training_data) + + inferencing_data = np.array([217081624, 1751241600, 217081625]).astype(np.int64).reshape(3,1) + result = sess.run(None, {"tokens1": inferencing_data}) + + expected_years = np.array([1976, 2025, 1976]).reshape(3, 1) + expected_month = 
np.array([11, 6, 11]).reshape(3, 1) + expected_day = np.array([17, 30, 17]).reshape(3, 1) + expected_hour = np.array([12, 0, 12]).reshape(3, 1) + expected_minute = np.array([27, 0, 27]).reshape(3, 1) + expected_second = np.array([4, 0, 5]).reshape(3, 1) + expected_ampm = np.array([1, 0, 1]).reshape(3, 1) + expected_holidayname = np.array(["", "", ""]).reshape(3, 1) + + np.testing.assert_array_equal(result[1],expected_years) + np.testing.assert_array_equal(result[2],expected_month) + np.testing.assert_array_equal(result[3],expected_day) + np.testing.assert_array_equal(result[4],expected_hour) + np.testing.assert_array_equal(result[5],expected_minute) + np.testing.assert_array_equal(result[6],expected_second) + np.testing.assert_array_equal(result[7],expected_ampm) + np.testing.assert_array_equal(result[8],expected_holidayname) + + def test_tokey_simple_float(self): + + training_data = pd.DataFrame(data=dict( + target=[1.0, 1.0, 1.0, 2.0] + )).astype({'target': np.float64}) + + xf = ToKeyImputer() + + sess = set_up_onnx_model(xf, training_data) + + inferencing_data = np.array([1, float("NaN"), 4, 7, float("NaN")]).astype(np.float64).reshape(5,1) + result = sess.run(None, {"target": inferencing_data}) + + expectedData = np.array([1, 1, 4, 7, 1]).astype(np.float64).reshape(5, 1) + + np.testing.assert_array_equal(result[0],expectedData) + + def test_tokey_simple_double(self): + + training_data = pd.DataFrame(data=dict( + target=[1.0, 1.0, 1.0, 2.0] + )).astype({'target': np.double}) + + xf = ToKeyImputer() + + sess = set_up_onnx_model(xf, training_data) + + inferencing_data = np.array([1, float("NaN"), 4, 7, float("NaN")]).astype(np.double).reshape(5,1) + result = sess.run(None, {"target": inferencing_data}) + + expectedData = np.array([1, 1, 4, 7, 1]).astype(np.double).reshape(5, 1) + + np.testing.assert_array_equal(result[0],expectedData) + + def test_tokey_simple_string(self): + + training_data = pd.DataFrame(data=dict( + target=["one", "one", "one", "two"] + )) + + xf = ToKeyImputer() + + sess = set_up_onnx_model(xf, training_data) + + inferencing_data = np.array(["1", "", "Hello", "", "World"]).reshape(5,1) + result = sess.run(None, {"target": inferencing_data}) + + expectedData = np.array(["1", "one", "Hello", "one", "World"]).reshape(5, 1) + + np.testing.assert_array_equal(result[0],expectedData) + + def test_tokey_2col_double(self): + + training_data = pd.DataFrame(data=dict( + data1=[1.0, 1.0, 1.0, 2.0], + data2=[2.0, 2.0, 2.0, 3.0], + )).astype({'data1': np.double, + 'data2': np.double}) + + xf = ToKeyImputer() + + sess = set_up_onnx_model(xf, training_data) + + inferencing_data1 = np.array([1.0, float("NaN"), 4.0, 7.0, float("NaN")]).astype(np.double).reshape(5,1) + inferencing_data2 = np.array([1.0, float("NaN"), 4.0, 7.0, float("NaN")]).astype(np.double).reshape(5,1) + result = sess.run(None, {"data1": inferencing_data1, "data2": inferencing_data2}) + + expectedData1 = np.array([1.0, 1.0, 4.0, 7.0, 1.0]).astype(np.double).reshape(5, 1) + expectedData2 = np.array([1.0, 2.0, 4.0, 7.0, 2.0]).astype(np.double).reshape(5, 1) + + np.testing.assert_array_equal(result[0],expectedData1) + np.testing.assert_array_equal(result[1],expectedData2) + + def test_tokey_2col_double_string(self): + + training_data = pd.DataFrame(data=dict( + data1=[1.0, 1.0, 1.0, 2.0], + data2=["two", "two", "three", "two"], + )) + + xf = ToKeyImputer() + + sess = set_up_onnx_model(xf, training_data) + + inferencing_data1 = np.array([1.0, float("NaN"), 4.0, 7.0, float("NaN")]).astype(np.double).reshape(5,1) +
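# ToKeyImputer imputes missing entries with each column's most frequent training value (here 1.0 for data1 and "two" for data2, matching the expected arrays below). +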
inferencing_data2 = np.array(["1", "", "Hello", "", "World"]).reshape(5,1) + result = sess.run(None, {"data1": inferencing_data1, "data2": inferencing_data2}) + + expectedData1 = np.array([1.0, 1.0, 4.0, 7.0, 1.0]).astype(np.double).reshape(5, 1) + expectedData2 = np.array(["1", "two", "Hello", "two", "World"]).reshape(5, 1) + + np.testing.assert_array_equal(result[0],expectedData1) + np.testing.assert_array_equal(result[1],expectedData2) + + def test_tostring_numbers_wo_dft(self): + training_data = pd.DataFrame(data=dict( + f0=[4, 4, -1, 9], + f1=[5, 5, 3.1, -0.23], + f2=[6, 6.7, np.nan, np.nan] + )).astype({'f0': np.int32, + 'f1': np.float32, + 'f2': np.float64}) + + xf = ToString(columns={'f0.out': 'f0', + 'f1.out': 'f1', + 'f2.out': 'f2'}) + + sess = set_up_onnx_model(xf, training_data) + + inferencing_f0 = np.array([4, 4, -1, 9]).astype(np.int32).reshape(4,1) + inferencing_f1 = np.array([5, 5, 3.1, -0.23]).astype(np.float32).reshape(4,1) + inferencing_f2 = np.array([6, 6.7, float("NaN"), float("NaN")]).astype(np.float64).reshape(4,1) + + # Run your inference session with your model and your data + result = sess.run(None, {"f0": inferencing_f0, "f1": inferencing_f1, "f2": inferencing_f2}) + + f0_output = np.array([['4'], ['4'], ['-1'], ['9']]).reshape(4, 1) + f1_output = np.array([['5.000000'], ['5.000000'], ['3.100000'], ['-0.230000']]).reshape(4, 1) + f2_output = np.array([['6.000000'], ['6.700000'], ['NaN'], ['NaN']]).reshape(4, 1) + + np.testing.assert_array_equal(f0_output, result[3]) + np.testing.assert_array_equal(f1_output, result[4]) + np.testing.assert_array_equal(f2_output, result[5]) + + def test_tostring_other_types_wo_dft(self): + training_data = pd.DataFrame(data=dict( + f0=[True, False], + f1=[123.45, 135453984983490.5473] + )).astype({'f0': bool, + 'f1': np.double}) + + xf = ToString(columns={'f0.out': 'f0', + 'f1.out': 'f1'}) + + sess = set_up_onnx_model(xf, training_data) + + inferencing_f0 = np.array([True, False]).astype(bool).reshape(2,1) + inferencing_f1 = np.array([123.45, 135453984983490.5473]).astype(np.double).reshape(2,1) + + # Run your inference session with your model and your data + result = sess.run(None, {"f0": inferencing_f0, "f1": inferencing_f1}) + + f0_output = np.array([['True'], ['False']]).reshape(2, 1) + # The value 135453984983490.5473 changes in the output because a 64-bit double cannot represent the input exactly.
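+ # (the nearest representable double is 135453984983490.546875; adjacent doubles at this magnitude are 2**-6 = 0.015625 apart)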
+ f1_output = np.array([['123.450000'], ['135453984983490.546875']]).reshape(2, 1) + np.testing.assert_array_equal(f0_output, result[2]) + np.testing.assert_array_equal(f1_output, result[3]) + + def test_timeseriesimputer_onegrain_twogap(self): + + training_data = pd.DataFrame(data=dict( + ts=[1, 2, 3, 5, 7], + grain=[1970, 1970, 1970, 1970, 1970], + c=[10, 11, 12, 13, 14] + )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.int32}) + + xf = TimeSeriesImputer(time_series_column='ts', + filter_columns=['c'], + grain_columns=['grain'], + impute_mode='ForwardFill', + filter_mode='Include') + + sess = set_up_onnx_model(xf, training_data) + + inferencing_ts = np.array([1, 2, 3, 5, 7]).astype(np.int64).reshape(5,1) + inferencing_grain = np.array([1970, 1970, 1970, 1970, 1970]).astype(np.int32).reshape(5,1) + inferencing_c = np.array([10, 11, 12, 13, 14]).astype(np.int32).reshape(5,1) + + # Run your inference session with your model and your data + result = sess.run(None, {"ts": inferencing_ts, "grain": inferencing_grain, 'c': inferencing_c}) + + expected_ts = np.array([[1], [2], [3], [4], [5], [6], [7]]).astype(np.single).reshape(7, 1) + expected_c = np.array([[10], [11], [12], [12], [13], [13], [14]]).astype(np.single).reshape(7, 1) + expected_isrowimputed = np.array([[False], [False], [False], [True], [False], [True], [False]]).astype(np.single).reshape(7, 1) + + np.testing.assert_array_equal(expected_ts, result[0]) + np.testing.assert_array_equal(expected_c, result[2]) + np.testing.assert_array_equal(expected_isrowimputed, result[3]) + + def test_timeseriesimputer_onegrain_twogap_backfill(self): + + training_data = pd.DataFrame(data=dict( + ts=[1, 2, 3, 5, 7], + grain=[1970, 1970, 1970, 1970, 1970], + c=[10, 11, 12, 13, 14] + )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.int32}) + + xf = TimeSeriesImputer(time_series_column='ts', + filter_columns=['c'], + grain_columns=['grain'], + impute_mode='BackFill', + filter_mode='Include') + + sess = set_up_onnx_model(xf, training_data) + + inferencing_ts = np.array([1, 2, 3, 5, 7]).astype(np.int64).reshape(5,1) + inferencing_grain = np.array([1970, 1970, 1970, 1970, 1970]).astype(np.int32).reshape(5,1) + inferencing_c = np.array([10, 11, 12, 13, 14]).astype(np.int32).reshape(5,1) + + # Run your inference session with your model and your data + result = sess.run(None, {"ts": inferencing_ts, "grain": inferencing_grain, 'c': inferencing_c}) + + expected_ts = np.array([[1], [2], [3], [4], [5], [6], [7]]).astype(np.single).reshape(7, 1) + expected_c = np.array([[10], [11], [12], [13], [13], [14], [14]]).astype(np.single).reshape(7, 1) + expected_isrowimputed = np.array([[False], [False], [False], [True], [False], [True], [False]]).astype(np.single).reshape(7, 1) + + np.testing.assert_array_equal(expected_ts, result[0]) + np.testing.assert_array_equal(expected_c, result[2]) + np.testing.assert_array_equal(expected_isrowimputed, result[3]) + + def test_timeseriesimputer_onegrain_twogap_median(self): + + training_data = pd.DataFrame(data=dict( + ts=[1, 2, 3, 5, 7], + grain=[1970, 1970, 1970, 1970, 1970], + c=[10, 11, 12, 13, 14] + )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.double}) + + xf = TimeSeriesImputer(time_series_column='ts', + filter_columns=['c'], + grain_columns=['grain'], + impute_mode='Median', + filter_mode='Include') + + sess = set_up_onnx_model(xf, training_data) + + inferencing_ts = np.array([1, 2, 3, 5, 7]).astype(np.int64).reshape(5,1) + inferencing_grain = np.array([1970, 1970, 1970, 1970,
1970]).astype(np.int32).reshape(5,1) + inferencing_c = np.array([10, 11, 12, 13, 14]).astype(np.double).reshape(5,1) + + # Run your inference session with your model and your data + result = sess.run(None, {"ts": inferencing_ts, "grain": inferencing_grain, 'c': inferencing_c}) + + expected_ts = np.array([[1], [2], [3], [4], [5], [6], [7]]).astype(np.single).reshape(7, 1) + expected_c = np.array([[10], [11], [12], [12], [13], [12], [14]]).astype(np.single).reshape(7, 1) + expected_isrowimputed = np.array([[False], [False], [False], [True], [False], [True], [False]]).astype(np.single).reshape(7, 1) + + np.testing.assert_array_equal(expected_ts, result[0]) + np.testing.assert_array_equal(expected_c, result[2]) + np.testing.assert_array_equal(expected_isrowimputed, result[3]) + + def test_timeseriesimputer_twograin_nogap(self): + + training_data = pd.DataFrame(data=dict( + ts=[1, 5, 2, 6], + grain=[1970, 1971, 1970, 1971], + c=[10, 11, 12, 13] + )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.int32}) + + xf = TimeSeriesImputer(time_series_column='ts', + filter_columns=['c'], + grain_columns=['grain'], + impute_mode='ForwardFill', + filter_mode='Include') + + sess = set_up_onnx_model(xf, training_data) + + inferencing_ts = np.array([1, 5, 2, 6]).astype(np.int64).reshape(4,1) + inferencing_grain = np.array([1970, 1971, 1970, 1971]).astype(np.int32).reshape(4,1) + inferencing_c = np.array([10, 11, 12, 13]).astype(np.int32).reshape(4,1) + + # Run your inference session with your model and your data + result = sess.run(None, {"ts": inferencing_ts, "grain": inferencing_grain, 'c': inferencing_c}) + + expected_ts = np.array([[1], [5], [2], [6]]).astype(np.single).reshape(4, 1) + expected_grain = np.array([[1970], [1971], [1970], [1971]]).astype(np.single).reshape(4, 1) + expected_c = np.array([[10], [11], [12], [13]]).astype(np.single).reshape(4, 1) + expected_isrowimputed = np.array([[False], [False], [False], [False]]).astype(np.single).reshape(4, 1) + + + np.testing.assert_array_equal(expected_ts, result[0]) + np.testing.assert_array_equal(expected_grain, result[1]) + np.testing.assert_array_equal(expected_c, result[2]) + np.testing.assert_array_equal(expected_isrowimputed, result[3]) + + def test_timeseriesimputer_twograin_twogap(self): + + training_data = pd.DataFrame(data=dict( + ts=[0, 5, 1, 6], + grain=[1970, 1971, 1970, 1971], + c=[10, 11, 12, 13] + )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.int32}) + + + xf = TimeSeriesImputer(time_series_column='ts', + filter_columns=['c'], + grain_columns=['grain'], + impute_mode='ForwardFill', + filter_mode='Include') + + sess = set_up_onnx_model(xf, training_data) + + inferencing_ts = np.array([0, 5, 3, 8]).astype(np.int64).reshape(4,1) + inferencing_grain = np.array([1970, 1971, 1970, 1971]).astype(np.int32).reshape(4,1) + inferencing_c = np.array([10, 11, 12, 13]).astype(np.int32).reshape(4,1) + + # Run your inference session with your model and your data + result = sess.run(None, {"ts": inferencing_ts, "grain": inferencing_grain, 'c': inferencing_c}) + + expected_ts = np.array([[0], [5], [1], [2], [3], [6], [7], [8]]).astype(np.single).reshape(8, 1) + expected_grain = np.array([[1970], [1971], [1970], [1970], [1970], [1971], [1971], [1971]]).astype(np.single).reshape(8, 1) + expected_c = np.array([[10], [11], [10], [10], [12], [11], [11], [13]]).astype(np.single).reshape(8, 1) + expected_isrowimputed = np.array([[False], [False], [True], [True], [False], [True], [True], [False]]).astype(np.single).reshape(8, 1) + + 
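# Imputation runs per grain: rows ts=1,2 are inserted for grain 1970 and ts=6,7 for grain 1971, each forward-filled and flagged as imputed below. +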
np.testing.assert_array_equal(expected_ts, result[0]) + np.testing.assert_array_equal(expected_grain, result[1]) + np.testing.assert_array_equal(expected_c, result[2]) + np.testing.assert_array_equal(expected_isrowimputed, result[3]) + + def test_timeseriesimputer_onegrain_onegap_two_filtercolumn(self): + + training_data = pd.DataFrame(data=dict( + ts=[1, 2, 3, 5], + grain=[1970, 1970, 1970, 1970], + c3=[10, 13, 15, 20], + c4=[19, 12, 16, 19] + )).astype({'ts': np.int64, 'grain': np.int32, 'c3': np.int32, 'c4': np.int32}) + + xf = TimeSeriesImputer(time_series_column='ts', + grain_columns=['grain'], + filter_columns=['c3', 'c4'], + impute_mode='ForwardFill', + filter_mode='Include') + + sess = set_up_onnx_model(xf, training_data) + + inferencing_ts = np.array([1, 2, 3, 5]).astype(np.int64).reshape(4,1) + inferencing_grain = np.array([1970, 1970, 1970, 1970]).astype(np.int32).reshape(4,1) + inferencing_c3 = np.array([10, 13, 15, 20]).astype(np.int32).reshape(4,1) + inferencing_c4 = np.array([19, 12, 16, 19]).astype(np.int32).reshape(4,1) + + # Run your inference session with your model and your data + result = sess.run(None, {"ts": inferencing_ts, "grain": inferencing_grain, "c3": inferencing_c3, "c4": inferencing_c4}) + + expected_ts = np.array([[1], [2], [3], [4], [5]]).astype(np.single).reshape(5, 1) + expected_c3 = np.array([[10], [13], [15], [15], [20]]).astype(np.single).reshape(5, 1) + expected_c4 = np.array([[19], [12], [16], [16], [19]]).astype(np.single).reshape(5, 1) + expected_isrowimputed = np.array([[False], [False], [False], [True], [False]]).astype(np.single).reshape(5, 1) + + np.testing.assert_array_equal(expected_ts, result[0]) + np.testing.assert_array_equal(expected_c3, result[2]) + np.testing.assert_array_equal(expected_c4, result[3]) + np.testing.assert_array_equal(expected_isrowimputed, result[4]) + + def test_rolling_window_simple_mean_wo_dft(self): + + training_data = pd.DataFrame(data=dict( + colA=[1.0, 2.0, 3.0, 4.0], + grainA=["one", "one", "one", "one"] + )) + + xf = RollingWindow(columns={'colA1': 'colA'}, + grain_columns=['grainA'], + window_calculation='Mean', + max_window_size=2, + horizon=2) + + sess = set_up_onnx_model(xf, training_data) + + inferencing_grainA = np.array(["one", "one", "one", "one"]).reshape(4,1) + inferencing_colA = np.array([1.0, 2.0, 3.0, 4.0]).astype(np.double).reshape(4,1) + + # Run your inference session with your model and your data + result = sess.run(None, {"grainA": inferencing_grainA, "colA": inferencing_colA }) + + expected = np.array([[[float("NaN"), float("NaN")]], [[float("NaN"), 1.0]], [[1.0, 1.5]], [[1.5, 2.5]]]).astype(np.single).reshape(4, 1, 2) + + np.testing.assert_array_equal(expected, result[2]) + + def test_rolling_window_simple_max_wo_dft(self): + + training_data = pd.DataFrame(data=dict( + colA=[1.0, 2.0, 3.0, 4.0], + grainA=["one", "one", "one", "one"] + )) + + xf = RollingWindow(columns={'colA1': 'colA'}, + grain_columns=['grainA'], + window_calculation='Max', + max_window_size=1, + horizon=1) + + sess = set_up_onnx_model(xf, training_data) + + inferencing_grainA = np.array(["one", "one", "one", "one"]).reshape(4,1) + inferencing_colA = np.array([1.0, 2.0, 3.0, 4.0]).astype(np.double).reshape(4,1) + + # Run your inference session with your model and your data + result = sess.run(None, {"grainA": inferencing_grainA, "colA": inferencing_colA}) + + expected = np.array([[[float("NaN")]], [[1.0]], [[2.0]], [[3.0]]]).astype(np.single).reshape(4, 1, 1) + + np.testing.assert_array_equal(expected, result[2]) + + def 
test_rolling_window_simple_min_wo_dft(self): + + training_data = pd.DataFrame(data=dict( + colA=[1.0, 2.0, 3.0, 4.0], + grainA=["one", "one", "one", "one"] + )) + + xf = RollingWindow(columns={'colA1': 'colA'}, + grain_columns=['grainA'], + window_calculation='Min', + max_window_size=1, + horizon=1) + + sess = set_up_onnx_model(xf, training_data) + + inferencing_grainA = np.array(["one", "one", "one", "one"]).reshape(4,1) + inferencing_colA = np.array([1.0, 2.0, 3.0, 4.0]).astype(np.double).reshape(4,1) + + # Run your inference session with your model and your data + result = sess.run(None, {"grainA": inferencing_grainA, "colA": inferencing_colA }) + + expected = np.array([[[float("NaN")]], [[1.0]], [[2.0]], [[3.0]]]).astype(np.single).reshape(4, 1, 1) + + np.testing.assert_array_equal(expected, result[2]) + + def test_rolling_window_multiple_grains_wo_dft(self): + + training_data = pd.DataFrame(data=dict( + colA=[1.0, 2.0, 1.0, 2.0], + grainA=["one", "one", "two", "two"] + )) + + xf = RollingWindow(columns={'colA1': 'colA'}, + grain_columns=['grainA'], + window_calculation='Mean', + max_window_size=1, + horizon=1) + + sess = set_up_onnx_model(xf, training_data) + + inferencing_grainA = np.array(["one", "one", "two", "two"]).reshape(4,1) + inferencing_colA = np.array([1.0, 2.0, 1.0, 2.0]).astype(np.double).reshape(4,1) + + # Run your inference session with your model and your data + result = sess.run(None, {"grainA": inferencing_grainA, "colA": inferencing_colA}) + + expected = np.array([[[float("NaN")]], [[1.0]], [[float("NaN")]], [[1.0]]]).astype(np.single).reshape(4, 1, 1) + + np.testing.assert_array_equal(expected, result[2]) + + def test_rolling_window_non_string_grain_wo_dft(self): + + training_data = pd.DataFrame(data=dict( + colA=[1.0, 2.0, 3.0, 4.0], + grainA=[True, True, True, True] + )) + + xf = RollingWindow(columns={'colA1': 'colA'}, + grain_columns=['grainA'], + window_calculation='Mean', + max_window_size=1, + horizon=1) + + sess = set_up_onnx_model(xf, training_data) + + inferencing_grainA = np.array([True, True, True, True]).reshape(4,1) + inferencing_colA = np.array([1.0, 2.0, 3.0, 4.0]).astype(np.double).reshape(4,1) + + # Run your inference session with your model and your data + result = sess.run(None, {"grainA": inferencing_grainA, "colA": inferencing_colA}) + + expected = np.array([[[float("NaN")]], [[1.0]], [[2.0]], [[3.0]]]).astype(np.single).reshape(4, 1, 1) + + np.testing.assert_array_equal(expected, result[2]) + + + def test_laglead_lag_wo_dft(self): + + training_data = pd.DataFrame(data=dict( + colA=[1.0, 2.0, 3.0, 4.0], + grainA=["one", "one", "one", "one"] + )) + + xf = LagLeadOperator(columns={'colA1': 'colA'}, + grain_columns=['grainA'], + offsets=[-2, -1], + horizon=2) + + sess = set_up_onnx_model(xf, training_data) + + inferencing_grainA = np.array(["one", "one", "one", "one"]).reshape(4,1) + inferencing_colA = np.array([1, 2, 3, 4]).astype(np.double).reshape(4,1) + + # Run your inference session with your model and your data + result = sess.run(None, {"grainA": inferencing_grainA, "colA": inferencing_colA}) + + expected = np.array([[[float("NaN"), float("NaN")], [float("NaN"), float("NaN")]], + [[float("NaN"), float("NaN")], [float("NaN"), 1.0]], + [[float("NaN"), 1.0], [1.0, 2.0]], + [[1.0, 2.0], [2.0, 3.0]]]).astype(np.single).reshape(4, 2, 2) + + np.testing.assert_array_equal(expected, result[2]) + + def test_laglead_lead_wo_dft(self): + + training_data = pd.DataFrame(data=dict( + colA=[1.0, 2.0, 3.0, 4.0], + grainA=["one", "one", "one", "one"] + )) + 
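+ # Positive offsets are leads: each row reads values ahead of itself, so the trailing rows fill with NaN (the mirror image of the lag test above).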
+ xf = LagLeadOperator(columns={'colA1': 'colA'}, + grain_columns=['grainA'], + offsets=[1, 2], + horizon=2) + + sess = set_up_onnx_model(xf, training_data) + + inferencing_grainA = np.array(["one", "one", "one", "one"]).reshape(4,1) + inferencing_colA = np.array([1, 2, 3, 4]).astype(np.double).reshape(4,1) + + # Run your inference session with your model and your data + result = sess.run(None, {"grainA": inferencing_grainA, "colA": inferencing_colA}) + + expected = np.array([[[1.0, 2.0], [2.0, 3.0]], + [[2.0, 3.0], [3.0, 4.0]], + [[3.0, 4.0], [4.0, float("NaN")]], + [[4.0, float("NaN")], [float("NaN"), float("NaN")]]]).astype(np.single).reshape(4, 2, 2) + + np.testing.assert_array_equal(expected, result[2]) + + def test_laglead_complex_wo_dft(self): + + training_data = pd.DataFrame(data=dict( + colA=[1.0, 2.0, 3.0, 4.0], + grainA=["one", "one", "one", "one"] + )) + + xf = LagLeadOperator(columns={'colA1': 'colA'}, + grain_columns=['grainA'], + offsets=[-2, -1, 1, 2], + horizon=2) + + sess = set_up_onnx_model(xf, training_data) + + inferencing_grainA = np.array(["one", "one", "one", "one"]).reshape(4,1) + inferencing_colA = np.array([1, 2, 3, 4]).astype(np.double).reshape(4,1) + + # Run your inference session with your model and your data + result = sess.run(None, {"grainA": inferencing_grainA, "colA": inferencing_colA}) + + expected = np.array([[[float("NaN"), float("NaN")], [float("NaN"), float("NaN")], [1.0, 2.0], [2.0, 3.0]], + [[float("NaN"), float("NaN")], [float("NaN"), 1.0], [2.0, 3.0], [3.0, 4.0]], + [[float("NaN"), 1.0], [1.0, 2.0], [3.0, 4.0], [4.0, float("NaN")]], + [[1.0, 2.0], [2.0, 3.0], [4.0, float("NaN")], [float("NaN"), float("NaN")]]]).astype(np.single).reshape(4, 4, 2) + + np.testing.assert_array_equal(expected, result[2]) + + def test_short_drop_wo_dft(self): + + training_data = pd.DataFrame(data=dict( + colA=[1.0, 2.0, 3.0, 4.0], + grainA=["one", "one", "one", "two"] + )) + + xf = ShortDrop(columns={'colA1': 'colA'}, + grain_columns=['grainA'], + min_rows=2) + + sess = set_up_onnx_model(xf, training_data) + + inferencing_grainA = np.array(["one", "one", "one", "two"]).reshape(4,1) + inferencing_colA = np.array([1, 2, 3, 4]).astype(np.double).reshape(4,1) + + # Run your inference session with your model and your data + result = sess.run(None, {"grainA": inferencing_grainA, "colA": inferencing_colA}) + + colA_expected = np.array([[1.0], [2.0], [3.0]]).reshape(3, 1) + grainA_expected = np.array([["one"], ["one"], ["one"]]).reshape(3, 1) + np.testing.assert_array_equal(colA_expected, result[0]) + np.testing.assert_array_equal(grainA_expected, result[1]) + + def test_integration_rollwin_pivot(self): + + training_data = pd.DataFrame(data=dict( + colA=[1.0, 3.0, 5.0, 7.0], + grainA=['1970', '1970', '1970', '1970'] + )) + + xf0 = RollingWindow(columns={'colA1': 'colA'}, + grain_columns=['grainA'], + window_calculation='Mean', + max_window_size=2, + horizon=2) + + xf1 = ForecastingPivot(columns_to_pivot=['colA1']) + + xf = Pipeline([xf0, xf1]) + + sess = set_up_onnx_model(xf, training_data) + + inferencing_colA = np.array([1.0, 3.0, 5.0, 7.0]).reshape(4,1) + inferencing_grainA = np.array(['1970', '1970', '1970', '1970']).reshape(4,1) + + # Run your inference session with your model and your data + result = sess.run(None, {"colA": inferencing_colA, "grainA": inferencing_grainA}) + + expected_colA = np.array([[3.0], [5.0], [5.0], [7.0], [7.0]]) + expected_grainA = np.array([['1970'], ['1970'], ['1970'], ['1970'], ['1970']]) + expected_output = np.array([[1.0], [1.0], [2.0], 
[2.0], [4.0]]) + np.testing.assert_array_equal(expected_colA, result[0]) + np.testing.assert_array_equal(expected_grainA, result[1]) + np.testing.assert_array_equal(expected_output, result[3]) + + def test_integration_laglead_pivot(self): + + training_data = pd.DataFrame(data=dict( + colA=[1.0, 2.0, 3.0, 4.0], + grainA=["one", "one", "one", "one"] + )) + + xf0 = LagLeadOperator(columns={'colA1': 'colA'}, + grain_columns=['grainA'], + offsets=[-2, -1], + horizon=2) + + xf1 = ForecastingPivot(columns_to_pivot=['colA1']) + + xf = Pipeline([xf0, xf1]) + + sess = set_up_onnx_model(xf, training_data) + + inferencing_grainA = np.array(["one", "one", "one", "one"]).reshape(4,1) + inferencing_colA = np.array([1, 2, 3, 4]).astype(np.double).reshape(4,1) + + # Run your inference session with your model and your data + result = sess.run(None, {"colA": inferencing_colA, "grainA": inferencing_grainA}) + + expected_colA = np.array([[3.0], [4.0], [4.0]]) + expected_grainA = np.array([['one'], ['one'], ['one']]) + expected_output_lag2 = np.array([[1.0], [1.0], [2.0]]) + expected_output_lag1 = np.array([[2.0], [2.0], [3.0]]) + np.testing.assert_array_equal(expected_colA, result[0]) + np.testing.assert_array_equal(expected_grainA, result[1]) + np.testing.assert_array_equal(expected_output_lag2, result[3]) + np.testing.assert_array_equal(expected_output_lag1, result[4]) + + def test_pivot_one_matrix(self): + + training_data = pd.DataFrame(data=dict( + colA=[1.0], + grainA=["one"] + )) + + xf0 = LagLeadOperator(columns={'colA1': 'colA'}, + grain_columns=['grainA'], + offsets=[-2, -1, 1], + horizon=4) + binarydata = xf0.fit_transform(training_data, as_binary_data_stream=True) + + xf1 = ForecastingPivot(columns_to_pivot=['colA1']) + + sess = set_up_onnx_model(xf1, binarydata) + + inferencing_grainA = np.array(["one"]).reshape(1,1) + inferencing_colA = np.array([1]).astype(np.double).reshape(1,1) + inferencing_colA1 = np.array([1, 4, 6, float("NaN"), 2, 5, float("NaN"), float("NaN"), 3, float("NaN"), float("NaN"), 7]).astype(np.double).reshape(1,3,4) + + result = sess.run(None, {"grainA": inferencing_grainA,"colA": inferencing_colA, "colA1": inferencing_colA1}) + + expectedHorizon = np.array([4]).astype(np.uint32).reshape(1, 1) + expectedColA = np.array([1]).astype(np.double).reshape(1, 1) + expectedLag1 = np.array([1]).astype(np.double).reshape(1, 1) + expectedLead1 = np.array([2]).astype(np.double).reshape(1, 1) + expectedLag2 = np.array([3]).astype(np.double).reshape(1, 1) + + np.testing.assert_array_equal(result[0], expectedColA) + np.testing.assert_array_equal(result[2], expectedHorizon) + np.testing.assert_array_equal(result[3], expectedLag1) + np.testing.assert_array_equal(result[4], expectedLead1) + np.testing.assert_array_equal(result[5], expectedLag2) + + def test_pivot_two_matrix(self): + + training_data = pd.DataFrame(data=dict( + colA=[1.0, 2.0], + grainA=["one", "one"], + colB=[1.0, 3.0], + grainB=["one", "one"] + )) + + xf0 = LagLeadOperator(columns={'colA1': 'colA'}, + grain_columns=['grainA'], + offsets=[-2, -1, 1], + horizon=4) + + xf1 = LagLeadOperator(columns={'colB1': 'colB'}, + grain_columns=['grainB'], + offsets=[-2, -1], + horizon=4) + xf2 = Pipeline([xf0, xf1]) + binarydata = xf2.fit_transform(training_data, as_binary_data_stream=True) + + xf3 = ForecastingPivot(columns_to_pivot=['colA1', 'colB1']) + + sess = set_up_onnx_model(xf3, binarydata) + + inferencing_colA = np.array([1,2]).astype(np.double).reshape(2,1) + inferencing_colB = np.array([1,2]).astype(np.double).reshape(2,1) + 
inferencing_grainA = np.array(["one", "one"]).reshape(2,1) + inferencing_grainB = np.array(["one", "one"]).reshape(2,1) + inferencing_colA1 = np.array([1, 6, 3, 9, 2, 4, 5, 8, float("NaN"), float("NaN"), 7, 10, + 1, 6, 9, 3, 2, 4, 8, 5, float("NaN"), float("NaN"), 10, 7]).astype(np.double).reshape(2,3,4) + inferencing_colB1 = np.array([1, float("NaN"), 5, 6, 2, float("NaN"), 3, 4, + 1, float("NaN"), 6, 5, 2, float("NaN"), 4, 3]).astype(np.double).reshape(2,2,4) + + result = sess.run(None, {"colA": inferencing_colA, "colB": inferencing_colB, "grainA": inferencing_grainA, + "grainB": inferencing_grainB, "colA1": inferencing_colA1, "colB1": inferencing_colB1}) + + expectedColA = np.array([1, 1, 2, 2]).astype(np.double).reshape(4, 1) + expectedColB = np.array([1, 1, 2, 2]).astype(np.double).reshape(4, 1) + expectedHorizon = np.array([2, 1, 2, 1]).astype(np.uint32).reshape(4, 1) + expectedLag1 = np.array([3, 9, 9, 3]).astype(np.double).reshape(4, 1) + expectedLead1 = np.array([5, 8, 8, 5]).astype(np.double).reshape(4, 1) + expectedLag2 = np.array([7, 10, 10, 7]).astype(np.double).reshape(4, 1) + expectedVec2Lag2 = np.array([5, 6, 6, 5]).astype(np.double).reshape(4, 1) + expectedVec2Lag5 = np.array([3, 4, 4, 3]).astype(np.double).reshape(4, 1) + + np.testing.assert_array_equal(result[0], expectedColA) + np.testing.assert_array_equal(result[2], expectedColB) + np.testing.assert_array_equal(result[4], expectedHorizon) + np.testing.assert_array_equal(result[5], expectedLag1) + np.testing.assert_array_equal(result[6], expectedLead1) + np.testing.assert_array_equal(result[7], expectedLag2) + np.testing.assert_array_equal(result[8], expectedVec2Lag2) + np.testing.assert_array_equal(result[9], expectedVec2Lag5) + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/tests_extended/test_tensor_invalid_input.py b/src/python/tests_extended/test_tensor_invalid_input.py new file mode 100644 index 00000000..d015b107 --- /dev/null +++ b/src/python/tests_extended/test_tensor_invalid_input.py @@ -0,0 +1,408 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- + +import os +import sys +import io +import platform +import tempfile +import unittest + +import numpy as np +import pandas as pd +from nimbusml import Pipeline +from nimbusml.preprocessing.schema import ColumnSelector +from nimbusml.preprocessing import ToString, ToKeyImputer, DateTimeSplitter +from scipy.sparse import csr_matrix +from nimbusml.timeseries import TimeSeriesImputer, LagLeadOperator, RollingWindow, ForecastingPivot, ShortDrop +from nimbusml.preprocessing import (TensorFlowScorer, FromKey, ToKey, + DateTimeSplitter, OnnxRunner) +import onnxruntime as rt +from data_frame_tool import DataFrameTool as DFT + +TEST_CASES_FOR_INVALID_INPUT = { + 'DateTimeSplitter_Bad_Input_Data', + 'DateTimeSplitter_Bad_Input_Type', + 'DateTimeSplitter_Bad_Input_Shape', + 'ToKey_Bad_Input_Type', + 'ToKey_Bad_Input_Shape', + 'ToString_Bad_Input_Type', + 'ToString_Bad_Input_Shape', + 'TimeSeriesImputer_Bad_Input_Data', + 'TimeSeriesImputer_Bad_Input_Type', + 'TimeSeriesImputer_Bad_Input_Shape', + 'RollingWindow_Bad_Input_Type', + 'RollingWindow_Bad_Input_Shape', + 'LagLead_Bad_Input_Type', + 'LagLead_Bad_Input_Shape', + 'ShortDrop_Bad_Input_Type', + 'ShortDrop_Bad_Input_Shape', + 'ShortDrop_Drop_All' +} + +INSTANCES_FOR_INVALID_INPUT = { + 'DateTimeSplitter_Bad_Input_Data': Pipeline([ + DateTimeSplitter(prefix='dt') << 'tokens1', + ColumnSelector(drop_columns=[ + 'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter', + 'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear', + 'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel', + 'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff','dtHolidayName' + ]) + ]), + 'DateTimeSplitter_Bad_Input_Type': Pipeline([ + DateTimeSplitter(prefix='dt') << 'tokens1', + ColumnSelector(drop_columns=[ + 'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter', + 'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear', + 'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel', + 'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff','dtHolidayName' + ]) + ]), + 'DateTimeSplitter_Bad_Input_Shape': Pipeline([ + DateTimeSplitter(prefix='dt') << 'tokens1', + ColumnSelector(drop_columns=[ + 'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter', + 'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear', + 'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel', + 'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff','dtHolidayName' + ]) + ]), + 'ToKey_Bad_Input_Type': ToKeyImputer(), + 'ToKey_Bad_Input_Shape': ToKeyImputer(), + 'ToString_Bad_Input_Type': ToString(columns={'f0.out': 'f0', + 'f1.out': 'f1', + 'f2.out': 'f2'}), + 'ToString_Bad_Input_Shape': ToString(columns={'f0.out': 'f0', + 'f1.out': 'f1', + 'f2.out': 'f2'}), + 'TimeSeriesImputer_Bad_Input_Data': TimeSeriesImputer(time_series_column='ts', + filter_columns=['c'], + grain_columns=['grain'], + impute_mode='ForwardFill', + filter_mode='Include'), + 'TimeSeriesImputer_Bad_Input_Type': TimeSeriesImputer(time_series_column='ts', + filter_columns=['c'], + grain_columns=['grain'], + impute_mode='ForwardFill', + filter_mode='Include'), + 'TimeSeriesImputer_Bad_Input_Shape': TimeSeriesImputer(time_series_column='ts', + filter_columns=['c'], + grain_columns=['grain'], + impute_mode='ForwardFill', + filter_mode='Include'), + 'RollingWindow_Bad_Input_Type': RollingWindow(columns={'colA1': 'colA'}, + grain_columns=['grainA'], + window_calculation='Mean', + max_window_size=2, + horizon=2), + 'RollingWindow_Bad_Input_Shape': RollingWindow(columns={'colA1': 'colA'}, + 
grain_columns=['grainA'], + window_calculation='Mean', + max_window_size=2, + horizon=2), + 'LagLead_Bad_Input_Type': LagLeadOperator(columns={'colA1': 'colA'}, + grain_columns=['grainA'], + offsets=[-1], + horizon=1), + 'LagLead_Bad_Input_Shape': LagLeadOperator(columns={'colA1': 'colA'}, + grain_columns=['grainA'], + offsets=[-1], + horizon=1), + 'ShortDrop_Bad_Input_Type': ShortDrop(columns={'colA1': 'colA'}, + grain_columns=['grainA'], + min_rows=2), + 'ShortDrop_Bad_Input_Shape': ShortDrop(columns={'colA1': 'colA'}, + grain_columns=['grainA'], + min_rows=2), + 'ShortDrop_Drop_All': ShortDrop(columns={'colA1': 'colA'}, + grain_columns=['grainA'], + min_rows=15) +} + +TRAINING_DATASETS_FOR_INVALID_INPUT = { + 'DateTimeSplitter_Bad_Input_Data': pd.DataFrame(data=dict( + tokens1=[1, 2, 3, 157161600] + )), + 'DateTimeSplitter_Bad_Input_Type': pd.DataFrame(data=dict( + tokens1=[1, 2, 3, 157161600] + )), + 'DateTimeSplitter_Bad_Input_Shape': pd.DataFrame(data=dict( + tokens1=[1, 2, 3, 157161600] + )), + 'ToKey_Bad_Input_Type': pd.DataFrame(data=dict( + target=[1.0, 1.0, 1.0, 2.0] + )).astype({'target': np.float64}), + 'ToKey_Bad_Input_Shape': pd.DataFrame(data=dict( + target=[1.0, 1.0, 1.0, 2.0] + )).astype({'target': np.float64}), + 'ToString_Bad_Input_Type': pd.DataFrame(data=dict( + f0= [4, 4, -1, 9], + f1= [5, 5, 3.1, -0.23], + f2= [6, 6.7, np.nan, np.nan] + )), + 'ToString_Bad_Input_Shape': pd.DataFrame(data=dict( + f0= [4, 4, -1, 9], + f1= [5, 5, 3.1, -0.23], + f2= [6, 6.7, np.nan, np.nan] + )), + 'TimeSeriesImputer_Bad_Input_Data': pd.DataFrame(data=dict( + ts=[1, 2, 3, 5, 7], + grain=[1970, 1970, 1970, 1970, 1970], + c=[10, 11, 12, 13, 14] + )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.int32}), + 'TimeSeriesImputer_Bad_Input_Type': pd.DataFrame(data=dict( + ts=[1, 2, 3, 5, 7], + grain=[1970, 1970, 1970, 1970, 1970], + c=[10, 11, 12, 13, 14] + )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.int32}), + 'TimeSeriesImputer_Bad_Input_Shape': pd.DataFrame(data=dict( + ts=[1, 2, 3, 5, 7], + grain=[1970, 1970, 1970, 1970, 1970], + c=[10, 11, 12, 13, 14] + )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.int32}), + 'RollingWindow_Bad_Input_Type': pd.DataFrame(data=dict( + colA=[1.0, 2.0, 3.0, 4.0], + grainA=["one", "one", "one", "one"] + )), + 'RollingWindow_Bad_Input_Shape': pd.DataFrame(data=dict( + colA=[1.0, 2.0, 3.0, 4.0], + grainA=["one", "one", "one", "one"] + )), + 'LagLead_Bad_Input_Type':pd.DataFrame(data=dict( + colA=[1.0, 2.0, 3.0, 4.0], + grainA=["one", "one", "one", "one"] + )), + 'LagLead_Bad_Input_Shape': pd.DataFrame(data=dict( + colA=[1.0, 2.0, 3.0, 4.0], + grainA=["one", "one", "one", "one"] + )), + 'ShortDrop_Bad_Input_Type': pd.DataFrame(data=dict( + colA=[1.0, 2.0, 3.0, 4.0], + grainA=["one", "one", "one", "two"] + )), + 'ShortDrop_Bad_Input_Shape': pd.DataFrame(data=dict( + colA=[1.0, 2.0, 3.0, 4.0], + grainA=["one", "one", "one", "two"] + )), + 'ShortDrop_Drop_All': pd.DataFrame(data=dict( + colA=[1.0, 2.0, 3.0, 4.0], + grainA=["one", "one", "one", "two"] + )) +} + +INFERENCE_DATASETS_FOR_INVALID_INPUT = { + 'DateTimeSplitter_Bad_Input_Data': {"tokens1": np.array([1, 2, 3, -3]).reshape(4,1)}, + 'DateTimeSplitter_Bad_Input_Type': {"tokens1": np.array([1, 2, 3, "3"]).reshape(4,1)}, + 'DateTimeSplitter_Bad_Input_Shape': {"tokens1": np.array([[1, 2, 3, 3], [1, 2, 3, 4]]).reshape(4,2)}, + 'ToKey_Bad_Input_Type': {"target": np.array([1, float("NaN"), "", 7, float("NaN")]).reshape(5,1)}, + 'ToKey_Bad_Input_Shape': {"target": np.array([[1, float("NaN")], 
[1, float("NaN")]]).reshape(2,2)}, + 'ToString_Bad_Input_Type': {"f0": np.array([4, 4, -1, 9]).astype(np.int32).reshape(4,1), + "f1": np.array([5, 5, 3.1, -0.23]).astype(np.float32).reshape(4,1), + "f2": np.array([6, "6.7", float("NaN"), float("NaN")]).reshape(4,1)}, + 'ToString_Bad_Input_Shape': {"f0": np.array([[4, 4, -1, 9],[4, 4, -1, 9]]).astype(np.int32).reshape(4,2), + "f1": np.array([5, 5, 3.1, -0.23]).astype(np.float32).reshape(4,1), + "f2": np.array([6, 6.7, float("NaN"), float("NaN")]).reshape(4,1)}, + 'TimeSeriesImputer_Bad_Input_Data': {"ts": np.array([1, 2, 3, -5, 7]).astype(np.int64).reshape(5,1), + "grain": np.array([1970, 1970, 1970, 1970, 1970]).reshape(5,1), + "c": np.array([10, 11, 12, 13, 14]).astype(np.int32).reshape(5,1)}, + 'TimeSeriesImputer_Bad_Input_Type': {"ts": np.array([1, 2, 3, 5, 7]).astype(np.int64).reshape(5,1), + "grain": np.array([1970, "1970", 1970, 1970, 1970]).reshape(5,1), + "c": np.array([10, 11, 12, 13, 14]).astype(np.int32).reshape(5,1)}, + 'TimeSeriesImputer_Bad_Input_Shape': {"ts": np.array([[1, 2, 3, 5, 7], [1, 2, 3, 5, 7]]).astype(np.int64).reshape(5,2), + "grain": np.array([1970, 1970, 1970, 1970, 1970]).reshape(5,1), + "c": np.array([10, 11, 12, 13, 14]).astype(np.int32).reshape(5,1)}, + 'RollingWindow_Bad_Input_Type': {"grainA": np.array(["one", "one", "one", "one"]).reshape(4,1), + "colA": np.array([1.0, 2.0, 3.0, "4.0"]).reshape(4,1)}, + 'RollingWindow_Bad_Input_Shape': {"grainA": np.array(["one", "one", "one", "one"]).reshape(4,1), + "colA": np.array([[1.0, 2.0, 3.0, 4.0],[1.0, 2.0, 3.0, 4.0]]).reshape(4,2)}, + 'LagLead_Bad_Input_Type': {"grainA": np.array(["one", "one", "one", "one"]).reshape(4,1), + "colA": np.array([1, 2, 3, "4"]).reshape(4,1)}, + 'LagLead_Bad_Input_Shape': {"grainA": np.array(["one", "one", "one", "one"]).reshape(4,1), + "colA": np.array([[1.0, 2.0, 3.0, 4.0],[1.0, 2.0, 3.0, 4.0]]).reshape(4,2)}, + 'ShortDrop_Bad_Input_Type': {"grainA": np.array(["one", "one", "one", "two"]).reshape(4,1), + "colA": np.array([1, 2, 3, "4"]).reshape(4,1)}, + 'ShortDrop_Bad_Input_Shape': {"grainA": np.array(["one", "one", "one", "two"]).reshape(4,1), + "colA": np.array([[1.0, 2.0, 3.0, 4.0],[1.0, 2.0, 3.0, 4.0]]).reshape(4,2)}, + 'ShortDrop_Drop_All': {"grainA": np.array(["one", "one", "one", "two"]).reshape(4,1), + "colA": np.array([[1.0, 2.0, 3.0, 4.0],[1.0, 2.0, 3.0, 4.0]]).reshape(4,2)} +} + +def get_file_size(file_path): + file_size = 0 + try: + file_size = os.path.getsize(file_path) + except: + pass + return file_size + +def get_tmp_file(suffix=None): + fd, file_name = tempfile.mkstemp(suffix=suffix) + fl = os.fdopen(fd, 'w') + fl.close() + return file_name + +class CaptureOutputContext(): + """ + Context which can be used for + capturing stdout and stderr. 
+ """ + def __enter__(self): + self.orig_stdout = sys.stdout + self.orig_stderr = sys.stderr + self.stdout_capturer = io.StringIO() + self.stderr_capturer = io.StringIO() + sys.stdout = self.stdout_capturer + sys.stderr = self.stderr_capturer + return self + + def __exit__(self, *args): + sys.stdout = self.orig_stdout + sys.stderr = self.orig_stderr + self.stdout = self.stdout_capturer.getvalue() + self.stderr = self.stderr_capturer.getvalue() + + if self.stdout: + print(self.stdout) + + if self.stderr: + print(self.stderr) + + # free up some memory + del self.stdout_capturer + del self.stderr_capturer + +def validate_bad_input(self, estimator, test_case): + onnx_path = get_tmp_file('.onnx') + onnx_json_path = get_tmp_file('.onnx.json') + + exported = False + throw_expected_error = False + try: + dataset = TRAINING_DATASETS_FOR_INVALID_INPUT.get(test_case) + + estimator.fit(dataset) + + with CaptureOutputContext() as output: + estimator.export_to_onnx(onnx_path, + 'com.microsoft.ml', + dst_json=onnx_json_path, + onnx_version='Stable') + except Exception as e: + print(e) + + onnx_file_size = get_file_size(onnx_path) + onnx_json_file_size = get_file_size(onnx_json_path) + + if (output and + (onnx_file_size != 0) and + (onnx_json_file_size != 0) and + (not 'cannot save itself as ONNX' in output.stdout) and + (not 'Warning: We do not know how to save the predictor as ONNX' in output.stdout)): + + exported = True + + sess = rt.InferenceSession(onnx_path) + + with self.assertRaisesRegex(Exception, "ONNXRuntimeError"): + invalid_data = INFERENCE_DATASETS_FOR_INVALID_INPUT.get(test_case) + pred = sess.run(None, invalid_data) + + throw_expected_error = True + + os.remove(onnx_path) + os.remove(onnx_json_path) + return {'exported': exported, 'throw_expected_error': throw_expected_error} + +class TestOnnxExport(unittest.TestCase): + # This method is a static method of the class + # because there were pytest fixture related + # issues when the method was in the global scope. 
+ @staticmethod + def generate_test_method_for_bad(test_case): + def method(self): + estimator = INSTANCES_FOR_INVALID_INPUT[test_case] + + result = validate_bad_input(self, estimator, test_case) + assert result['exported'] + assert result['throw_expected_error'] + + return method + + def test_pivot_bad_input_type(self): + + df = pd.DataFrame(data=dict( + colA=[1.0], + grainA=["one"] + )) + + xf0 = LagLeadOperator(columns={'colA1': 'colA'}, + grain_columns=['grainA'], + offsets=[-2, -1, 1], + horizon=4) + binarydata = xf0.fit_transform(df, as_binary_data_stream=True) + + xf1 = ForecastingPivot(columns_to_pivot=['colA1']) + + xf1.fit(binarydata) + onnx_path = get_tmp_file('.onnx') + onnx_json_path = get_tmp_file('.onnx.json') + xf1.export_to_onnx(onnx_path, + 'com.microsoft.ml', + dst_json=onnx_json_path, + onnx_version='Stable') + sess = rt.InferenceSession(onnx_path) + + with self.assertRaisesRegex(Exception, "ONNXRuntimeError"): + grainA = np.array(["one"]).reshape(1,1) + colA = np.array([1]).astype(np.double).reshape(1,1) + colA1 = np.array([1, 4, "6", float("NaN"), 2, 5, float("NaN"), float("NaN"), 3, float("NaN"), float("NaN"), 7]).reshape(1,3,4) + pred = sess.run(None, {"grainA":grainA,"colA":colA, "colA1":colA1 }) + + def test_pivot_bad_shape(self): + + df = pd.DataFrame(data=dict( + colA=[1.0], + grainA=["one"] + )) + + xf0 = LagLeadOperator(columns={'colA1': 'colA'}, + grain_columns=['grainA'], + offsets=[-2, -1, 1], + horizon=4) + binarydata = xf0.fit_transform(df, as_binary_data_stream=True) + + xf1 = ForecastingPivot(columns_to_pivot=['colA1']) + + xf1.fit(binarydata) + onnx_path = get_tmp_file('.onnx') + onnx_json_path = get_tmp_file('.onnx.json') + xf1.export_to_onnx(onnx_path, + 'com.microsoft.ml', + dst_json=onnx_json_path, + onnx_version='Stable') + sess = rt.InferenceSession(onnx_path) + + with self.assertRaisesRegex(Exception, "ONNXRuntimeError"): + grainA = np.array(["one"]).reshape(1,1) + colA = np.array([1]).astype(np.double).reshape(1,1) + colA1 = np.array([1, 4, 6, float("NaN"), 2, 5, float("NaN"), float("NaN"), 3, float("NaN"), float("NaN"), 7]).reshape(1,2,6) + pred = sess.run(None, {"grainA":grainA,"colA":colA, "colA1":colA1 }) + + + +for test_case_invalid_input in TEST_CASES_FOR_INVALID_INPUT: + test_name = 'test_%s' % test_case_invalid_input.replace('(', '_').replace(')', '').lower() + + # The following test is for negative time points. On Windows it throws as expected. + # On Mac and Linux negative time points are a valid input. + if test_name == 'test_datetimesplitter_bad_input_data' and (platform.system() == "Darwin" or platform.system() == "Linux"): + continue + + method = TestOnnxExport.generate_test_method_for_bad(test_case_invalid_input) + setattr(TestOnnxExport, test_name, method) + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/tests_extended/test_timeseries_automl.py b/src/python/tests_extended/test_timeseries_automl.py new file mode 100644 index 00000000..a6f6d73c --- /dev/null +++ b/src/python/tests_extended/test_timeseries_automl.py @@ -0,0 +1,203 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License.
+# -------------------------------------------------------------------------------------------- + +import os +import platform +import tempfile +import unittest + +import numpy as np +import pandas as pd +from nimbusml import Pipeline +from nimbusml.preprocessing import ToString, ToKeyImputer, DateTimeSplitter +from nimbusml.preprocessing.schema import ColumnSelector +from nimbusml.timeseries import TimeSeriesImputer, LagLeadOperator, RollingWindow, ForecastingPivot, ShortDrop +from data_frame_tool import DataFrameTool as DFT + +def get_tmp_file(suffix=None): + fd, file_name = tempfile.mkstemp(suffix=suffix) + fl = os.fdopen(fd, 'w') + fl.close() + return file_name + +class TestAutoMLTransforms(unittest.TestCase): + + def test_tostring(self): + data={'f0': [4, 4, -1, 9], + 'f1': [5, 5, 3.1, -0.23], + 'f2': [6, 6.7, np.nan, np.nan]} + data = pd.DataFrame(data).astype({'f0': np.int32, + 'f1': np.float32, + 'f2': np.float64}) + + xf = ToString(columns={'f0.out': 'f0', + 'f1.out': 'f1', + 'f2.out': 'f2'}) + xf.fit(data) + + onnx_path = get_tmp_file('.onnx') + onnx_json_path = get_tmp_file('.onnx.json') + xf.export_to_onnx(onnx_path, + 'com.microsoft.ml', + dst_json=onnx_json_path, + onnx_version='Stable') + + def test_tokeyimputer(self): + text_df = pd.DataFrame( + data=dict( + text=[ + "cat", + "dog", + "fish", + "orange", + "cat orange", + "dog", + "fish", + None, + "spider"])) + + xf = ToKeyImputer() << 'text' + xf.fit(text_df) + + onnx_path = get_tmp_file('.onnx') + onnx_json_path = get_tmp_file('.onnx.json') + xf.export_to_onnx(onnx_path, + 'com.microsoft.ml', + dst_json=onnx_json_path, + onnx_version='Stable') + + def test_datetimesplitter(self): + df = pd.DataFrame(data=dict( + tokens1=[1, 2, 3, 157161600], + tokens2=[10, 11, 12, 13] + )) + + cols_to_drop = [ + 'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter', + 'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear', + 'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel', + 'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff' + ] + + dts = DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1' + xf = Pipeline([dts, ColumnSelector(drop_columns=cols_to_drop)]) + xf.fit(df) + + onnx_path = get_tmp_file('.onnx') + onnx_json_path = get_tmp_file('.onnx.json') + xf.export_to_onnx(onnx_path, + 'com.microsoft.ml', + dst_json=onnx_json_path, + onnx_version='Stable') + + def test_timeseriesimputer(self): + + df = pd.DataFrame(data=dict( + ts=[1, 2, 3, 5], + grain=[1970, 1970, 1970, 1970], + c3=[10, 13, 15, 20], + c4=[19, 12, 16, 19] + )) + + xf = TimeSeriesImputer(time_series_column='ts', + grain_columns=['grain'], + filter_columns=['c3', 'c4'], + impute_mode='ForwardFill', + filter_mode='Include') + xf.fit(df) + + onnx_path = get_tmp_file('.onnx') + onnx_json_path = get_tmp_file('.onnx.json') + xf.export_to_onnx(onnx_path, + 'com.microsoft.ml', + dst_json=onnx_json_path, + onnx_version='Stable') + + def test_shortdrop(self): + + df = pd.DataFrame(data=dict( + ts=[1.0, 3.0, 5.0, 7.0], + grain=['1970', '1970', '1970', '1970'], + )) + + xf = ShortDrop(grain_columns=['grain'], min_rows=4) << 'ts' + xf.fit(df) + + onnx_path = get_tmp_file('.onnx') + onnx_json_path = get_tmp_file('.onnx.json') + xf.export_to_onnx(onnx_path, + 'com.microsoft.ml', + dst_json=onnx_json_path, + onnx_version='Stable') + + def test_rolling_window(self): + + df = pd.DataFrame(data=dict( + ts=[1.0, 3.0, 5.0, 7.0], + grain=['1970', '1970', '1970', '1970'], + )) + + xf = RollingWindow(columns={'ts_r': 'ts'}, + grain_columns=['grain'], + window_calculation='Mean', + 
max_window_size=1, + horizon=2) + xf.fit(df) + + onnx_path = get_tmp_file('.onnx') + onnx_json_path = get_tmp_file('.onnx.json') + xf.export_to_onnx(onnx_path, + 'com.microsoft.ml', + dst_json=onnx_json_path, + onnx_version='Stable') + + def test_pivot(self): + + df = pd.DataFrame(data=dict( + ts=[1.0, 3.0, 5.0, 7.0], + grain=['1970', '1970', '1970', '1970'], + )) + + xf0 = RollingWindow(columns={'ts_r': 'ts'}, + grain_columns=['grain'], + window_calculation='Mean', + max_window_size=1, + horizon=1) + + xf1 = ForecastingPivot(columns_to_pivot=['ts_r']) + + xf = Pipeline([xf0, xf1]) + + xf.fit(df) + + onnx_path = get_tmp_file('.onnx') + onnx_json_path = get_tmp_file('.onnx.json') + xf.export_to_onnx(onnx_path, + 'com.microsoft.ml', + dst_json=onnx_json_path, + onnx_version='Stable') + + def test_lag(self): + + df = pd.DataFrame(data=dict( + ts=[1.0, 3.0, 5.0, 7.0], + grain=['1970', '1970', '1970', '1970'], + )) + + xf = LagLeadOperator(columns={'ts_r': 'ts'}, + grain_columns=['grain'], + offsets=[-1, 1], + horizon=1) + + xf.fit(df) + onnx_path = get_tmp_file('.onnx') + onnx_json_path = get_tmp_file('.onnx.json') + xf.export_to_onnx(onnx_path, + 'com.microsoft.ml', + dst_json=onnx_json_path, + onnx_version='Stable') + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/tools/manifest.json b/src/python/tools/manifest.json index fd7f7950..48947d1a 100644 --- a/src/python/tools/manifest.json +++ b/src/python/tools/manifest.json @@ -19237,6 +19237,66 @@ "ITransformOutput" ] }, + { + "Name": "Transforms.ForecastingPivot", + "Desc": "Pivots the input columns and drops any rows with N/A", + "FriendlyName": "ForecastingPivot", + "ShortName": "fpivot", + "Inputs": [ + { + "Name": "ColumnsToPivot", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "List of columns to pivot", + "Aliases": [ + "cols" + ], + "Required": true, + "SortOrder": 0.0, + "IsNullable": false + }, + { + "Name": "HorizonColumnName", + "Type": "String", + "Desc": "Name of the horizon column generated.", + "Aliases": [ + "hor" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": "Horizon" + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, { "Name": "Transforms.GlobalContrastNormalizer", "Desc": "Performs a global contrast normalization on input values: Y = (s * X - M) / D, where s is a scale, M is mean and D is either L2 norm or standard deviation.", @@ -20466,6 +20526,122 @@ "ITransformOutput" ] }, + { + "Name": "Transforms.LagLeadOperator", + "Desc": "Uses the offset list with the horizon to create lags and leads", + "FriendlyName": "LagLeadOperator", + "ShortName": "LagLead", + "Inputs": [ + { + "Name": "GrainColumns", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "List of grain columns", + "Aliases": [ + "grains" + ], + "Required": true, + "SortOrder": 0.0, + "IsNullable": false + }, + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null
+ }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition (optional form: name:src)", + "Aliases": [ + "col" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Horizon", + "Type": "UInt", + "Desc": "Maximum horizon value", + "Aliases": [ + "hor" + ], + "Required": true, + "SortOrder": 2.0, + "IsNullable": false, + "Default": 0 + }, + { + "Name": "Offsets", + "Type": { + "Kind": "Array", + "ItemType": "Int" + }, + "Desc": "Lag and Lead offset to use. A negative number is a lag, positive is a lead", + "Aliases": [ + "off" + ], + "Required": true, + "SortOrder": 3.0, + "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, { "Name": "Transforms.LightLda", "Desc": "The LDA transform implements LightLDA, a state-of-the-art implementation of Latent Dirichlet Allocation.", @@ -22840,6 +23016,151 @@ "ITransformOutput" ] }, + { + "Name": "Transforms.RollingWindow", + "Desc": "Performs a calculation over a rolling timeseries window", + "FriendlyName": "Rolling Window Featurizer", + "ShortName": "RollingWindow", + "Inputs": [ + { + "Name": "GrainColumns", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "List of grain columns", + "Aliases": [ + "grains" + ], + "Required": true, + "SortOrder": 0.0, + "IsNullable": false + }, + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition (optional form: name:src)", + "Aliases": [ + "col" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Horizon", + "Type": "UInt", + "Desc": "Maximum horizon value", + "Aliases": [ + "hor" + ], + "Required": true, + "SortOrder": 2.0, + "IsNullable": false, + "Default": 0 + }, + { + "Name": "MaxWindowSize", + "Type": "UInt", + "Desc": "Maximum window size", + "Aliases": [ + "maxsize" + ], + "Required": true, + "SortOrder": 3.0, + "IsNullable": false, + "Default": 0 + }, + { + "Name": "MinWindowSize", + "Type": "UInt", + "Desc": "Minimum window size", + "Aliases": [ + "minsize" + ], + "Required": true, + "SortOrder": 4.0, + "IsNullable": false, + "Default": 1 + }, + { + "Name": "WindowCalculation", + "Type": { + "Kind": "Enum", + "Values": [ + "Mean", + "Min", + "Max" + ] + }, + "Desc": "What window calculation to use", + "Aliases": [ + "calc" + ], + "Required": true, + "SortOrder": 5.0, + "IsNullable": false, + "Default": "0" + } + ], + "Outputs": [ 
+ { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, { "Name": "Transforms.RowRangeFilter", "Desc": "Filters a dataview on a column of type Single, Double or Key (contiguous). Keeps the values that are in the specified min/max range. NaNs are always filtered out. If the input is a Key type, the min/max are considered percentages of the number of values.", @@ -23279,6 +23600,66 @@ "ITransformOutput" ] }, + { + "Name": "Transforms.ShortDrop", + "Desc": "Drops rows if there aren't enough values per grain.", + "FriendlyName": "ShortDrop", + "ShortName": "sgd", + "Inputs": [ + { + "Name": "GrainColumns", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "List of grain columns", + "Aliases": [ + "grains" + ], + "Required": true, + "SortOrder": 0.0, + "IsNullable": false + }, + { + "Name": "MinRows", + "Type": "UInt", + "Desc": "Minimum number of values required", + "Aliases": [ + "minr" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false, + "Default": 0 + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, { "Name": "Transforms.TensorFlowScorer", "Desc": "Transforms the data using the TensorFlow model.", diff --git a/src/python/tools/manifest_diff.json b/src/python/tools/manifest_diff.json index a70489ee..761ea70c 100644 --- a/src/python/tools/manifest_diff.json +++ b/src/python/tools/manifest_diff.json @@ -226,6 +226,30 @@ "XGBoost.TrainRegression" ], "EntryPoints": [ + { + "Name": "Transforms.RollingWindow", + "NewName": "RollingWindow", + "Module": "timeseries", + "Type": "Transform" + }, + { + "Name": "Transforms.LagLeadOperator", + "NewName": "LagLeadOperator", + "Module": "timeseries", + "Type": "Transform" + }, + { + "Name": "Transforms.ShortDrop", + "NewName": "ShortDrop", + "Module": "timeseries", + "Type": "Transform" + }, + { + "Name": "Transforms.ForecastingPivot", + "NewName": "ForecastingPivot", + "Module": "timeseries", + "Type": "Transform" + }, { "Name": "Trainers.AveragedPerceptronBinaryClassifier", "NewName": "AveragedPerceptronBinaryClassifier",