diff --git a/build.cmd b/build.cmd
index 95fc02d3..1cf885aa 100644
--- a/build.cmd
+++ b/build.cmd
@@ -399,7 +399,7 @@ if "%InstallPythonPackages%" == "True" (
call "%PythonExe%" -m pip install --upgrade pyzmq
) else (
call "%PythonExe%" -m pip install --upgrade "azureml-dataprep>=1.1.33"
- call "%PythonExe%" -m pip install --upgrade onnxruntime
+ call "%PythonExe%" -m pip install --upgrade --extra-index-url https://test.pypi.org/simple/ ort-nightly-featurizer
)
call "%PythonExe%" -m pip install --upgrade "%__currentScriptDir%target\%WheelFile%"
diff --git a/build.sh b/build.sh
index 6ec53a75..97764b5a 100755
--- a/build.sh
+++ b/build.sh
@@ -283,6 +283,7 @@ then
echo "#################################"
echo "Installing Python packages ... "
echo "#################################"
+
Wheel=${__currentScriptDir}/target/nimbusml-${ProductVersion}-${PythonTag}-none-${PlatName}.whl
if [ ! -f ${Wheel} ]
then
@@ -301,7 +302,7 @@ then
fi
"${PythonExe}" -m pip install --upgrade "azureml-dataprep>=1.1.33"
- "${PythonExe}" -m pip install --upgrade onnxruntime
+ "${PythonExe}" -m pip install --upgrade --extra-index-url https://test.pypi.org/simple/ ort-nightly-featurizer
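+        # ort-nightly-featurizer (from test.pypi.org) is assumed to be a nightly
+        # onnxruntime build that carries the featurizer kernels these new transforms need.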
fi
"${PythonExe}" -m pip install --upgrade "${Wheel}"
"${PythonExe}" -m pip install "scikit-learn==0.19.2"
diff --git a/nuget.config b/nuget.config
index 9265d7b5..63e7a3be 100644
--- a/nuget.config
+++ b/nuget.config
@@ -5,5 +5,6 @@
+
diff --git a/src/DotNetBridge/DotNetBridge.csproj b/src/DotNetBridge/DotNetBridge.csproj
index 31c27043..21121c02 100644
--- a/src/DotNetBridge/DotNetBridge.csproj
+++ b/src/DotNetBridge/DotNetBridge.csproj
@@ -32,21 +32,21 @@
all
runtime; build; native; contentfiles; analyzers
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
-
-
-
-
-
-
+
+
+
+
+
+
diff --git a/src/Platforms/build.csproj b/src/Platforms/build.csproj
index 8f58c7f0..d4c3dcb1 100644
--- a/src/Platforms/build.csproj
+++ b/src/Platforms/build.csproj
@@ -21,21 +21,21 @@
-
-
-
-
-
-
-
+
+
+
+
+
+
+
-
-
-
-
-
-
-
+
+
+
+
+
+
+
diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj
index 2deae4ab..3433a902 100644
--- a/src/python/nimbusml.pyproj
+++ b/src/python/nimbusml.pyproj
@@ -92,6 +92,10 @@
+
+
+
+
@@ -332,8 +336,12 @@
+
+
+
+
@@ -422,6 +430,8 @@
+
+
@@ -468,6 +478,7 @@
+
@@ -476,6 +487,7 @@
+
@@ -741,14 +753,22 @@
+
+
+
+
+
+
+
+
@@ -756,6 +776,10 @@
+
+
+
+
diff --git a/src/python/nimbusml/examples/ForecastingPivot.py b/src/python/nimbusml/examples/ForecastingPivot.py
new file mode 100644
index 00000000..b1592fbf
--- /dev/null
+++ b/src/python/nimbusml/examples/ForecastingPivot.py
@@ -0,0 +1,31 @@
+###############################################################################
+# ForecastingPivot
+import pandas as pd
+import numpy as np
+from nimbusml import FileDataStream, Pipeline
+from nimbusml.datasets import get_dataset
+from nimbusml.timeseries import ForecastingPivot, RollingWindow
+
+# data input (as a FileDataStream)
+path = get_dataset('infert').as_filepath()
+data = FileDataStream.read_csv(path, sep=',', numeric_dtype=np.double)
+
+# transform usage
+xf = RollingWindow(columns={'age_1': 'age'},
+ grain_columns=['education'],
+ window_calculation='Mean',
+ max_window_size=1,
+ horizon=1)
+
+xf1 = ForecastingPivot(columns_to_pivot=['age_1'])
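+
+# RollingWindow emits a vector-valued 'age_1' column with one value per horizon
+# position; ForecastingPivot expands that vector back into scalar rows (adding a
+# generated 'Horizon' column) and drops any rows containing NaNs.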
+
+pipe = Pipeline([xf, xf1])
+
+# fit and transform
+features = pipe.fit_transform(data)
+
+features = features.drop(['row_num', 'education', 'parity', 'induced',
+ 'case', 'spontaneous', 'stratum', 'pooled.stratum'], axis=1)
+
+# print features
+print(features.head(100))
diff --git a/src/python/nimbusml/examples/LagLeadOperator.py b/src/python/nimbusml/examples/LagLeadOperator.py
new file mode 100644
index 00000000..9424b2aa
--- /dev/null
+++ b/src/python/nimbusml/examples/LagLeadOperator.py
@@ -0,0 +1,26 @@
+###############################################################################
+# LagLeadOperator
+import pandas as pd
+import numpy as np
+from nimbusml import FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.timeseries import LagLeadOperator
+
+# data input (as a FileDataStream)
+path = get_dataset('infert').as_filepath()
+data = FileDataStream.read_csv(path, sep=',', numeric_dtype=np.double)
+
+# transform usage
+xf = LagLeadOperator(columns={'age_1': 'age'},
+ grain_columns=['education'],
+ offsets=[-3, 1],
+ horizon=1)
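+
+# Each offset becomes an output column: -3 is a lag of three steps back within
+# each 'education' grain, +1 is a lead of one step ahead; rows near the edges of
+# a series come back as NaN.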
+
+# fit and transform
+features = xf.fit_transform(data)
+
+features = features.drop(['row_num', 'education', 'parity', 'induced',
+ 'case', 'spontaneous', 'stratum', 'pooled.stratum'], axis=1)
+
+# print features
+print(features.head(100))
diff --git a/src/python/nimbusml/examples/RollingWindow.py b/src/python/nimbusml/examples/RollingWindow.py
new file mode 100644
index 00000000..8fe9100e
--- /dev/null
+++ b/src/python/nimbusml/examples/RollingWindow.py
@@ -0,0 +1,27 @@
+###############################################################################
+# RollingWindow
+import pandas as pd
+import numpy as np
+from nimbusml import FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.timeseries import RollingWindow
+
+# data input (as a FileDataStream)
+path = get_dataset('infert').as_filepath()
+data = FileDataStream.read_csv(path, sep=',', numeric_dtype=np.double)
+
+# transform usage
+xf = RollingWindow(columns={'age_1': 'age'},
+ grain_columns=['education'],
+ window_calculation='Mean',
+ max_window_size=2,
+ horizon=2)
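+
+# With window_calculation='Mean', max_window_size=2 and horizon=2, each row gets
+# one output per horizon position: the mean over a window of up to two prior
+# values ending two steps back, and the same window ending one step back.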
+
+# fit and transform
+features = xf.fit_transform(data)
+
+features = features.drop(['row_num', 'education', 'parity', 'induced',
+ 'case', 'spontaneous', 'stratum', 'pooled.stratum'], axis=1)
+
+# print features
+print(features.head(100))
diff --git a/src/python/nimbusml/examples/ShortDrop.py b/src/python/nimbusml/examples/ShortDrop.py
new file mode 100644
index 00000000..dd8882ab
--- /dev/null
+++ b/src/python/nimbusml/examples/ShortDrop.py
@@ -0,0 +1,23 @@
+###############################################################################
+# ShortDrop
+import pandas as pd
+import numpy as np
+from nimbusml import FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.timeseries import ShortDrop
+
+# data input (as a FileDataStream)
+path = get_dataset('infert').as_filepath()
+data = FileDataStream.read_csv(path, sep=',', numeric_dtype=np.double)
+
+# transform usage
+xf = ShortDrop(grain_columns=['education'], min_rows=4294967294) << 'age'
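+
+# min_rows here is close to the uint32 maximum, so no grain has enough rows and
+# everything is dropped; a small threshold (e.g. min_rows=4) would keep grains
+# with at least that many rows.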
+
+# fit and transform
+features = xf.fit_transform(data)
+
+features = features.drop(['row_num', 'education', 'parity', 'induced',
+ 'case', 'spontaneous', 'stratum', 'pooled.stratum'], axis=1)
+
+# print features
+print(features.head(100))
diff --git a/src/python/nimbusml/internal/core/timeseries/forecastingpivot.py b/src/python/nimbusml/internal/core/timeseries/forecastingpivot.py
new file mode 100644
index 00000000..c306f5bd
--- /dev/null
+++ b/src/python/nimbusml/internal/core/timeseries/forecastingpivot.py
@@ -0,0 +1,55 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+ForecastingPivot
+"""
+
+__all__ = ["ForecastingPivot"]
+
+
+from ...entrypoints.transforms_forecastingpivot import \
+ transforms_forecastingpivot
+from ...utils.utils import trace
+from ..base_pipeline_item import BasePipelineItem, DefaultSignature
+
+
+class ForecastingPivot(BasePipelineItem, DefaultSignature):
+ """
+ **Description**
+    Pivots the input columns and drops any rows with N/A
+
+ :param columns_to_pivot: List of columns to pivot.
+
+ :param horizon_column_name: Name of the horizon column generated.
+
+ :param params: Additional arguments sent to compute engine.
+
+ """
+
+ @trace
+ def __init__(
+ self,
+ columns_to_pivot,
+ horizon_column_name='Horizon',
+ **params):
+ BasePipelineItem.__init__(
+ self, type='transform', **params)
+
+ self.columns_to_pivot = columns_to_pivot
+ self.horizon_column_name = horizon_column_name
+
+ @property
+ def _entrypoint(self):
+ return transforms_forecastingpivot
+
+ @trace
+ def _get_node(self, **all_args):
+ algo_args = dict(
+ columns_to_pivot=self.columns_to_pivot,
+ horizon_column_name=self.horizon_column_name)
+
+ all_args.update(algo_args)
+ return self._entrypoint(**all_args)
diff --git a/src/python/nimbusml/internal/core/timeseries/lagleadoperator.py b/src/python/nimbusml/internal/core/timeseries/lagleadoperator.py
new file mode 100644
index 00000000..991963ef
--- /dev/null
+++ b/src/python/nimbusml/internal/core/timeseries/lagleadoperator.py
@@ -0,0 +1,100 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+LagLeadOperator
+"""
+
+__all__ = ["LagLeadOperator"]
+
+
+from ...entrypoints.transforms_lagleadoperator import \
+ transforms_lagleadoperator
+from ...utils.utils import trace
+from ..base_pipeline_item import BasePipelineItem, DefaultSignature
+
+
+class LagLeadOperator(BasePipelineItem, DefaultSignature):
+ """
+ **Description**
+ Uses the offset list with the horizon to create lags and leads
+
+ :param grain_columns: List of grain columns.
+
+ :param horizon: Maximum horizon value.
+
+    :param offsets: Lag and lead offsets to use. A negative number is a lag,
+        a positive number is a lead.
+
+ :param params: Additional arguments sent to compute engine.
+
+ """
+
+ @trace
+ def __init__(
+ self,
+ grain_columns,
+ offsets,
+ horizon=0,
+ **params):
+ BasePipelineItem.__init__(
+ self, type='transform', **params)
+
+ self.grain_columns = grain_columns
+ self.offsets = offsets
+ self.horizon = horizon
+
+ @property
+ def _entrypoint(self):
+ return transforms_lagleadoperator
+
+ @trace
+ def _get_node(self, **all_args):
+
+ input_columns = self.input
+ if input_columns is None and 'input' in all_args:
+ input_columns = all_args['input']
+ if 'input' in all_args:
+ all_args.pop('input')
+
+ output_columns = self.output
+ if output_columns is None and 'output' in all_args:
+ output_columns = all_args['output']
+ if 'output' in all_args:
+ all_args.pop('output')
+
+ # validate input
+ if input_columns is None:
+ raise ValueError(
+ "'None' input passed when it cannot be none.")
+
+ if not isinstance(input_columns, list):
+ raise ValueError(
+ "input has to be a list of strings, instead got %s" %
+ type(input_columns))
+
+ # validate output
+ if output_columns is None:
+ output_columns = input_columns
+
+ if not isinstance(output_columns, list):
+ raise ValueError(
+ "output has to be a list of strings, instead got %s" %
+ type(output_columns))
+
+ algo_args = dict(
+ column=[
+ dict(
+ Source=i,
+ Name=o) for i,
+ o in zip(
+ input_columns,
+ output_columns)] if input_columns else None,
+ grain_columns=self.grain_columns,
+ offsets=self.offsets,
+ horizon=self.horizon)
+
+ all_args.update(algo_args)
+ return self._entrypoint(**all_args)
diff --git a/src/python/nimbusml/internal/core/timeseries/rollingwindow.py b/src/python/nimbusml/internal/core/timeseries/rollingwindow.py
new file mode 100644
index 00000000..3240bf6a
--- /dev/null
+++ b/src/python/nimbusml/internal/core/timeseries/rollingwindow.py
@@ -0,0 +1,108 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+RollingWindow
+"""
+
+__all__ = ["RollingWindow"]
+
+
+from ...entrypoints.transforms_rollingwindow import transforms_rollingwindow
+from ...utils.utils import trace
+from ..base_pipeline_item import BasePipelineItem, DefaultSignature
+
+
+class RollingWindow(BasePipelineItem, DefaultSignature):
+ """
+ **Description**
+ Performs a calculation over a rolling timeseries window
+
+ :param grain_columns: List of grain columns.
+
+ :param horizon: Maximum horizon value.
+
+ :param max_window_size: Maximum window size.
+
+ :param min_window_size: Minimum window size.
+
+ :param window_calculation: What window calculation to use.
+
+ :param params: Additional arguments sent to compute engine.
+
+ """
+
+ @trace
+ def __init__(
+ self,
+ grain_columns,
+ horizon=0,
+ max_window_size=0,
+ min_window_size=1,
+ window_calculation='0',
+ **params):
+ BasePipelineItem.__init__(
+ self, type='transform', **params)
+
+ self.grain_columns = grain_columns
+ self.horizon = horizon
+ self.max_window_size = max_window_size
+ self.min_window_size = min_window_size
+ self.window_calculation = window_calculation
+
+ @property
+ def _entrypoint(self):
+ return transforms_rollingwindow
+
+ @trace
+ def _get_node(self, **all_args):
+
+ input_columns = self.input
+ if input_columns is None and 'input' in all_args:
+ input_columns = all_args['input']
+ if 'input' in all_args:
+ all_args.pop('input')
+
+ output_columns = self.output
+ if output_columns is None and 'output' in all_args:
+ output_columns = all_args['output']
+ if 'output' in all_args:
+ all_args.pop('output')
+
+ # validate input
+ if input_columns is None:
+ raise ValueError(
+ "'None' input passed when it cannot be none.")
+
+ if not isinstance(input_columns, list):
+ raise ValueError(
+ "input has to be a list of strings, instead got %s" %
+ type(input_columns))
+
+ # validate output
+ if output_columns is None:
+ output_columns = input_columns
+
+ if not isinstance(output_columns, list):
+ raise ValueError(
+ "output has to be a list of strings, instead got %s" %
+ type(output_columns))
+
+ algo_args = dict(
+ column=[
+ dict(
+ Source=i,
+ Name=o) for i,
+ o in zip(
+ input_columns,
+ output_columns)] if input_columns else None,
+ grain_columns=self.grain_columns,
+ horizon=self.horizon,
+ max_window_size=self.max_window_size,
+ min_window_size=self.min_window_size,
+ window_calculation=self.window_calculation)
+
+ all_args.update(algo_args)
+ return self._entrypoint(**all_args)
diff --git a/src/python/nimbusml/internal/core/timeseries/shortdrop.py b/src/python/nimbusml/internal/core/timeseries/shortdrop.py
new file mode 100644
index 00000000..30313d7d
--- /dev/null
+++ b/src/python/nimbusml/internal/core/timeseries/shortdrop.py
@@ -0,0 +1,54 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+ShortDrop
+"""
+
+__all__ = ["ShortDrop"]
+
+
+from ...entrypoints.transforms_shortdrop import transforms_shortdrop
+from ...utils.utils import trace
+from ..base_pipeline_item import BasePipelineItem, DefaultSignature
+
+
+class ShortDrop(BasePipelineItem, DefaultSignature):
+ """
+ **Description**
+ Drops rows if there aren't enough values per grain.
+
+ :param grain_columns: List of grain columns.
+
+ :param min_rows: Minimum number of values required.
+
+ :param params: Additional arguments sent to compute engine.
+
+ """
+
+ @trace
+ def __init__(
+ self,
+ grain_columns,
+ min_rows=0,
+ **params):
+ BasePipelineItem.__init__(
+ self, type='transform', **params)
+
+ self.grain_columns = grain_columns
+ self.min_rows = min_rows
+
+ @property
+ def _entrypoint(self):
+ return transforms_shortdrop
+
+ @trace
+ def _get_node(self, **all_args):
+ algo_args = dict(
+ grain_columns=self.grain_columns,
+ min_rows=self.min_rows)
+
+ all_args.update(algo_args)
+ return self._entrypoint(**all_args)
diff --git a/src/python/nimbusml/internal/entrypoints/transforms_forecastingpivot.py b/src/python/nimbusml/internal/entrypoints/transforms_forecastingpivot.py
new file mode 100644
index 00000000..fee7b30d
--- /dev/null
+++ b/src/python/nimbusml/internal/entrypoints/transforms_forecastingpivot.py
@@ -0,0 +1,73 @@
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+Transforms.ForecastingPivot
+"""
+
+
+from ..utils.entrypoints import EntryPoint
+from ..utils.utils import try_set, unlist
+
+
+def transforms_forecastingpivot(
+ columns_to_pivot,
+ data,
+ output_data=None,
+ model=None,
+ horizon_column_name='Horizon',
+ **params):
+ """
+ **Description**
+    Pivots the input columns and drops any rows with N/A
+
+ :param columns_to_pivot: List of columns to pivot (inputs).
+ :param horizon_column_name: Name of the horizon column generated.
+ (inputs).
+ :param data: Input dataset (inputs).
+ :param output_data: Transformed dataset (outputs).
+ :param model: Transform model (outputs).
+ """
+
+ entrypoint_name = 'Transforms.ForecastingPivot'
+ inputs = {}
+ outputs = {}
+
+ if columns_to_pivot is not None:
+ inputs['ColumnsToPivot'] = try_set(
+ obj=columns_to_pivot,
+ none_acceptable=False,
+ is_of_type=list,
+ is_column=True)
+ if horizon_column_name is not None:
+ inputs['HorizonColumnName'] = try_set(
+ obj=horizon_column_name,
+ none_acceptable=True,
+ is_of_type=str,
+ is_column=True)
+ if data is not None:
+ inputs['Data'] = try_set(
+ obj=data,
+ none_acceptable=False,
+ is_of_type=str)
+ if output_data is not None:
+ outputs['OutputData'] = try_set(
+ obj=output_data,
+ none_acceptable=False,
+ is_of_type=str)
+ if model is not None:
+ outputs['Model'] = try_set(
+ obj=model,
+ none_acceptable=False,
+ is_of_type=str)
+
+ input_variables = {
+ x for x in unlist(inputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+ output_variables = {
+ x for x in unlist(outputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+
+ entrypoint = EntryPoint(
+ name=entrypoint_name, inputs=inputs, outputs=outputs,
+ input_variables=input_variables,
+ output_variables=output_variables)
+ return entrypoint
diff --git a/src/python/nimbusml/internal/entrypoints/transforms_lagleadoperator.py b/src/python/nimbusml/internal/entrypoints/transforms_lagleadoperator.py
new file mode 100644
index 00000000..88f63edd
--- /dev/null
+++ b/src/python/nimbusml/internal/entrypoints/transforms_lagleadoperator.py
@@ -0,0 +1,89 @@
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+Transforms.LagLeadOperator
+"""
+
+import numbers
+
+from ..utils.entrypoints import EntryPoint
+from ..utils.utils import try_set, unlist
+
+
+def transforms_lagleadoperator(
+ grain_columns,
+ column,
+ data,
+ offsets,
+ output_data=None,
+ model=None,
+ horizon=0,
+ **params):
+ """
+ **Description**
+ Uses the offset list with the horizon to create lags and leads
+
+ :param grain_columns: List of grain columns (inputs).
+ :param column: New column definition (optional form: name:src)
+ (inputs).
+ :param data: Input dataset (inputs).
+ :param horizon: Maximum horizon value (inputs).
+    :param offsets: Lag and lead offsets to use. A negative number is
+        a lag, a positive number is a lead (inputs).
+ :param output_data: Transformed dataset (outputs).
+ :param model: Transform model (outputs).
+ """
+
+ entrypoint_name = 'Transforms.LagLeadOperator'
+ inputs = {}
+ outputs = {}
+
+ if grain_columns is not None:
+ inputs['GrainColumns'] = try_set(
+ obj=grain_columns,
+ none_acceptable=False,
+ is_of_type=list,
+ is_column=True)
+ if column is not None:
+ inputs['Column'] = try_set(
+ obj=column,
+ none_acceptable=False,
+ is_of_type=list,
+ is_column=True)
+ if data is not None:
+ inputs['Data'] = try_set(
+ obj=data,
+ none_acceptable=False,
+ is_of_type=str)
+ if horizon is not None:
+ inputs['Horizon'] = try_set(
+ obj=horizon,
+ none_acceptable=False,
+ is_of_type=numbers.Real)
+ if offsets is not None:
+ inputs['Offsets'] = try_set(
+ obj=offsets,
+ none_acceptable=False,
+ is_of_type=list)
+ if output_data is not None:
+ outputs['OutputData'] = try_set(
+ obj=output_data,
+ none_acceptable=False,
+ is_of_type=str)
+ if model is not None:
+ outputs['Model'] = try_set(
+ obj=model,
+ none_acceptable=False,
+ is_of_type=str)
+
+ input_variables = {
+ x for x in unlist(inputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+ output_variables = {
+ x for x in unlist(outputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+
+ entrypoint = EntryPoint(
+ name=entrypoint_name, inputs=inputs, outputs=outputs,
+ input_variables=input_variables,
+ output_variables=output_variables)
+ return entrypoint
diff --git a/src/python/nimbusml/internal/entrypoints/transforms_rollingwindow.py b/src/python/nimbusml/internal/entrypoints/transforms_rollingwindow.py
new file mode 100644
index 00000000..fb1eec3a
--- /dev/null
+++ b/src/python/nimbusml/internal/entrypoints/transforms_rollingwindow.py
@@ -0,0 +1,107 @@
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+Transforms.RollingWindow
+"""
+
+import numbers
+
+from ..utils.entrypoints import EntryPoint
+from ..utils.utils import try_set, unlist
+
+
+def transforms_rollingwindow(
+ grain_columns,
+ column,
+ data,
+ output_data=None,
+ model=None,
+ horizon=0,
+ max_window_size=0,
+ min_window_size=1,
+ window_calculation='0',
+ **params):
+ """
+ **Description**
+ Performs a calculation over a rolling timeseries window
+
+ :param grain_columns: List of grain columns (inputs).
+ :param column: New column definition (optional form: name:src)
+ (inputs).
+ :param data: Input dataset (inputs).
+ :param horizon: Maximum horizon value (inputs).
+ :param max_window_size: Maximum window size (inputs).
+ :param min_window_size: Minimum window size (inputs).
+ :param window_calculation: What window calculation to use
+ (inputs).
+ :param output_data: Transformed dataset (outputs).
+ :param model: Transform model (outputs).
+ """
+
+ entrypoint_name = 'Transforms.RollingWindow'
+ inputs = {}
+ outputs = {}
+
+ if grain_columns is not None:
+ inputs['GrainColumns'] = try_set(
+ obj=grain_columns,
+ none_acceptable=False,
+ is_of_type=list,
+ is_column=True)
+ if column is not None:
+ inputs['Column'] = try_set(
+ obj=column,
+ none_acceptable=False,
+ is_of_type=list,
+ is_column=True)
+ if data is not None:
+ inputs['Data'] = try_set(
+ obj=data,
+ none_acceptable=False,
+ is_of_type=str)
+ if horizon is not None:
+ inputs['Horizon'] = try_set(
+ obj=horizon,
+ none_acceptable=False,
+ is_of_type=numbers.Real)
+ if max_window_size is not None:
+ inputs['MaxWindowSize'] = try_set(
+ obj=max_window_size,
+ none_acceptable=False,
+ is_of_type=numbers.Real)
+ if min_window_size is not None:
+ inputs['MinWindowSize'] = try_set(
+ obj=min_window_size,
+ none_acceptable=False,
+ is_of_type=numbers.Real)
+ if window_calculation is not None:
+ inputs['WindowCalculation'] = try_set(
+ obj=window_calculation,
+ none_acceptable=False,
+ is_of_type=str,
+ values=[
+ 'Mean',
+ 'Min',
+ 'Max'])
+ if output_data is not None:
+ outputs['OutputData'] = try_set(
+ obj=output_data,
+ none_acceptable=False,
+ is_of_type=str)
+ if model is not None:
+ outputs['Model'] = try_set(
+ obj=model,
+ none_acceptable=False,
+ is_of_type=str)
+
+ input_variables = {
+ x for x in unlist(inputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+ output_variables = {
+ x for x in unlist(outputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+
+ entrypoint = EntryPoint(
+ name=entrypoint_name, inputs=inputs, outputs=outputs,
+ input_variables=input_variables,
+ output_variables=output_variables)
+ return entrypoint
diff --git a/src/python/nimbusml/internal/entrypoints/transforms_shortdrop.py b/src/python/nimbusml/internal/entrypoints/transforms_shortdrop.py
new file mode 100644
index 00000000..8be2424d
--- /dev/null
+++ b/src/python/nimbusml/internal/entrypoints/transforms_shortdrop.py
@@ -0,0 +1,72 @@
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+Transforms.ShortDrop
+"""
+
+import numbers
+
+from ..utils.entrypoints import EntryPoint
+from ..utils.utils import try_set, unlist
+
+
+def transforms_shortdrop(
+ grain_columns,
+ data,
+ output_data=None,
+ model=None,
+ min_rows=0,
+ **params):
+ """
+ **Description**
+ Drops rows if there aren't enough values per grain.
+
+ :param grain_columns: List of grain columns (inputs).
+ :param min_rows: Minimum number of values required (inputs).
+ :param data: Input dataset (inputs).
+ :param output_data: Transformed dataset (outputs).
+ :param model: Transform model (outputs).
+ """
+
+ entrypoint_name = 'Transforms.ShortDrop'
+ inputs = {}
+ outputs = {}
+
+ if grain_columns is not None:
+ inputs['GrainColumns'] = try_set(
+ obj=grain_columns,
+ none_acceptable=False,
+ is_of_type=list,
+ is_column=True)
+ if min_rows is not None:
+ inputs['MinRows'] = try_set(
+ obj=min_rows,
+ none_acceptable=False,
+ is_of_type=numbers.Real)
+ if data is not None:
+ inputs['Data'] = try_set(
+ obj=data,
+ none_acceptable=False,
+ is_of_type=str)
+ if output_data is not None:
+ outputs['OutputData'] = try_set(
+ obj=output_data,
+ none_acceptable=False,
+ is_of_type=str)
+ if model is not None:
+ outputs['Model'] = try_set(
+ obj=model,
+ none_acceptable=False,
+ is_of_type=str)
+
+ input_variables = {
+ x for x in unlist(inputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+ output_variables = {
+ x for x in unlist(outputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+
+ entrypoint = EntryPoint(
+ name=entrypoint_name, inputs=inputs, outputs=outputs,
+ input_variables=input_variables,
+ output_variables=output_variables)
+ return entrypoint
diff --git a/src/python/nimbusml/tests/timeseries/test_forecastingpivot.py b/src/python/nimbusml/tests/timeseries/test_forecastingpivot.py
new file mode 100644
index 00000000..cde9f3ce
--- /dev/null
+++ b/src/python/nimbusml/tests/timeseries/test_forecastingpivot.py
@@ -0,0 +1,40 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+
+import platform
+import unittest
+
+import numpy as np
+import pandas as pd
+from nimbusml import Pipeline
+from nimbusml.timeseries import ForecastingPivot, RollingWindow
+
+# BUGS
+# Removes NaN values? Record 0 is removed
+@unittest.skipIf('centos' in platform.linux_distribution()[0].lower(), "centos is not supported")
+class TestForecastingPivot(unittest.TestCase):
+
+ def test_simple_pivot(self):
+
+ df = pd.DataFrame(data=dict(
+ ts=[1.0, 3.0, 5.0, 7.0],
+ grain=['1970', '1970', '1970', '1970'],
+ ))
+
+ rw = RollingWindow(columns={'ts_r': 'ts'},
+ grain_columns=['grain'],
+ window_calculation='Mean',
+ max_window_size=1,
+ horizon=1)
+
+ xf1 = ForecastingPivot(columns_to_pivot=['ts_r'])
+
+ pipe = Pipeline([rw, xf1])
+
+ result = pipe.fit_transform(df)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/python/nimbusml/tests/timeseries/test_lagleadoperator.py b/src/python/nimbusml/tests/timeseries/test_lagleadoperator.py
new file mode 100644
index 00000000..f0609c1b
--- /dev/null
+++ b/src/python/nimbusml/tests/timeseries/test_lagleadoperator.py
@@ -0,0 +1,86 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+
+import platform
+import unittest
+
+import math
+import numpy as np
+import pandas as pd
+from nimbusml.timeseries import LagLeadOperator
+
+
+@unittest.skipIf('centos' in platform.linux_distribution()[0].lower(), "centos is not supported")
+class TestLagLeadOperator(unittest.TestCase):
+
+ def test_no_lag(self):
+
+ df = pd.DataFrame(data=dict(
+ ts=[1.0, 3.0, 5.0, 7.0],
+ grain=['1970', '1970', '1970', '1970'],
+ ))
+
+ ll = LagLeadOperator(columns={'ts_r': 'ts'},
+ grain_columns=['grain'],
+ offsets=[0],
+ horizon=1)
+
+ result = ll.fit_transform(df)
+
+ self.assertEqual(result.loc[0, 'ts_r'], 1)
+ self.assertEqual(result.loc[1, 'ts_r'], 3)
+ self.assertEqual(result.loc[2, 'ts_r'], 5)
+ self.assertEqual(result.loc[3, 'ts_r'], 7)
+
+ def test_simple_horizon(self):
+
+ df = pd.DataFrame(data=dict(
+ ts=[1.0, 3.0, 5.0, 7.0],
+ grain=['1970', '1970', '1970', '1970'],
+ ))
+
+ ll = LagLeadOperator(columns={'ts_r': 'ts'},
+ grain_columns=['grain'],
+ offsets=[0],
+ horizon=2)
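+
+        # horizon=2 turns the output into a vector, materialized as one column
+        # per horizon position: for offset 0, 'ts_r.0' holds the previous value
+        # and 'ts_r.1' the current one, as asserted below.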
+
+ result = ll.fit_transform(df)
+
+ self.assertTrue(math.isnan(result.loc[0, 'ts_r.0']))
+ self.assertEqual(result.loc[1, 'ts_r.0'], 1)
+ self.assertEqual(result.loc[2, 'ts_r.0'], 3)
+ self.assertEqual(result.loc[3, 'ts_r.0'], 5)
+
+ self.assertEqual(result.loc[0, 'ts_r.1'], 1)
+ self.assertEqual(result.loc[1, 'ts_r.1'], 3)
+ self.assertEqual(result.loc[2, 'ts_r.1'], 5)
+ self.assertEqual(result.loc[3, 'ts_r.1'], 7)
+
+ def test_simple_lag(self):
+
+ df = pd.DataFrame(data=dict(
+ ts=[1.0, 3.0, 5.0, 7.0],
+ grain=['1970', '1970', '1970', '1970'],
+ ))
+
+ ll = LagLeadOperator(columns={'ts_r': 'ts'},
+ grain_columns=['grain'],
+ offsets=[-1, 1],
+ horizon=1)
+
+ result = ll.fit_transform(df)
+
+ self.assertTrue(math.isnan(result.loc[0, 'ts_r.0']))
+ self.assertEqual(result.loc[1, 'ts_r.0'], 1)
+ self.assertEqual(result.loc[2, 'ts_r.0'], 3)
+ self.assertEqual(result.loc[3, 'ts_r.0'], 5)
+
+ self.assertEqual(result.loc[0, 'ts_r.1'], 3)
+ self.assertEqual(result.loc[1, 'ts_r.1'], 5)
+ self.assertEqual(result.loc[2, 'ts_r.1'], 7)
+ self.assertTrue(math.isnan(result.loc[3, 'ts_r.1']))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/python/nimbusml/tests/timeseries/test_rollingwindow.py b/src/python/nimbusml/tests/timeseries/test_rollingwindow.py
new file mode 100644
index 00000000..c64962f1
--- /dev/null
+++ b/src/python/nimbusml/tests/timeseries/test_rollingwindow.py
@@ -0,0 +1,65 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+
+import platform
+import unittest
+
+import math
+import numpy as np
+import pandas as pd
+from nimbusml.timeseries import RollingWindow
+
+
+# BUGS
+# Grain is only string? Fix the error message as in ShortDrop
+# Horizon predictions are not in the correct order, see above
+@unittest.skipIf('centos' in platform.linux_distribution()[0].lower(), "centos is not supported")
+class TestRollingWindow(unittest.TestCase):
+
+ def test_simple_rolling_window(self):
+
+ df = pd.DataFrame(data=dict(
+ ts=[1.0, 3.0, 5.0, 7.0],
+ grain=['1970', '1970', '1970', '1970'],
+ ))
+
+ rw = RollingWindow(columns={'ts_r': 'ts'},
+ grain_columns=['grain'],
+ window_calculation='Mean',
+ max_window_size=1,
+ horizon=2)
+ result = rw.fit_transform(df)
+
+ self.assertTrue(math.isnan(result.loc[0, 'ts_r.1']))
+ self.assertEqual(result.loc[1, 'ts_r.1'], 1)
+ self.assertEqual(result.loc[2, 'ts_r.1'], 3)
+ self.assertEqual(result.loc[3, 'ts_r.1'], 5)
+
+ def test_simple_rolling_window2(self):
+
+ df = pd.DataFrame(data=dict(
+ ts=[1.0, 3.0, 5.0, 7.0],
+ grain=['1970', '1970', '1970', '1970'],
+ ))
+
+ rw = RollingWindow(columns={'ts_r': 'ts'},
+ grain_columns=['grain'],
+ window_calculation='Mean',
+ max_window_size=2,
+ horizon=2)
+ result = rw.fit_transform(df)
+
+ self.assertTrue(math.isnan(result.loc[0, 'ts_r.0']))
+ self.assertTrue(math.isnan(result.loc[1, 'ts_r.0']))
+ self.assertEqual(result.loc[2, 'ts_r.0'], 1)
+ self.assertEqual(result.loc[3, 'ts_r.0'], 2)
+
+ self.assertTrue(math.isnan(result.loc[0, 'ts_r.1']))
+ self.assertEqual(result.loc[1, 'ts_r.1'], 1)
+ self.assertEqual(result.loc[2, 'ts_r.1'], 2)
+ self.assertEqual(result.loc[3, 'ts_r.1'], 4)
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/python/nimbusml/tests/timeseries/test_shortdrop.py b/src/python/nimbusml/tests/timeseries/test_shortdrop.py
new file mode 100644
index 00000000..85fed23b
--- /dev/null
+++ b/src/python/nimbusml/tests/timeseries/test_shortdrop.py
@@ -0,0 +1,40 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+
+import platform
+import unittest
+
+import numpy as np
+import pandas as pd
+from nimbusml.timeseries import ShortDrop
+
+
+@unittest.skipIf('centos' in platform.linux_distribution()[0].lower(), "centos is not supported")
+class TestShortDrop(unittest.TestCase):
+
+ def test_no_drops(self):
+
+ df = pd.DataFrame(data=dict(
+ ts=[1.0, 3.0, 5.0, 7.0],
+ grain=['1970', '1970', '1970', '1970'],
+ ))
+
+ sd = ShortDrop(grain_columns=['grain'], min_rows=4) << 'ts'
+ result = sd.fit_transform(df)
+ pd.testing.assert_frame_equal(result, df)
+
+ def test_drop_all(self):
+
+ df = pd.DataFrame(data=dict(
+ ts=[1.0, 3.0, 5.0, 7.0],
+ grain=['1970', '1970', '1970', '1970'],
+ ))
+
+ sd = ShortDrop(grain_columns=['grain'], min_rows=100) << 'ts'
+ result = sd.fit_transform(df)
+ self.assertEqual(len(result), 0)
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/python/nimbusml/timeseries/__init__.py b/src/python/nimbusml/timeseries/__init__.py
index 05dbfa3c..6476cbc2 100644
--- a/src/python/nimbusml/timeseries/__init__.py
+++ b/src/python/nimbusml/timeseries/__init__.py
@@ -4,6 +4,10 @@
from .ssachangepointdetector import SsaChangePointDetector
from .ssaforecaster import SsaForecaster
from .timeseriesimputer import TimeSeriesImputer
+from .rollingwindow import RollingWindow
+from .shortdrop import ShortDrop
+from .lagleadoperator import LagLeadOperator
+from .forecastingpivot import ForecastingPivot
__all__ = [
'IidSpikeDetector',
@@ -11,5 +15,9 @@
'SsaSpikeDetector',
'SsaChangePointDetector',
'SsaForecaster',
- 'TimeSeriesImputer'
+ 'TimeSeriesImputer',
+ 'RollingWindow',
+ 'ShortDrop',
+ 'LagLeadOperator',
+ 'ForecastingPivot',
]
diff --git a/src/python/nimbusml/timeseries/forecastingpivot.py b/src/python/nimbusml/timeseries/forecastingpivot.py
new file mode 100644
index 00000000..c01cddba
--- /dev/null
+++ b/src/python/nimbusml/timeseries/forecastingpivot.py
@@ -0,0 +1,58 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+ForecastingPivot
+"""
+
+__all__ = ["ForecastingPivot"]
+
+
+from sklearn.base import TransformerMixin
+
+from ..base_transform import BaseTransform
+from ..internal.core.timeseries.forecastingpivot import \
+ ForecastingPivot as core
+from ..internal.utils.utils import trace
+
+
+class ForecastingPivot(core, BaseTransform, TransformerMixin):
+ """
+ **Description**
+    Pivots the input columns and drops any rows with N/A
+
+ :param columns: see `Columns `_.
+
+ :param columns_to_pivot: List of columns to pivot.
+
+ :param horizon_column_name: Name of the horizon column generated.
+
+ :param params: Additional arguments sent to compute engine.
+
+ """
+
+ @trace
+ def __init__(
+ self,
+ columns_to_pivot,
+ horizon_column_name='Horizon',
+ columns=None,
+ **params):
+
+ if columns:
+ params['columns'] = columns
+ BaseTransform.__init__(self, **params)
+ core.__init__(
+ self,
+ columns_to_pivot=columns_to_pivot,
+ horizon_column_name=horizon_column_name,
+ **params)
+ self._columns = columns
+
+ def get_params(self, deep=False):
+ """
+ Get the parameters for this operator.
+ """
+ return core.get_params(self)
diff --git a/src/python/nimbusml/timeseries/lagleadoperator.py b/src/python/nimbusml/timeseries/lagleadoperator.py
new file mode 100644
index 00000000..6a524114
--- /dev/null
+++ b/src/python/nimbusml/timeseries/lagleadoperator.py
@@ -0,0 +1,62 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+LagLeadOperator
+"""
+
+__all__ = ["LagLeadOperator"]
+
+
+from sklearn.base import TransformerMixin
+
+from ..base_transform import BaseTransform
+from ..internal.core.timeseries.lagleadoperator import LagLeadOperator as core
+from ..internal.utils.utils import trace
+
+
+class LagLeadOperator(core, BaseTransform, TransformerMixin):
+ """
+ **Description**
+ Uses the offset list with the horizon to create lags and leads
+
+ :param columns: see `Columns `_.
+
+ :param grain_columns: List of grain columns.
+
+ :param horizon: Maximum horizon value.
+
+    :param offsets: Lag and lead offsets to use. A negative number is a lag,
+        a positive number is a lead.
+
+ :param params: Additional arguments sent to compute engine.
+
+ """
+
+ @trace
+ def __init__(
+ self,
+ grain_columns,
+ offsets,
+ horizon=0,
+ columns=None,
+ **params):
+
+ if columns:
+ params['columns'] = columns
+ BaseTransform.__init__(self, **params)
+ core.__init__(
+ self,
+ grain_columns=grain_columns,
+ offsets=offsets,
+ horizon=horizon,
+ **params)
+ self._columns = columns
+
+ def get_params(self, deep=False):
+ """
+ Get the parameters for this operator.
+ """
+ return core.get_params(self)
diff --git a/src/python/nimbusml/timeseries/rollingwindow.py b/src/python/nimbusml/timeseries/rollingwindow.py
new file mode 100644
index 00000000..d1f228f6
--- /dev/null
+++ b/src/python/nimbusml/timeseries/rollingwindow.py
@@ -0,0 +1,69 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+RollingWindow
+"""
+
+__all__ = ["RollingWindow"]
+
+
+from sklearn.base import TransformerMixin
+
+from ..base_transform import BaseTransform
+from ..internal.core.timeseries.rollingwindow import RollingWindow as core
+from ..internal.utils.utils import trace
+
+
+class RollingWindow(core, BaseTransform, TransformerMixin):
+ """
+ **Description**
+ Performs a calculation over a rolling timeseries window
+
+ :param columns: see `Columns `_.
+
+ :param grain_columns: List of grain columns.
+
+ :param horizon: Maximum horizon value.
+
+ :param max_window_size: Maximum window size.
+
+ :param min_window_size: Minimum window size.
+
+ :param window_calculation: What window calculation to use.
+
+ :param params: Additional arguments sent to compute engine.
+
+ """
+
+ @trace
+ def __init__(
+ self,
+ grain_columns,
+ horizon=0,
+ max_window_size=0,
+ min_window_size=1,
+ window_calculation='0',
+ columns=None,
+ **params):
+
+ if columns:
+ params['columns'] = columns
+ BaseTransform.__init__(self, **params)
+ core.__init__(
+ self,
+ grain_columns=grain_columns,
+ horizon=horizon,
+ max_window_size=max_window_size,
+ min_window_size=min_window_size,
+ window_calculation=window_calculation,
+ **params)
+ self._columns = columns
+
+ def get_params(self, deep=False):
+ """
+ Get the parameters for this operator.
+ """
+ return core.get_params(self)
diff --git a/src/python/nimbusml/timeseries/shortdrop.py b/src/python/nimbusml/timeseries/shortdrop.py
new file mode 100644
index 00000000..c605ac0c
--- /dev/null
+++ b/src/python/nimbusml/timeseries/shortdrop.py
@@ -0,0 +1,57 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+ShortDrop
+"""
+
+__all__ = ["ShortDrop"]
+
+
+from sklearn.base import TransformerMixin
+
+from ..base_transform import BaseTransform
+from ..internal.core.timeseries.shortdrop import ShortDrop as core
+from ..internal.utils.utils import trace
+
+
+class ShortDrop(core, BaseTransform, TransformerMixin):
+ """
+ **Description**
+ Drops rows if there aren't enough values per grain.
+
+ :param columns: see `Columns `_.
+
+ :param grain_columns: List of grain columns.
+
+ :param min_rows: Minimum number of values required.
+
+ :param params: Additional arguments sent to compute engine.
+
+ """
+
+ @trace
+ def __init__(
+ self,
+ grain_columns,
+ min_rows=0,
+ columns=None,
+ **params):
+
+ if columns:
+ params['columns'] = columns
+ BaseTransform.__init__(self, **params)
+ core.__init__(
+ self,
+ grain_columns=grain_columns,
+ min_rows=min_rows,
+ **params)
+ self._columns = columns
+
+ def get_params(self, deep=False):
+ """
+ Get the parameters for this operator.
+ """
+ return core.get_params(self)
diff --git a/src/python/tests/test_estimator_checks.py b/src/python/tests/test_estimator_checks.py
index a9372ad3..3b0070af 100644
--- a/src/python/tests/test_estimator_checks.py
+++ b/src/python/tests/test_estimator_checks.py
@@ -201,6 +201,12 @@
'check_estimators_pickle')
OMITTED_CHECKS_ALWAYS = 'check_estimators_nan_inf'
+OMITTED_CHECKS_CLASS_ALWAYS = [
+ 'RobustScaler',
+ 'LagLeadOperator',
+ 'ForecastingPivot',
+ 'RollingWindow',
+ 'ShortDrop']
NOBINARY_CHECKS = [
'check_estimator_sparse_data',
@@ -323,8 +329,10 @@ def method(self):
failed_checks = set()
passed_checks = set()
class_name = epoint[1]
- print("\n======== now Estimator is %s =========== " % class_name)
+ if class_name in OMITTED_CHECKS_CLASS_ALWAYS:
+ return
+ print("\n======== now Estimator is %s =========== " % class_name)
mod = __import__('nimbusml.' + epoint[0], fromlist=[str(class_name)])
the_class = getattr(mod, class_name)
if class_name in INSTANCES:
diff --git a/src/python/tests_extended/test_dft_based.py b/src/python/tests_extended/test_dft_based.py
new file mode 100644
index 00000000..5cc5d59d
--- /dev/null
+++ b/src/python/tests_extended/test_dft_based.py
@@ -0,0 +1,395 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+
+import os
+import sys
+import io
+import platform
+import tempfile
+import unittest
+
+import numpy as np
+import pandas as pd
+from nimbusml import Pipeline
+from nimbusml.preprocessing.schema import ColumnSelector
+from nimbusml.preprocessing import ToString, ToKeyImputer, DateTimeSplitter
+from scipy.sparse import csr_matrix
+from nimbusml.timeseries import TimeSeriesImputer, LagLeadOperator, RollingWindow, ForecastingPivot, ShortDrop
+from nimbusml.preprocessing import (TensorFlowScorer, FromKey, ToKey,
+ DateTimeSplitter, OnnxRunner)
+import onnxruntime as rt
+from data_frame_tool import DataFrameTool as DFT
+
+TEST_CASES = {
+ 'DateTimeSplitter_Simple',
+ 'DateTimeSplitter_Complex',
+ 'DateTimeSplitter_Canada_1day_before_christmas',
+ 'DateTimeSplitter_Czech_non_english_holiday',
+ 'ToKey_SimpleFloat',
+ 'ToKey_SimpleDouble',
+ 'ToKey_SimpleString',
+ 'ToKey_2col_Double',
+ 'ToKey_2col_Double_String',
+ 'ToString_Numbers',
+ 'ToString_Other_Types',
+ 'TimeSeriesImputer_1grain_2gap',
+ 'TimeSeriesImputer_1grain_2gap_backfill',
+ 'TimeSeriesImputer_1grain_2gap_medianfill',
+ 'TimeSeriesImputer_1grain_1gap_2filtercolumn',
+ 'TimeSeriesImputer_2grain_nogap',
+ 'ShortGrainDropper',
+ 'RollingWin_Pivot_Integration',
+ 'Laglead_Pivot_Integration',
+ 'Big_Test1',
+ 'Big_Test2'
+}
+
+INSTANCES = {
+ 'DateTimeSplitter_Simple': Pipeline([
+ DateTimeSplitter(prefix='dt') << 'tokens1',
+ ColumnSelector(drop_columns=[
+ 'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter',
+ 'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear',
+ 'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel',
+ 'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff','dtHolidayName'
+ ])
+ ]),
+ 'DateTimeSplitter_Complex' : Pipeline([
+ DateTimeSplitter(prefix='dt') << 'tokens1',
+ ColumnSelector(drop_columns=[
+ 'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter',
+ 'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear',
+ 'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel',
+ 'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff','dtHolidayName'
+ ])
+ ]),
+ 'DateTimeSplitter_Canada_1day_before_christmas' : DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1',
+ 'DateTimeSplitter_Czech_non_english_holiday' : DateTimeSplitter(prefix='dt', country='Czech') << 'tokens1',
+ 'ToKey_SimpleFloat': ToKeyImputer(),
+ 'ToKey_SimpleDouble': ToKeyImputer(),
+ 'ToKey_SimpleString': ToKeyImputer(),
+ 'ToKey_2col_Double': ToKeyImputer(),
+ 'ToKey_2col_Double_String': ToKeyImputer(),
+ 'ToString_Numbers': ToString(columns={'f0.out': 'f0',
+ 'f1.out': 'f1',
+ 'f2.out': 'f2'}),
+ 'ToString_Other_Types': ToString(columns={'f0.out': 'f0',
+ 'f1.out': 'f1'}),
+ 'TimeSeriesImputer_1grain_2gap': TimeSeriesImputer(time_series_column='ts',
+ filter_columns=['c'],
+ grain_columns=['grain'],
+ impute_mode='ForwardFill',
+ filter_mode='Include'),
+ 'TimeSeriesImputer_1grain_2gap_backfill': TimeSeriesImputer(time_series_column='ts',
+ filter_columns=['c'],
+ grain_columns=['grain'],
+ impute_mode='BackFill',
+ filter_mode='Include'),
+ 'TimeSeriesImputer_1grain_2gap_medianfill': TimeSeriesImputer(time_series_column='ts',
+ filter_columns=['c'],
+ grain_columns=['grain'],
+ impute_mode='Median',
+ filter_mode='Include'),
+ 'TimeSeriesImputer_1grain_1gap_2filtercolumn': TimeSeriesImputer(time_series_column='ts',
+ grain_columns=['grain'],
+ filter_columns=['c3', 'c4'],
+ impute_mode='ForwardFill',
+ filter_mode='Include'),
+ 'TimeSeriesImputer_2grain_nogap': TimeSeriesImputer(time_series_column='ts',
+ filter_columns=['c'],
+ grain_columns=['grain'],
+ impute_mode='ForwardFill',
+ filter_mode='Include'),
+ 'ShortGrainDropper': ShortDrop(columns={'colA1': 'colA'},
+ grain_columns=['grainA'],
+ min_rows=2),
+    # For RollingWindow with horizon=2, max_window_size=2 and a Mean calculation
+    # (assuming a time frequency of 1 day), the output for each row is
+    # [[mean(2DaysBeforeYesterday, DayBeforeYesterday), mean(DayBeforeYesterday, Yesterday)]].
+    # ForecastingPivot then spreads this 2d vector out and drops any rows containing NaNs.
+ 'RollingWin_Pivot_Integration': Pipeline([
+ RollingWindow(columns={'colA1': 'colA'},
+ grain_columns=['grainA'],
+ window_calculation='Mean',
+ max_window_size=2,
+ horizon=2),
+ ForecastingPivot(columns_to_pivot=['colA1'])
+ ]),
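+    # As a rough sketch for colA=[1.0, 2.0, 3.0, 4.0]: the final row carries
+    # [[mean(1, 2), mean(2, 3)]] = [[1.5, 2.5]] before the pivot expands it.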
+    # For LagLeadOperator with horizon=2 and offsets=[-2, -1] (assuming a time
+    # frequency of 1 day), the output for each row is
+    # [[2DaysBeforeYesterday, DayBeforeYesterday],
+    #  [DayBeforeYesterday, Yesterday]].
+    # ForecastingPivot then spreads this 2d vector out and drops any rows containing NaNs.
+ 'Laglead_Pivot_Integration': Pipeline([
+ LagLeadOperator(columns={'colA1': 'colA'},
+ grain_columns=['grainA'],
+ offsets=[-2, -1],
+ horizon=2),
+ ForecastingPivot(columns_to_pivot=['colA1'])
+ ]),
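+    # As a rough sketch for colA=[1.0, 2.0, 3.0, 4.0]: the final row carries the
+    # lag matrix [[1.0, 2.0], [2.0, 3.0]] (offset -2 then -1, one column per
+    # horizon position), and the pivot emits one scalar row per NaN-free position.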
+ 'Big_Test1': Pipeline([
+ TimeSeriesImputer(time_series_column='ts',
+ filter_columns=['c', 'grain'],
+ grain_columns=['grain'],
+ impute_mode='ForwardFill',
+ filter_mode='Include'),
+ DateTimeSplitter(prefix='dt') << 'ts',
+ LagLeadOperator(columns={'c1': 'c'},
+ grain_columns=['dtMonthLabel'],
+ offsets=[-2, -1],
+ horizon=1),
+ ForecastingPivot(columns_to_pivot=['c1']),
+ ColumnSelector(drop_columns=['dtHolidayName'])
+ ]),
+ 'Big_Test2': Pipeline([
+ TimeSeriesImputer(time_series_column='ts',
+ filter_columns=['c', 'grain'],
+ grain_columns=['grain'],
+ impute_mode='ForwardFill',
+ filter_mode='Include'),
+ DateTimeSplitter(prefix='dt', country = 'Canada') << 'ts',
+ RollingWindow(columns={'c1': 'c'},
+ grain_columns=['grain'],
+ window_calculation='Mean',
+ max_window_size=2,
+ horizon=2),
+ ForecastingPivot(columns_to_pivot=['c1'])
+ ])
+}
+
+DATASETS = {
+ 'DateTimeSplitter_Simple': pd.DataFrame(data=dict(
+ tokens1=[1, 2, 3, 157161600]
+ )),
+ 'DateTimeSplitter_Complex': pd.DataFrame(data=dict(
+ tokens1=[217081624, 1751241600, 217081625, 32445842582]
+ )),
+ 'DateTimeSplitter_Canada_1day_before_christmas': pd.DataFrame(data=dict(
+ tokens1=[157161599]
+ )),
+ 'DateTimeSplitter_Czech_non_english_holiday': pd.DataFrame(data=dict(
+ tokens1=[3911760000, 3834432000, 3985200000]
+ )),
+ 'ToKey_SimpleFloat': pd.DataFrame(data=dict(
+ target=[1.0, 1.0, 1.0, 2.0]
+ )).astype({'target': np.float64}),
+ 'ToKey_SimpleDouble': pd.DataFrame(data=dict(
+ target=[1.0, 1.0, 1.0, 2.0]
+ )).astype({'target': np.double}),
+ 'ToKey_SimpleString': pd.DataFrame(data=dict(
+ target=["one", "one", "one", "two"]
+ )),
+ 'ToKey_2col_Double': pd.DataFrame(data=dict(
+ data1=[1.0, 1.0, 1.0, 2.0],
+ data2=[2.0, 2.0, 2.0, 3.0],
+ )).astype({'data1': np.double,
+               'data2': np.double}),
+ 'ToKey_2col_Double_String': pd.DataFrame(data=dict(
+ data1=[1.0, 1.0, 1.0, 2.0],
+ data2=["two", "two", "three", "two"],
+ )),
+ 'ToString_Numbers': pd.DataFrame(data=dict(
+ f0= [4, 4, -1, 9],
+ f1= [5, 5, 3.1, -0.23],
+ f2= [6, 6.7, np.nan, np.nan]
+ )),
+ 'ToString_Other_Types': pd.DataFrame(data=dict(
+ f0= [True, False],
+ f1= [123.45, 135453984983490.5473]
+ )).astype({'f0': bool,
+ 'f1': np.double}),
+ 'TimeSeriesImputer_1grain_2gap': pd.DataFrame(data=dict(
+ ts=[1, 2, 3, 5, 7],
+ grain=[1970, 1970, 1970, 1970, 1970],
+ c=[10, 11, 12, 13, 14]
+ )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.int32}),
+ 'TimeSeriesImputer_1grain_2gap_backfill': pd.DataFrame(data=dict(
+ ts=[1, 2, 3, 5, 7],
+ grain=[1970, 1970, 1970, 1970, 1970],
+ c=[10, 11, 12, 13, 14]
+ )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.int32}),
+ 'TimeSeriesImputer_1grain_2gap_medianfill': pd.DataFrame(data=dict(
+ ts=[1, 2, 3, 5, 7],
+ grain=[1970, 1970, 1970, 1970, 1970],
+ c=[10, 11, 12, 13, 14]
+ )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.double}),
+ 'TimeSeriesImputer_1grain_1gap_2filtercolumn': pd.DataFrame(data=dict(
+ ts=[1, 2, 3, 5],
+ grain=[1970, 1970, 1970, 1970],
+ c3=[10, 13, 15, 20],
+ c4=[19, 12, 16, 19]
+ )).astype({'ts': np.int64, 'grain': np.int32, 'c3': np.int32, 'c4': np.int32}),
+ 'TimeSeriesImputer_2grain_nogap': pd.DataFrame(data=dict(
+ ts=[1, 5, 2, 6],
+ grain=[1970, 1971, 1970, 1971],
+ c=[10, 11, 12, 13]
+ )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.int32}),
+ 'ShortGrainDropper': pd.DataFrame(data=dict(
+ colA=[1.0, 2.0, 3.0, 4.0],
+ grainA=["one", "one", "one", "two"]
+ )),
+ 'RollingWin_Pivot_Integration': pd.DataFrame(data=dict(
+ colA=[1.0, 2.0, 3.0, 4.0],
+ grainA=["one", "one", "one", "one"]
+ )),
+ 'Laglead_Pivot_Integration': pd.DataFrame(data=dict(
+ colA=[1.0, 2.0, 3.0, 4.0],
+ grainA=["one", "one", "one", "one"]
+ )),
+ 'Big_Test1': pd.DataFrame(data=dict(
+ ts=[217081624, 217081625, 217081627, 217081629],
+ grain=[1970, 1970, 1970, 1970],
+ c=[10, 11, 12, 13]
+ )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.double}),
+ 'Big_Test2': pd.DataFrame(data=dict(
+ ts=[0, 86400, 172800],
+ grain=[1970, 1970, 1970],
+ c=[10, 11, 12]
+ )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.double})
+}
+
+def get_file_size(file_path):
+ file_size = 0
+ try:
+ file_size = os.path.getsize(file_path)
+ except:
+ pass
+ return file_size
+
+def get_tmp_file(suffix=None):
+ fd, file_name = tempfile.mkstemp(suffix=suffix)
+ fl = os.fdopen(fd, 'w')
+ fl.close()
+ return file_name
+
+class CaptureOutputContext():
+ """
+ Context which can be used for
+ capturing stdout and stderr.
+ """
+ def __enter__(self):
+ self.orig_stdout = sys.stdout
+ self.orig_stderr = sys.stderr
+ self.stdout_capturer = io.StringIO()
+ self.stderr_capturer = io.StringIO()
+ sys.stdout = self.stdout_capturer
+ sys.stderr = self.stderr_capturer
+ return self
+
+ def __exit__(self, *args):
+ sys.stdout = self.orig_stdout
+ sys.stderr = self.orig_stderr
+ self.stdout = self.stdout_capturer.getvalue()
+ self.stderr = self.stderr_capturer.getvalue()
+
+ if self.stdout:
+ print(self.stdout)
+
+ if self.stderr:
+ print(self.stderr)
+
+ # free up some memory
+ del self.stdout_capturer
+ del self.stderr_capturer
+
+def validate_results(result_mlnet, result_ort):
+
+ if len(result_ort.columns) != len(result_mlnet.columns):
+ raise RuntimeError("ERROR: The ORT output does not contain the same number of columns as ML.NET.")
+ col_tuples = list(zip(result_mlnet.columns[0:],
+ result_ort.columns[0:]))
+ for col_tuple in col_tuples:
+ try:
+ col_mlnet = result_mlnet.loc[:, col_tuple[0]]
+ col_ort = result_ort.loc[:, col_tuple[1]]
+ check_kwargs = {
+ 'check_names': False,
+ 'check_exact': True,
+ 'check_dtype': True,
+ 'check_less_precise': True
+ }
+ pd.testing.assert_series_equal(col_mlnet, col_ort, **check_kwargs)
+ except Exception as e:
+ print(e)
+ raise RuntimeError("ERROR: OnnxRunner result does not match ML.NET result.")
+ return True
+
+def export_to_onnx(estimator, test_case):
+ """
+ Fit and test an estimator and determine
+ if it supports exporting to the ONNX format.
+ """
+ onnx_path = get_tmp_file('.onnx')
+ onnx_json_path = get_tmp_file('.onnx.json')
+
+ output = None
+ exported = False
+ export_valid = False
+
+ try:
+ dataset = DATASETS.get(test_case)
+
+ result_mlnet = estimator.fit_transform(dataset)
+
+ with CaptureOutputContext() as output:
+ estimator.export_to_onnx(onnx_path,
+ 'com.microsoft.ml',
+ dst_json=onnx_json_path,
+ onnx_version='Stable')
+ except Exception as e:
+ print(e)
+
+ onnx_file_size = get_file_size(onnx_path)
+ onnx_json_file_size = get_file_size(onnx_json_path)
+
+ if (output and
+ (onnx_file_size != 0) and
+ (onnx_json_file_size != 0) and
+ (not 'cannot save itself as ONNX' in output.stdout) and
+ (not 'Warning: We do not know how to save the predictor as ONNX' in output.stdout)):
+
+ exported = True
+
+ try:
+ df_tool = DFT(onnx_path)
+ result_ort = df_tool.execute(dataset, [])
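+            # DataFrameTool (a local test helper) runs the same dataset through
+            # the exported model via onnxruntime, yielding a frame comparable
+            # with the ML.NET output above.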
+
+ export_valid = validate_results(result_mlnet, result_ort)
+ except Exception as e:
+ print(e)
+
+ os.remove(onnx_path)
+ os.remove(onnx_json_path)
+ return {'exported': exported, 'export_valid': export_valid}
+
+class TestOnnxExport(unittest.TestCase):
+
+ # This method is a static method of the class
+ # because there were pytest fixture related
+ # issues when the method was in the global scope.
+ @staticmethod
+ def generate_test_method(test_case):
+ def method(self):
+ estimator = INSTANCES[test_case]
+
+ result = export_to_onnx(estimator, test_case)
+ assert result['exported']
+ assert result['export_valid']
+
+ return method
+
+for test_case in TEST_CASES:
+ test_name = 'test_%s' % test_case.replace('(', '_').replace(')', '').lower()
+
+    # The following test uses a far-future time point. On Windows it is handled
+    # correctly as expected, but on other OSes system_clock::time_point is defined
+    # in nanoseconds (64-bit), which rolls over somewhere around 2260.
+    if test_case == 'DateTimeSplitter_Complex' and (platform.system() == "Darwin" or platform.system() == "Linux"):
+ continue
+
+ method = TestOnnxExport.generate_test_method(test_case)
+ setattr(TestOnnxExport, test_name, method)
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/python/tests_extended/test_export_to_onnx.py b/src/python/tests_extended/test_export_to_onnx.py
index c96e6988..54967fe2 100644
--- a/src/python/tests_extended/test_export_to_onnx.py
+++ b/src/python/tests_extended/test_export_to_onnx.py
@@ -140,6 +140,10 @@
'RangeFilter',
'Resizer',
'RobustScaler',
+ 'RollingWindow',
+ 'ForecastingPivot',
+ 'LagLeadOperator',
+ 'ShortDrop',
'SkipFilter',
'SsaChangePointDetector',
'SsaForecaster',
@@ -361,8 +365,22 @@
'LogisticRegressionBinaryClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]},
'LogisticRegressionClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]},
'LpScaler': {'num_cols': 10, 'cols': 0},
- 'MeanVarianceScaler': {'num_cols': 5, 'cols': 0},
- 'MinMaxScaler': {'num_cols': 5, 'cols': 0},
+ 'MeanVarianceScaler': {
+ 'num_cols': 5,
+ 'cols': [('Petal_Length', 'Petal_Length', 'Petal_Length.output'),
+ ('Petal_Width', 'Petal_Width', 'Petal_Width.output'),
+ ('Sepal_Length', 'Sepal_Length', 'Sepal_Length.output'),
+ ('Sepal_Width', 'Sepal_Width', 'Sepal_Width.output'),
+ ('Setosa', 'Setosa', 'Setosa.output')]
+ },
+ 'MinMaxScaler': {
+ 'num_cols': 5,
+ 'cols': [('Petal_Length', 'Petal_Length', 'Petal_Length.output'),
+ ('Petal_Width', 'Petal_Width', 'Petal_Width.output'),
+ ('Sepal_Length', 'Sepal_Length', 'Sepal_Length.output'),
+ ('Sepal_Width', 'Sepal_Width', 'Sepal_Width.output'),
+ ('Setosa', 'Setosa', 'Setosa.output')]
+ },
'MutualInformationSelector': {'num_cols': 8, 'cols': 0},
'NGramFeaturizer': {'num_cols': 273, 'cols': 0},
'NaiveBayesClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]},
diff --git a/src/python/tests_extended/test_tensor_based.py b/src/python/tests_extended/test_tensor_based.py
new file mode 100644
index 00000000..3300a7e8
--- /dev/null
+++ b/src/python/tests_extended/test_tensor_based.py
@@ -0,0 +1,846 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+
+import os
+import platform
+import tempfile
+import unittest
+
+import numpy as np
+import pandas as pd
+from nimbusml import Pipeline
+from nimbusml.preprocessing.schema import ColumnSelector
+from nimbusml.preprocessing import ToString, ToKeyImputer, DateTimeSplitter
+from nimbusml.timeseries import TimeSeriesImputer, LagLeadOperator, RollingWindow, ForecastingPivot, ShortDrop
+import onnxruntime as rt
+
+def get_tmp_file(suffix=None):
+ fd, file_name = tempfile.mkstemp(suffix=suffix)
+ fl = os.fdopen(fd, 'w')
+ fl.close()
+ return file_name
+
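+# Fit the estimator, export it to ONNX (plus a JSON description of the model),
+# and return an onnxruntime InferenceSession over the exported file.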
+def set_up_onnx_model(estimator, training_data):
+ estimator.fit(training_data)
+ onnx_path = get_tmp_file('.onnx')
+ onnx_json_path = get_tmp_file('.onnx.json')
+ estimator.export_to_onnx(onnx_path,
+ 'com.microsoft.ml',
+ dst_json=onnx_json_path,
+ onnx_version='Stable')
+ return rt.InferenceSession(onnx_path)
+
+class TestAutoMLTransforms(unittest.TestCase):
+ def test_datetimesplitter(self):
+ training_data = pd.DataFrame(data=dict(
+ tokens1=[1, 2, 3, 157161600]
+ ))
+
+ cols_to_drop = [
+ 'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter',
+ 'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear',
+ 'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel',
+ 'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff'
+ ]
+
+ dts = DateTimeSplitter(prefix='dt') << 'tokens1'
+ xf = Pipeline([dts, ColumnSelector(drop_columns=cols_to_drop)])
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_data = np.array([1, 2, 3, 157161600]).astype(np.int64).reshape(4,1)
+ result = sess.run(None, {"tokens1":inferencing_data})
+
+ expected_years = np.array([1970, 1970, 1970, 1974]).reshape(4, 1)
+ expected_month = np.array([1, 1, 1, 12]).reshape(4, 1)
+ expected_day = np.array([1, 1, 1, 25]).reshape(4, 1)
+ expected_hour = np.array([0, 0, 0, 0]).reshape(4, 1)
+ expected_minute = np.array([0, 0, 0, 0]).reshape(4, 1)
+ expected_second = np.array([1, 2, 3, 0]).reshape(4, 1)
+ expected_ampm = np.array([0, 0, 0, 0]).reshape(4, 1)
+ expected_holidayname = np.array(["", "", "", ""]).reshape(4, 1)
+
+ np.testing.assert_array_equal(result[1],expected_years)
+ np.testing.assert_array_equal(result[2],expected_month)
+ np.testing.assert_array_equal(result[3],expected_day)
+ np.testing.assert_array_equal(result[4],expected_hour)
+ np.testing.assert_array_equal(result[5],expected_minute)
+ np.testing.assert_array_equal(result[6],expected_second)
+ np.testing.assert_array_equal(result[7],expected_ampm)
+ np.testing.assert_array_equal(result[8],expected_holidayname)
+
+ def test_datetimesplitter_complex(self):
+ training_data = pd.DataFrame(data=dict(
+ tokens1=[217081624, 1751241600, 217081625]
+ ))
+
+ cols_to_drop = [
+ 'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter',
+ 'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear',
+ 'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel',
+ 'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff'
+ ]
+
+ dts = DateTimeSplitter(prefix='dt') << 'tokens1'
+ xf = Pipeline([dts, ColumnSelector(drop_columns=cols_to_drop)])
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_data = np.array([217081624, 1751241600, 217081625]).astype(np.int64).reshape(3,1)
+ result = sess.run(None, {"tokens1": inferencing_data})
+
+ expected_years = np.array([1976, 2025, 1976]).reshape(3, 1)
+ expected_month = np.array([11, 6, 11]).reshape(3, 1)
+ expected_day = np.array([17, 30, 17]).reshape(3, 1)
+ expected_hour = np.array([12, 0, 12]).reshape(3, 1)
+ expected_minute = np.array([27, 0, 27]).reshape(3, 1)
+ expected_second = np.array([4, 0, 5]).reshape(3, 1)
+ expected_ampm = np.array([1, 0, 1]).reshape(3, 1)
+ expected_holidayname = np.array(["", "", ""]).reshape(3, 1)
+
+ np.testing.assert_array_equal(result[1],expected_years)
+ np.testing.assert_array_equal(result[2],expected_month)
+ np.testing.assert_array_equal(result[3],expected_day)
+ np.testing.assert_array_equal(result[4],expected_hour)
+ np.testing.assert_array_equal(result[5],expected_minute)
+ np.testing.assert_array_equal(result[6],expected_second)
+ np.testing.assert_array_equal(result[7],expected_ampm)
+ np.testing.assert_array_equal(result[8],expected_holidayname)
+
+ def test_tokey_simple_float(self):
+
+ training_data = pd.DataFrame(data=dict(
+ target=[1.0, 1.0, 1.0, 2.0]
+ )).astype({'target': np.float64})
+
+ xf = ToKeyImputer()
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_data = np.array([1, float("NaN"), 4, 7, float("NaN")]).astype(np.float64).reshape(5,1)
+ result = sess.run(None, {"target": inferencing_data})
+
+ expectedData = np.array([1, 1, 4, 7, 1]).astype(np.float64).reshape(5, 1)
+
+ np.testing.assert_array_equal(result[0],expectedData)
+
+ def test_tokey_simple_double(self):
+
+ training_data = pd.DataFrame(data=dict(
+ target=[1.0, 1.0, 1.0, 2.0]
+ )).astype({'target': np.double})
+
+ xf = ToKeyImputer()
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_data = np.array([1, float("NaN"), 4, 7, float("NaN")]).astype(np.double).reshape(5,1)
+ result = sess.run(None, {"target": inferencing_data})
+
+ expectedData = np.array([1, 1, 4, 7, 1]).astype(np.double).reshape(5, 1)
+
+ np.testing.assert_array_equal(result[0],expectedData)
+
+ def test_tokey_simple_string(self):
+
+ training_data = pd.DataFrame(data=dict(
+ target=["one", "one", "one", "two"]
+ ))
+
+ xf = ToKeyImputer()
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_data = np.array(["1", "", "Hello", "", "World"]).reshape(5,1)
+ result = sess.run(None, {"target": inferencing_data})
+
+ expectedData = np.array(["1", "one", "Hello", "one", "World"]).reshape(5, 1)
+
+ np.testing.assert_array_equal(result[0],expectedData)
+
+ def test_tokey_2col_double(self):
+
+ training_data = pd.DataFrame(data=dict(
+ data1=[1.0, 1.0, 1.0, 2.0],
+ data2=[2.0, 2.0, 2.0, 3.0],
+        )).astype({'data1': np.double,
+                   'data2': np.double})
+
+ xf = ToKeyImputer()
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_data1 = np.array([1.0, float("NaN"), 4.0, 7.0, float("NaN")]).astype(np.double).reshape(5,1)
+ inferencing_data2 = np.array([1.0, float("NaN"), 4.0, 7.0, float("NaN")]).astype(np.double).reshape(5,1)
+ result = sess.run(None, {"data1": inferencing_data1, "data2": inferencing_data2})
+
+ expectedData1 = np.array([1.0, 1.0, 4.0, 7.0, 1.0]).astype(np.double).reshape(5, 1)
+ expectedData2 = np.array([1.0, 2.0, 4.0, 7.0, 2.0]).astype(np.double).reshape(5, 1)
+
+ np.testing.assert_array_equal(result[0],expectedData1)
+ np.testing.assert_array_equal(result[1],expectedData2)
+
+ def test_tokey_2col_double_string(self):
+
+ training_data = pd.DataFrame(data=dict(
+ data1=[1.0, 1.0, 1.0, 2.0],
+ data2=["two", "two", "three", "two"],
+ ))
+
+ xf = ToKeyImputer()
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_data1 = np.array([1.0, float("NaN"), 4.0, 7.0, float("NaN")]).astype(np.double).reshape(5,1)
+ inferencing_data2 = np.array(["1", "", "Hello", "", "World"]).reshape(5,1)
+ result = sess.run(None, {"data1": inferencing_data1, "data2": inferencing_data2})
+
+ expectedData1 = np.array([1.0, 1.0, 4.0, 7.0, 1.0]).astype(np.double).reshape(5, 1)
+ expectedData2 = np.array(["1", "two", "Hello", "two", "World"]).reshape(5, 1)
+
+ np.testing.assert_array_equal(result[0],expectedData1)
+ np.testing.assert_array_equal(result[1],expectedData2)
+
+ def test_tostring_numbers_wo_dft(self):
+ training_data = pd.DataFrame(data=dict(
+ f0=[4, 4, -1, 9],
+ f1=[5, 5, 3.1, -0.23],
+ f2=[6, 6.7, np.nan, np.nan]
+ )).astype({'f0': np.int32,
+ 'f1': np.float32,
+ 'f2': np.float64})
+
+ xf = ToString(columns={'f0.out': 'f0',
+ 'f1.out': 'f1',
+ 'f2.out': 'f2'})
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_f0 = np.array([4, 4, -1, 9]).astype(np.int32).reshape(4,1)
+ inferencing_f1 = np.array([5, 5, 3.1, -0.23]).astype(np.float32).reshape(4,1)
+ inferencing_f2 = np.array([6, 6.7, float("NaN"), float("NaN")]).astype(np.float64).reshape(4,1)
+
+        # Run the inference session on the test inputs
+ result = sess.run(None, {"f0": inferencing_f0, "f1": inferencing_f1, "f2": inferencing_f2})
+
+ f0_output = np.array([['4'], ['4'], ['-1'], ['9']]).reshape(4, 1)
+ f1_output = np.array([['5.000000'], ['5.000000'], ['3.100000'], ['-0.230000']]).reshape(4, 1)
+ f2_output = np.array([['6.000000'], ['6.700000'], ['NaN'], ['NaN']]).reshape(4, 1)
+
+ np.testing.assert_array_equal(f0_output, result[3])
+ np.testing.assert_array_equal(f1_output, result[4])
+ np.testing.assert_array_equal(f2_output, result[5])
+
+ def test_tostring_other_types_wo_dft(self):
+ training_data = pd.DataFrame(data=dict(
+ f0=[True, False],
+ f1=[123.45, 135453984983490.5473]
+ )).astype({'f0': bool,
+ 'f1': np.double})
+
+ xf = ToString(columns={'f0.out': 'f0',
+ 'f1.out': 'f1'})
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_f0 = np.array([True, False]).astype(bool).reshape(2,1)
+ inferencing_f1 = np.array([123.45, 135453984983490.5473]).astype(np.double).reshape(2,1)
+
+        # Run the inference session on the test inputs
+ result = sess.run(None, {"f0": inferencing_f0, "f1": inferencing_f1})
+
+ f0_output = np.array([['True'], ['False']]).reshape(2, 1)
+        # The second expected value differs from the input 135453984983490.5473
+        # because a 64-bit float cannot represent it exactly; the nearest
+        # representable double is 135453984983490.546875.
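+        # A quick, illustrative check of that claim (not executed by the test):
+        #   np.format_float_positional(np.double(135453984983490.5473),
+        #                              precision=6, unique=False, fractional=True)
+        #   -> '135453984983490.546875'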
+ f1_output = np.array([['123.450000'], ['135453984983490.546875']]).reshape(2, 1)
+ np.testing.assert_array_equal(f0_output, result[2])
+ np.testing.assert_array_equal(f1_output, result[3])
+
+ def test_timeseriesimputer_onegrain_twogap(self):
+
+ training_data = pd.DataFrame(data=dict(
+ ts=[1, 2, 3, 5, 7],
+ grain=[1970, 1970, 1970, 1970, 1970],
+ c=[10, 11, 12, 13, 14]
+ )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.int32})
+
+ xf = TimeSeriesImputer(time_series_column='ts',
+ filter_columns=['c'],
+ grain_columns=['grain'],
+ impute_mode='ForwardFill',
+ filter_mode='Include')
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_ts = np.array([1, 2, 3, 5, 7]).astype(np.int64).reshape(5,1)
+ inferencing_grain = np.array([1970, 1970, 1970, 1970, 1970]).astype(np.int32).reshape(5,1)
+ inferencing_c = np.array([10, 11, 12, 13, 14]).astype(np.int32).reshape(5,1)
+
+        # Run the inference session on the test inputs
+ result = sess.run(None, {"ts": inferencing_ts, "grain": inferencing_grain, 'c': inferencing_c})
+
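+        # The missing points ts=4 and ts=6 are synthesized and forward-filled
+        # from the previous row; the final output flags which rows were imputed.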
+ expected_ts = np.array([[1], [2], [3], [4], [5], [6], [7]]).astype(np.single).reshape(7, 1)
+ expected_c = np.array([[10], [11], [12], [12], [13], [13], [14]]).astype(np.single).reshape(7, 1)
+ expected_isrowimputed = np.array([[False], [False], [False], [True], [False], [True], [False]]).astype(np.single).reshape(7, 1)
+
+ np.testing.assert_array_equal(expected_ts, result[0])
+ np.testing.assert_array_equal(expected_c, result[2])
+ np.testing.assert_array_equal(expected_isrowimputed, result[3])
+
+ def test_timeseriesimputer_onegrain_twogap_backfill(self):
+
+ training_data = pd.DataFrame(data=dict(
+ ts=[1, 2, 3, 5, 7],
+ grain=[1970, 1970, 1970, 1970, 1970],
+ c=[10, 11, 12, 13, 14]
+ )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.int32})
+
+ xf = TimeSeriesImputer(time_series_column='ts',
+ filter_columns=['c'],
+ grain_columns=['grain'],
+ impute_mode='BackFill',
+ filter_mode='Include')
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_ts = np.array([1, 2, 3, 5, 7]).astype(np.int64).reshape(5,1)
+ inferencing_grain = np.array([1970, 1970, 1970, 1970, 1970]).astype(np.int32).reshape(5,1)
+ inferencing_c = np.array([10, 11, 12, 13, 14]).astype(np.int32).reshape(5,1)
+
+        # Run the inference session on the test inputs
+ result = sess.run(None, {"ts": inferencing_ts, "grain": inferencing_grain, 'c': inferencing_c})
+
+ expected_ts = np.array([[1], [2], [3], [4], [5], [6], [7]]).astype(np.single).reshape(7, 1)
+ expected_c = np.array([[10], [11], [12], [13], [13], [14], [14]]).astype(np.single).reshape(7, 1)
+ expected_isrowimputed = np.array([[False], [False], [False], [True], [False], [True], [False]]).astype(np.single).reshape(7, 1)
+
+ np.testing.assert_array_equal(expected_ts, result[0])
+ np.testing.assert_array_equal(expected_c, result[2])
+ np.testing.assert_array_equal(expected_isrowimputed, result[3])
+
+    def test_timeseriesimputer_onegrain_twogap_median(self):
+
+ training_data = pd.DataFrame(data=dict(
+ ts=[1, 2, 3, 5, 7],
+ grain=[1970, 1970, 1970, 1970, 1970],
+ c=[10, 11, 12, 13, 14]
+ )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.double})
+
+ xf = TimeSeriesImputer(time_series_column='ts',
+ filter_columns=['c'],
+ grain_columns=['grain'],
+ impute_mode='Median',
+ filter_mode='Include')
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_ts = np.array([1, 2, 3, 5, 7]).astype(np.int64).reshape(5,1)
+ inferencing_grain = np.array([1970, 1970, 1970, 1970, 1970]).astype(np.int32).reshape(5,1)
+ inferencing_c = np.array([10, 11, 12, 13, 14]).astype(np.double).reshape(5,1)
+
+        # Run the inference session on the test inputs
+ result = sess.run(None, {"ts": inferencing_ts, "grain": inferencing_grain, 'c': inferencing_c})
+
+ expected_ts = np.array([[1], [2], [3], [4], [5], [6], [7]]).astype(np.single).reshape(7, 1)
+ expected_c = np.array([[10], [11], [12], [12], [13], [12], [14]]).astype(np.single).reshape(7, 1)
+ expected_isrowimputed = np.array([[False], [False], [False], [True], [False], [True], [False]]).astype(np.single).reshape(7, 1)
+
+ np.testing.assert_array_equal(expected_ts, result[0])
+ np.testing.assert_array_equal(expected_c, result[2])
+ np.testing.assert_array_equal(expected_isrowimputed, result[3])
+
+ def test_timeseriesimputer_twograin_nogap(self):
+
+ training_data = pd.DataFrame(data=dict(
+ ts=[1, 5, 2, 6],
+ grain=[1970, 1971, 1970, 1971],
+ c=[10, 11, 12, 13]
+ )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.int32})
+
+ xf = TimeSeriesImputer(time_series_column='ts',
+ filter_columns=['c'],
+ grain_columns=['grain'],
+ impute_mode='ForwardFill',
+ filter_mode='Include')
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_ts = np.array([1, 5, 2, 6]).astype(np.int64).reshape(4,1)
+ inferencing_grain = np.array([1970, 1971, 1970, 1971]).astype(np.int32).reshape(4,1)
+ inferencing_c = np.array([10, 11, 12, 13]).astype(np.int32).reshape(4,1)
+
+        # Run the inference session on the test inputs
+ result = sess.run(None, {"ts": inferencing_ts, "grain": inferencing_grain, 'c': inferencing_c})
+
+ expected_ts = np.array([[1], [5], [2], [6]]).astype(np.single).reshape(4, 1)
+ expected_grain = np.array([[1970], [1971], [1970], [1971]]).astype(np.single).reshape(4, 1)
+ expected_c = np.array([[10], [11], [12], [13]]).astype(np.single).reshape(4, 1)
+ expected_isrowimputed = np.array([[False], [False], [False], [False]]).astype(np.single).reshape(4, 1)
+
+ np.testing.assert_array_equal(expected_ts, result[0])
+ np.testing.assert_array_equal(expected_grain, result[1])
+ np.testing.assert_array_equal(expected_c, result[2])
+ np.testing.assert_array_equal(expected_isrowimputed, result[3])
+
+ def test_timeseriesimputer_twograin_twogap(self):
+
+ training_data = pd.DataFrame(data=dict(
+ ts=[0, 5, 1, 6],
+ grain=[1970, 1971, 1970, 1971],
+ c=[10, 11, 12, 13]
+ )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.int32})
+
+ xf = TimeSeriesImputer(time_series_column='ts',
+ filter_columns=['c'],
+ grain_columns=['grain'],
+ impute_mode='ForwardFill',
+ filter_mode='Include')
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_ts = np.array([0, 5, 3, 8]).astype(np.int64).reshape(4,1)
+ inferencing_grain = np.array([1970, 1971, 1970, 1971]).astype(np.int32).reshape(4,1)
+ inferencing_c = np.array([10, 11, 12, 13]).astype(np.int32).reshape(4,1)
+
+        # Run the inference session on the test inputs
+ result = sess.run(None, {"ts": inferencing_ts, "grain": inferencing_grain, 'c': inferencing_c})
+
+ expected_ts = np.array([[0], [5], [1], [2], [3], [6], [7], [8]]).astype(np.single).reshape(8, 1)
+ expected_grain = np.array([[1970], [1971], [1970], [1970], [1970], [1971], [1971], [1971]]).astype(np.single).reshape(8, 1)
+ expected_c = np.array([[10], [11], [10], [10], [12], [11], [11], [13]]).astype(np.single).reshape(8, 1)
+ expected_isrowimputed = np.array([[False], [False], [True], [True], [False], [True], [True], [False]]).astype(np.single).reshape(8, 1)
+
+ np.testing.assert_array_equal(expected_ts, result[0])
+ np.testing.assert_array_equal(expected_grain, result[1])
+ np.testing.assert_array_equal(expected_c, result[2])
+ np.testing.assert_array_equal(expected_isrowimputed, result[3])
+
+ def test_timeseriesimputer_onegrain_onegap_two_filtercolumn(self):
+
+ training_data = pd.DataFrame(data=dict(
+ ts=[1, 2, 3, 5],
+ grain=[1970, 1970, 1970, 1970],
+ c3=[10, 13, 15, 20],
+ c4=[19, 12, 16, 19]
+ )).astype({'ts': np.int64, 'grain': np.int32, 'c3': np.int32, 'c4': np.int32})
+
+ xf = TimeSeriesImputer(time_series_column='ts',
+ grain_columns=['grain'],
+ filter_columns=['c3', 'c4'],
+ impute_mode='ForwardFill',
+ filter_mode='Include')
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_ts = np.array([1, 2, 3, 5]).astype(np.int64).reshape(4,1)
+ inferencing_grain = np.array([1970, 1970, 1970, 1970]).astype(np.int32).reshape(4,1)
+ inferencing_c3 = np.array([10, 13, 15, 20]).astype(np.int32).reshape(4,1)
+ inferencing_c4 = np.array([19, 12, 16, 19]).astype(np.int32).reshape(4,1)
+
+        # Run the inference session on the test inputs
+ result = sess.run(None, {"ts": inferencing_ts, "grain": inferencing_grain, "c3": inferencing_c3, "c4": inferencing_c4})
+
+ expected_ts = np.array([[1], [2], [3], [4], [5]]).astype(np.single).reshape(5, 1)
+ expected_c3 = np.array([[10], [13], [15], [15], [20]]).astype(np.single).reshape(5, 1)
+ expected_c4 = np.array([[19], [12], [16], [16], [19]]).astype(np.single).reshape(5, 1)
+ expected_isrowimputed = np.array([[False], [False], [False], [True], [False]]).astype(np.single).reshape(5, 1)
+
+ np.testing.assert_array_equal(expected_ts, result[0])
+ np.testing.assert_array_equal(expected_c3, result[2])
+ np.testing.assert_array_equal(expected_c4, result[3])
+ np.testing.assert_array_equal(expected_isrowimputed, result[4])
+
+ def test_rolling_window_simple_mean_wo_dft(self):
+
+ training_data = pd.DataFrame(data=dict(
+ colA=[1.0, 2.0, 3.0, 4.0],
+ grainA=["one", "one", "one", "one"]
+ ))
+
+ xf = RollingWindow(columns={'colA1': 'colA'},
+ grain_columns=['grainA'],
+ window_calculation='Mean',
+ max_window_size=2,
+ horizon=2)
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_grainA = np.array(["one", "one", "one", "one"]).reshape(4,1)
+ inferencing_colA = np.array([1.0, 2.0, 3.0, 4.0]).astype(np.double).reshape(4,1)
+
+        # Run the inference session on the test inputs
+ result = sess.run(None, {"grainA": inferencing_grainA, "colA": inferencing_colA })
+
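+        # RollingWindow emits a rank-3 tensor shaped (rows, output columns,
+        # horizon). Here each slot holds the mean of up to max_window_size
+        # values preceding that forecast origin; slots without enough history
+        # are NaN.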
+ expected = np.array([[[float("NaN"), float("NaN")]], [[float("NaN"), 1.0]], [[1.0, 1.5]], [[1.5, 2.5]]]).astype(np.single).reshape(4, 1, 2)
+
+ np.testing.assert_array_equal(expected, result[2])
+
+ def test_rolling_window_simple_max_wo_dft(self):
+
+ training_data = pd.DataFrame(data=dict(
+ colA=[1.0, 2.0, 3.0, 4.0],
+ grainA=["one", "one", "one", "one"]
+ ))
+
+ xf = RollingWindow(columns={'colA1': 'colA'},
+ grain_columns=['grainA'],
+ window_calculation='Max',
+ max_window_size=1,
+ horizon=1)
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_grainA = np.array(["one", "one", "one", "one"]).reshape(4,1)
+ inferencing_colA = np.array([1.0, 2.0, 3.0, 4.0]).astype(np.double).reshape(4,1)
+
+        # Run the inference session on the test inputs
+ result = sess.run(None, {"grainA": inferencing_grainA, "colA": inferencing_colA})
+
+ expected = np.array([[[float("NaN")]], [[1.0]], [[2.0]], [[3.0]]]).astype(np.single).reshape(4, 1, 1)
+
+ np.testing.assert_array_equal(expected, result[2])
+
+ def test_rolling_window_simple_min_wo_dft(self):
+
+ training_data = pd.DataFrame(data=dict(
+ colA=[1.0, 2.0, 3.0, 4.0],
+ grainA=["one", "one", "one", "one"]
+ ))
+
+ xf = RollingWindow(columns={'colA1': 'colA'},
+ grain_columns=['grainA'],
+ window_calculation='Min',
+ max_window_size=1,
+ horizon=1)
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_grainA = np.array(["one", "one", "one", "one"]).reshape(4,1)
+ inferencing_colA = np.array([1.0, 2.0, 3.0, 4.0]).astype(np.double).reshape(4,1)
+
+        # Run the inference session on the test inputs
+ result = sess.run(None, {"grainA": inferencing_grainA, "colA": inferencing_colA })
+
+ expected = np.array([[[float("NaN")]], [[1.0]], [[2.0]], [[3.0]]]).astype(np.single).reshape(4, 1, 1)
+
+ np.testing.assert_array_equal(expected, result[2])
+
+ def test_rolling_window_multiple_grains_wo_dft(self):
+
+ training_data = pd.DataFrame(data=dict(
+ colA=[1.0, 2.0, 1.0, 2.0],
+ grainA=["one", "one", "two", "two"]
+ ))
+
+ xf = RollingWindow(columns={'colA1': 'colA'},
+ grain_columns=['grainA'],
+ window_calculation='Mean',
+ max_window_size=1,
+ horizon=1)
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_grainA = np.array(["one", "one", "two", "two"]).reshape(4,1)
+ inferencing_colA = np.array([1.0, 2.0, 1.0, 2.0]).astype(np.double).reshape(4,1)
+
+        # Run the inference session on the test inputs
+ result = sess.run(None, {"grainA": inferencing_grainA, "colA": inferencing_colA})
+
+ expected = np.array([[[float("NaN")]], [[1.0]], [[float("NaN")]], [[1.0]]]).astype(np.single).reshape(4, 1, 1)
+
+ np.testing.assert_array_equal(expected, result[2])
+
+ def test_rolling_window_non_string_grain_wo_dft(self):
+
+ training_data = pd.DataFrame(data=dict(
+ colA=[1.0, 2.0, 3.0, 4.0],
+ grainA=[True, True, True, True]
+ ))
+
+ xf = RollingWindow(columns={'colA1': 'colA'},
+ grain_columns=['grainA'],
+ window_calculation='Mean',
+ max_window_size=1,
+ horizon=1)
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_grainA = np.array([True, True, True, True]).reshape(4,1)
+ inferencing_colA = np.array([1.0, 2.0, 3.0, 4.0]).astype(np.double).reshape(4,1)
+
+        # Run the inference session on the test inputs
+ result = sess.run(None, {"grainA": inferencing_grainA, "colA": inferencing_colA})
+
+ expected = np.array([[[float("NaN")]], [[1.0]], [[2.0]], [[3.0]]]).astype(np.single).reshape(4, 1, 1)
+
+ np.testing.assert_array_equal(expected, result[2])
+
+ def test_laglead_lag_wo_dft(self):
+
+ training_data = pd.DataFrame(data=dict(
+ colA=[1.0, 2.0, 3.0, 4.0],
+ grainA=["one", "one", "one", "one"]
+ ))
+
+ xf = LagLeadOperator(columns={'colA1': 'colA'},
+ grain_columns=['grainA'],
+ offsets=[-2, -1],
+ horizon=2)
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_grainA = np.array(["one", "one", "one", "one"]).reshape(4,1)
+ inferencing_colA = np.array([1, 2, 3, 4]).astype(np.double).reshape(4,1)
+
+        # Run the inference session on the test inputs
+ result = sess.run(None, {"grainA": inferencing_grainA, "colA": inferencing_colA})
+
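+        # LagLeadOperator emits shape (rows, len(offsets), horizon): one vector
+        # per offset, evaluated at each of the `horizon` forecast origins, with
+        # NaN wherever the shifted index falls before the start of the series.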
+ expected = np.array([[[float("NaN"), float("NaN")], [float("NaN"), float("NaN")]],
+ [[float("NaN"), float("NaN")], [float("NaN"), 1.0]],
+ [[float("NaN"), 1.0], [1.0, 2.0]],
+ [[1.0, 2.0], [2.0, 3.0]]]).astype(np.single).reshape(4, 2, 2)
+
+ np.testing.assert_array_equal(expected, result[2])
+
+ def test_laglead_lead_wo_dft(self):
+
+ training_data = pd.DataFrame(data=dict(
+ colA=[1.0, 2.0, 3.0, 4.0],
+ grainA=["one", "one", "one", "one"]
+ ))
+
+ xf = LagLeadOperator(columns={'colA1': 'colA'},
+ grain_columns=['grainA'],
+ offsets=[1, 2],
+ horizon=2)
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_grainA = np.array(["one", "one", "one", "one"]).reshape(4,1)
+ inferencing_colA = np.array([1, 2, 3, 4]).astype(np.double).reshape(4,1)
+
+        # Run the inference session on the test inputs
+ result = sess.run(None, {"grainA": inferencing_grainA, "colA": inferencing_colA})
+
+ expected = np.array([[[1.0, 2.0], [2.0, 3.0]],
+ [[2.0, 3.0], [3.0, 4.0]],
+ [[3.0, 4.0], [4.0, float("NaN")]],
+ [[4.0, float("NaN")], [float("NaN"), float("NaN")]]]).astype(np.single).reshape(4, 2, 2)
+
+ np.testing.assert_array_equal(expected, result[2])
+
+ def test_laglead_complex_wo_dft(self):
+
+ training_data = pd.DataFrame(data=dict(
+ colA=[1.0, 2.0, 3.0, 4.0],
+ grainA=["one", "one", "one", "one"]
+ ))
+
+ xf = LagLeadOperator(columns={'colA1': 'colA'},
+ grain_columns=['grainA'],
+ offsets=[-2, -1, 1, 2],
+ horizon=2)
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_grainA = np.array(["one", "one", "one", "one"]).reshape(4,1)
+ inferencing_colA = np.array([1, 2, 3, 4]).astype(np.double).reshape(4,1)
+
+        # Run the inference session on the test inputs
+ result = sess.run(None, {"grainA": inferencing_grainA, "colA": inferencing_colA})
+
+ expected = np.array([[[float("NaN"), float("NaN")], [float("NaN"), float("NaN")], [1.0, 2.0], [2.0, 3.0]],
+ [[float("NaN"), float("NaN")], [float("NaN"), 1.0], [2.0, 3.0], [3.0, 4.0]],
+ [[float("NaN"), 1.0], [1.0, 2.0], [3.0, 4.0], [4.0, float("NaN")]],
+ [[1.0, 2.0], [2.0, 3.0], [4.0, float("NaN")], [float("NaN"), float("NaN")]]]).astype(np.single).reshape(4, 4, 2)
+
+ np.testing.assert_array_equal(expected, result[2])
+
+ def test_short_drop_wo_dft(self):
+
+ training_data = pd.DataFrame(data=dict(
+ colA=[1.0, 2.0, 3.0, 4.0],
+ grainA=["one", "one", "one", "two"]
+ ))
+
+ xf = ShortDrop(columns={'colA1': 'colA'},
+ grain_columns=['grainA'],
+ min_rows=2)
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_grainA = np.array(["one", "one", "one", "two"]).reshape(4,1)
+ inferencing_colA = np.array([1, 2, 3, 4]).astype(np.double).reshape(4,1)
+
+        # Run the inference session on the test inputs
+ result = sess.run(None, {"grainA": inferencing_grainA, "colA": inferencing_colA})
+
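+        # Grain "two" has a single row, which is below min_rows=2, so only the
+        # three "one" rows survive.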
+ colA_expected = np.array([[1.0], [2.0], [3.0]]).reshape(3, 1)
+ grainA_expected = np.array([["one"], ["one"], ["one"]]).reshape(3, 1)
+ np.testing.assert_array_equal(colA_expected, result[0])
+ np.testing.assert_array_equal(grainA_expected, result[1])
+
+ def test_integration_rollwin_pivot(self):
+
+ training_data = pd.DataFrame(data=dict(
+ colA=[1.0, 3.0, 5.0, 7.0],
+ grainA=['1970', '1970', '1970', '1970']
+ ))
+
+ xf0 = RollingWindow(columns={'colA1': 'colA'},
+ grain_columns=['grainA'],
+ window_calculation='Mean',
+ max_window_size=2,
+ horizon=2)
+
+ xf1 = ForecastingPivot(columns_to_pivot=['colA1'])
+
+ xf = Pipeline([xf0, xf1])
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_colA = np.array([1.0, 3.0, 5.0, 7.0]).reshape(4,1)
+ inferencing_grainA = np.array(['1970', '1970', '1970', '1970']).reshape(4,1)
+
+        # Run the inference session on the test inputs
+ result = sess.run(None, {"colA": inferencing_colA, "grainA": inferencing_grainA})
+
+ expected_colA = np.array([[3.0], [5.0], [5.0], [7.0], [7.0]])
+ expected_grainA = np.array([['1970'], ['1970'], ['1970'], ['1970'], ['1970']])
+ expected_output = np.array([[1.0], [1.0], [2.0], [2.0], [4.0]])
+ np.testing.assert_array_equal(expected_colA, result[0])
+ np.testing.assert_array_equal(expected_grainA, result[1])
+ np.testing.assert_array_equal(expected_output, result[3])
+
+ def test_integration_laglead_pivot(self):
+
+ training_data = pd.DataFrame(data=dict(
+ colA=[1.0, 2.0, 3.0, 4.0],
+ grainA=["one", "one", "one", "one"]
+ ))
+
+ xf0 = LagLeadOperator(columns={'colA1': 'colA'},
+ grain_columns=['grainA'],
+ offsets=[-2, -1],
+ horizon=2)
+
+ xf1 = ForecastingPivot(columns_to_pivot=['colA1'])
+
+ xf = Pipeline([xf0, xf1])
+
+ sess = set_up_onnx_model(xf, training_data)
+
+ inferencing_grainA = np.array(["one", "one", "one", "one"]).reshape(4,1)
+ inferencing_colA = np.array([1, 2, 3, 4]).astype(np.double).reshape(4,1)
+
+        # Run the inference session on the test inputs
+ result = sess.run(None, {"colA": inferencing_colA, "grainA": inferencing_grainA})
+
+ expected_colA = np.array([[3.0], [4.0], [4.0]])
+ expected_grainA = np.array([['one'], ['one'], ['one']])
+ expected_output_lag2 = np.array([[1.0], [1.0], [2.0]])
+ expected_output_lag1 = np.array([[2.0], [2.0], [3.0]])
+ np.testing.assert_array_equal(expected_colA, result[0])
+ np.testing.assert_array_equal(expected_grainA, result[1])
+ np.testing.assert_array_equal(expected_output_lag2, result[3])
+ np.testing.assert_array_equal(expected_output_lag1, result[4])
+
+ def test_pivot_one_matrix(self):
+
+ training_data = pd.DataFrame(data=dict(
+ colA=[1.0],
+ grainA=["one"]
+ ))
+
+ xf0 = LagLeadOperator(columns={'colA1': 'colA'},
+ grain_columns=['grainA'],
+ offsets=[-2, -1, 1],
+ horizon=4)
+ binarydata = xf0.fit_transform(training_data, as_binary_data_stream=True)
+
+ xf1 = ForecastingPivot(columns_to_pivot=['colA1'])
+
+ sess = set_up_onnx_model(xf1, binarydata)
+
+ inferencing_grainA = np.array(["one"]).reshape(1,1)
+ inferencing_colA = np.array([1]).astype(np.double).reshape(1,1)
+ inferencing_colA1 = np.array([1, 4, 6, float("NaN"), 2, 5, float("NaN"), float("NaN"), 3, float("NaN"), float("NaN"), 7]).astype(np.double).reshape(1,3,4)
+
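+        # colA1 holds one 3x4 matrix (three lag/lead vectors x horizon 4). The
+        # pivot keeps only the horizon slots where every vector is non-NaN;
+        # here that is the first slot, values (1, 2, 3), reported with
+        # horizon value 4.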
+ result = sess.run(None, {"grainA": inferencing_grainA,"colA": inferencing_colA, "colA1": inferencing_colA1})
+
+ expectedHorizon = np.array([4]).astype(np.uint32).reshape(1, 1)
+ expectedColA = np.array([1]).astype(np.double).reshape(1, 1)
+ expectedLag1 = np.array([1]).astype(np.double).reshape(1, 1)
+ expectedLead1 = np.array([2]).astype(np.double).reshape(1, 1)
+ expectedLag2 = np.array([3]).astype(np.double).reshape(1, 1)
+
+ np.testing.assert_array_equal(result[0], expectedColA)
+ np.testing.assert_array_equal(result[2], expectedHorizon)
+ np.testing.assert_array_equal(result[3], expectedLag1)
+ np.testing.assert_array_equal(result[4], expectedLead1)
+ np.testing.assert_array_equal(result[5], expectedLag2)
+
+ def test_pivot_two_matrix(self):
+
+ training_data = pd.DataFrame(data=dict(
+ colA=[1.0, 2.0],
+ grainA=["one", "one"],
+ colB=[1.0, 3.0],
+ grainB=["one", "one"]
+ ))
+
+ xf0 = LagLeadOperator(columns={'colA1': 'colA'},
+ grain_columns=['grainA'],
+ offsets=[-2, -1, 1],
+ horizon=4)
+
+ xf1 = LagLeadOperator(columns={'colB1': 'colB'},
+ grain_columns=['grainB'],
+ offsets=[-2, -1],
+ horizon=4)
+ xf2 = Pipeline([xf0, xf1])
+ binarydata = xf2.fit_transform(training_data, as_binary_data_stream=True)
+
+ xf3 = ForecastingPivot(columns_to_pivot=['colA1', 'colB1'])
+
+ sess = set_up_onnx_model(xf3, binarydata)
+
+ inferencing_colA = np.array([1,2]).astype(np.double).reshape(2,1)
+ inferencing_colB = np.array([1,2]).astype(np.double).reshape(2,1)
+ inferencing_grainA = np.array(["one", "one"]).reshape(2,1)
+ inferencing_grainB = np.array(["one", "one"]).reshape(2,1)
+ inferencing_colA1 = np.array([1, 6, 3, 9, 2, 4, 5, 8, float("NaN"), float("NaN"), 7, 10,
+ 1, 6, 9, 3, 2, 4, 8, 5, float("NaN"), float("NaN"), 10, 7]).astype(np.double).reshape(2,3,4)
+ inferencing_colB1 = np.array([1, float("NaN"), 5, 6, 2, float("NaN"), 3, 4,
+ 1, float("NaN"), 6, 5, 2, float("NaN"), 4, 3]).astype(np.double).reshape(2,2,4)
+
+ result = sess.run(None, {"colA": inferencing_colA, "colB": inferencing_colB, "grainA": inferencing_grainA,
+ "grainB": inferencing_grainB, "colA1": inferencing_colA1, "colB1": inferencing_colB1})
+
+ expectedColA = np.array([1, 1, 2, 2]).astype(np.double).reshape(4, 1)
+ expectedColB = np.array([1, 1, 2, 2]).astype(np.double).reshape(4, 1)
+ expectedHorizon = np.array([2, 1, 2, 1]).astype(np.uint32).reshape(4, 1)
+ expectedLag1 = np.array([3, 9, 9, 3]).astype(np.double).reshape(4, 1)
+ expectedLead1 = np.array([5, 8, 8, 5]).astype(np.double).reshape(4, 1)
+ expectedLag2 = np.array([7, 10, 10, 7]).astype(np.double).reshape(4, 1)
+ expectedVec2Lag2 = np.array([5, 6, 6, 5]).astype(np.double).reshape(4, 1)
+ expectedVec2Lag5 = np.array([3, 4, 4, 3]).astype(np.double).reshape(4, 1)
+
+ np.testing.assert_array_equal(result[0], expectedColA)
+ np.testing.assert_array_equal(result[2], expectedColB)
+ np.testing.assert_array_equal(result[4], expectedHorizon)
+ np.testing.assert_array_equal(result[5], expectedLag1)
+ np.testing.assert_array_equal(result[6], expectedLead1)
+ np.testing.assert_array_equal(result[7], expectedLag2)
+ np.testing.assert_array_equal(result[8], expectedVec2Lag2)
+ np.testing.assert_array_equal(result[9], expectedVec2Lag5)
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/python/tests_extended/test_tensor_invalid_input.py b/src/python/tests_extended/test_tensor_invalid_input.py
new file mode 100644
index 00000000..d015b107
--- /dev/null
+++ b/src/python/tests_extended/test_tensor_invalid_input.py
@@ -0,0 +1,408 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+
+import os
+import sys
+import io
+import platform
+import tempfile
+import unittest
+
+import numpy as np
+import pandas as pd
+from nimbusml import Pipeline
+from nimbusml.preprocessing.schema import ColumnSelector
+from nimbusml.preprocessing import ToString, ToKeyImputer, DateTimeSplitter
+from nimbusml.timeseries import TimeSeriesImputer, LagLeadOperator, RollingWindow, ForecastingPivot, ShortDrop
+import onnxruntime as rt
+
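+# Each test case pairs an estimator (INSTANCES_FOR_INVALID_INPUT) and its
+# training frame (TRAINING_DATASETS_FOR_INVALID_INPUT) with a deliberately
+# malformed inference feed (INFERENCE_DATASETS_FOR_INVALID_INPUT) that
+# onnxruntime is expected to reject.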
+TEST_CASES_FOR_INVALID_INPUT = {
+ 'DateTimeSplitter_Bad_Input_Data',
+ 'DateTimeSplitter_Bad_Input_Type',
+ 'DateTimeSplitter_Bad_Input_Shape',
+ 'ToKey_Bad_Input_Type',
+ 'ToKey_Bad_Input_Shape',
+ 'ToString_Bad_Input_Type',
+ 'ToString_Bad_Input_Shape',
+ 'TimeSeriesImputer_Bad_Input_Data',
+ 'TimeSeriesImputer_Bad_Input_Type',
+ 'TimeSeriesImputer_Bad_Input_Shape',
+ 'RollingWindow_Bad_Input_Type',
+ 'RollingWindow_Bad_Input_Shape',
+ 'LagLead_Bad_Input_Type',
+ 'LagLead_Bad_Input_Shape',
+ 'ShortDrop_Bad_Input_Type',
+ 'ShortDrop_Bad_Input_Shape',
+ 'ShortDrop_Drop_All'
+}
+
+INSTANCES_FOR_INVALID_INPUT = {
+ 'DateTimeSplitter_Bad_Input_Data': Pipeline([
+ DateTimeSplitter(prefix='dt') << 'tokens1',
+ ColumnSelector(drop_columns=[
+ 'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter',
+ 'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear',
+ 'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel',
+ 'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff','dtHolidayName'
+ ])
+ ]),
+ 'DateTimeSplitter_Bad_Input_Type': Pipeline([
+ DateTimeSplitter(prefix='dt') << 'tokens1',
+ ColumnSelector(drop_columns=[
+ 'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter',
+ 'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear',
+ 'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel',
+ 'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff','dtHolidayName'
+ ])
+ ]),
+ 'DateTimeSplitter_Bad_Input_Shape': Pipeline([
+ DateTimeSplitter(prefix='dt') << 'tokens1',
+ ColumnSelector(drop_columns=[
+ 'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter',
+ 'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear',
+ 'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel',
+ 'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff','dtHolidayName'
+ ])
+ ]),
+ 'ToKey_Bad_Input_Type': ToKeyImputer(),
+ 'ToKey_Bad_Input_Shape': ToKeyImputer(),
+ 'ToString_Bad_Input_Type': ToString(columns={'f0.out': 'f0',
+ 'f1.out': 'f1',
+ 'f2.out': 'f2'}),
+ 'ToString_Bad_Input_Shape': ToString(columns={'f0.out': 'f0',
+ 'f1.out': 'f1',
+ 'f2.out': 'f2'}),
+ 'TimeSeriesImputer_Bad_Input_Data': TimeSeriesImputer(time_series_column='ts',
+ filter_columns=['c'],
+ grain_columns=['grain'],
+ impute_mode='ForwardFill',
+ filter_mode='Include'),
+ 'TimeSeriesImputer_Bad_Input_Type': TimeSeriesImputer(time_series_column='ts',
+ filter_columns=['c'],
+ grain_columns=['grain'],
+ impute_mode='ForwardFill',
+ filter_mode='Include'),
+ 'TimeSeriesImputer_Bad_Input_Shape': TimeSeriesImputer(time_series_column='ts',
+ filter_columns=['c'],
+ grain_columns=['grain'],
+ impute_mode='ForwardFill',
+ filter_mode='Include'),
+ 'RollingWindow_Bad_Input_Type': RollingWindow(columns={'colA1': 'colA'},
+ grain_columns=['grainA'],
+ window_calculation='Mean',
+ max_window_size=2,
+ horizon=2),
+ 'RollingWindow_Bad_Input_Shape': RollingWindow(columns={'colA1': 'colA'},
+ grain_columns=['grainA'],
+ window_calculation='Mean',
+ max_window_size=2,
+ horizon=2),
+ 'LagLead_Bad_Input_Type': LagLeadOperator(columns={'colA1': 'colA'},
+ grain_columns=['grainA'],
+ offsets=[-1],
+ horizon=1),
+ 'LagLead_Bad_Input_Shape': LagLeadOperator(columns={'colA1': 'colA'},
+ grain_columns=['grainA'],
+ offsets=[-1],
+ horizon=1),
+ 'ShortDrop_Bad_Input_Type': ShortDrop(columns={'colA1': 'colA'},
+ grain_columns=['grainA'],
+ min_rows=2),
+ 'ShortDrop_Bad_Input_Shape': ShortDrop(columns={'colA1': 'colA'},
+ grain_columns=['grainA'],
+ min_rows=2),
+ 'ShortDrop_Drop_All': ShortDrop(columns={'colA1': 'colA'},
+ grain_columns=['grainA'],
+ min_rows=15)
+}
+
+TRAINING_DATASETS_FOR_INVALID_INPUT = {
+ 'DateTimeSplitter_Bad_Input_Data': pd.DataFrame(data=dict(
+ tokens1=[1, 2, 3, 157161600]
+ )),
+ 'DateTimeSplitter_Bad_Input_Type': pd.DataFrame(data=dict(
+ tokens1=[1, 2, 3, 157161600]
+ )),
+ 'DateTimeSplitter_Bad_Input_Shape': pd.DataFrame(data=dict(
+ tokens1=[1, 2, 3, 157161600]
+ )),
+ 'ToKey_Bad_Input_Type': pd.DataFrame(data=dict(
+ target=[1.0, 1.0, 1.0, 2.0]
+ )).astype({'target': np.float64}),
+ 'ToKey_Bad_Input_Shape': pd.DataFrame(data=dict(
+ target=[1.0, 1.0, 1.0, 2.0]
+ )).astype({'target': np.float64}),
+ 'ToString_Bad_Input_Type': pd.DataFrame(data=dict(
+        f0=[4, 4, -1, 9],
+        f1=[5, 5, 3.1, -0.23],
+        f2=[6, 6.7, np.nan, np.nan]
+ )),
+ 'ToString_Bad_Input_Shape': pd.DataFrame(data=dict(
+        f0=[4, 4, -1, 9],
+        f1=[5, 5, 3.1, -0.23],
+        f2=[6, 6.7, np.nan, np.nan]
+ )),
+ 'TimeSeriesImputer_Bad_Input_Data': pd.DataFrame(data=dict(
+ ts=[1, 2, 3, 5, 7],
+ grain=[1970, 1970, 1970, 1970, 1970],
+ c=[10, 11, 12, 13, 14]
+ )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.int32}),
+ 'TimeSeriesImputer_Bad_Input_Type': pd.DataFrame(data=dict(
+ ts=[1, 2, 3, 5, 7],
+ grain=[1970, 1970, 1970, 1970, 1970],
+ c=[10, 11, 12, 13, 14]
+ )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.int32}),
+ 'TimeSeriesImputer_Bad_Input_Shape': pd.DataFrame(data=dict(
+ ts=[1, 2, 3, 5, 7],
+ grain=[1970, 1970, 1970, 1970, 1970],
+ c=[10, 11, 12, 13, 14]
+ )).astype({'ts': np.int64, 'grain': np.int32, 'c': np.int32}),
+ 'RollingWindow_Bad_Input_Type': pd.DataFrame(data=dict(
+ colA=[1.0, 2.0, 3.0, 4.0],
+ grainA=["one", "one", "one", "one"]
+ )),
+ 'RollingWindow_Bad_Input_Shape': pd.DataFrame(data=dict(
+ colA=[1.0, 2.0, 3.0, 4.0],
+ grainA=["one", "one", "one", "one"]
+ )),
+    'LagLead_Bad_Input_Type': pd.DataFrame(data=dict(
+ colA=[1.0, 2.0, 3.0, 4.0],
+ grainA=["one", "one", "one", "one"]
+ )),
+ 'LagLead_Bad_Input_Shape': pd.DataFrame(data=dict(
+ colA=[1.0, 2.0, 3.0, 4.0],
+ grainA=["one", "one", "one", "one"]
+ )),
+ 'ShortDrop_Bad_Input_Type': pd.DataFrame(data=dict(
+ colA=[1.0, 2.0, 3.0, 4.0],
+ grainA=["one", "one", "one", "two"]
+ )),
+ 'ShortDrop_Bad_Input_Shape': pd.DataFrame(data=dict(
+ colA=[1.0, 2.0, 3.0, 4.0],
+ grainA=["one", "one", "one", "two"]
+ )),
+ 'ShortDrop_Drop_All': pd.DataFrame(data=dict(
+ colA=[1.0, 2.0, 3.0, 4.0],
+ grainA=["one", "one", "one", "two"]
+ ))
+}
+
+INFERENCE_DATASETS_FOR_INVALID_INPUT = {
+ 'DateTimeSplitter_Bad_Input_Data': {"tokens1": np.array([1, 2, 3, -3]).reshape(4,1)},
+ 'DateTimeSplitter_Bad_Input_Type': {"tokens1": np.array([1, 2, 3, "3"]).reshape(4,1)},
+ 'DateTimeSplitter_Bad_Input_Shape': {"tokens1": np.array([[1, 2, 3, 3], [1, 2, 3, 4]]).reshape(4,2)},
+ 'ToKey_Bad_Input_Type': {"target": np.array([1, float("NaN"), "", 7, float("NaN")]).reshape(5,1)},
+ 'ToKey_Bad_Input_Shape': {"target": np.array([[1, float("NaN")], [1, float("NaN")]]).reshape(2,2)},
+ 'ToString_Bad_Input_Type': {"f0": np.array([4, 4, -1, 9]).astype(np.int32).reshape(4,1),
+ "f1": np.array([5, 5, 3.1, -0.23]).astype(np.float32).reshape(4,1),
+ "f2": np.array([6, "6.7", float("NaN"), float("NaN")]).reshape(4,1)},
+ 'ToString_Bad_Input_Shape': {"f0": np.array([[4, 4, -1, 9],[4, 4, -1, 9]]).astype(np.int32).reshape(4,2),
+ "f1": np.array([5, 5, 3.1, -0.23]).astype(np.float32).reshape(4,1),
+ "f2": np.array([6, 6.7, float("NaN"), float("NaN")]).reshape(4,1)},
+ 'TimeSeriesImputer_Bad_Input_Data': {"ts": np.array([1, 2, 3, -5, 7]).astype(np.int64).reshape(5,1),
+ "grain": np.array([1970, 1970, 1970, 1970, 1970]).reshape(5,1),
+ "c": np.array([10, 11, 12, 13, 14]).astype(np.int32).reshape(5,1)},
+ 'TimeSeriesImputer_Bad_Input_Type': {"ts": np.array([1, 2, 3, 5, 7]).astype(np.int64).reshape(5,1),
+ "grain": np.array([1970, "1970", 1970, 1970, 1970]).reshape(5,1),
+ "c": np.array([10, 11, 12, 13, 14]).astype(np.int32).reshape(5,1)},
+ 'TimeSeriesImputer_Bad_Input_Shape': {"ts": np.array([[1, 2, 3, 5, 7], [1, 2, 3, 5, 7]]).astype(np.int64).reshape(5,2),
+ "grain": np.array([1970, 1970, 1970, 1970, 1970]).reshape(5,1),
+ "c": np.array([10, 11, 12, 13, 14]).astype(np.int32).reshape(5,1)},
+ 'RollingWindow_Bad_Input_Type': {"grainA": np.array(["one", "one", "one", "one"]).reshape(4,1),
+ "colA": np.array([1.0, 2.0, 3.0, "4.0"]).reshape(4,1)},
+ 'RollingWindow_Bad_Input_Shape': {"grainA": np.array(["one", "one", "one", "one"]).reshape(4,1),
+ "colA": np.array([[1.0, 2.0, 3.0, 4.0],[1.0, 2.0, 3.0, 4.0]]).reshape(4,2)},
+ 'LagLead_Bad_Input_Type': {"grainA": np.array(["one", "one", "one", "one"]).reshape(4,1),
+ "colA": np.array([1, 2, 3, "4"]).reshape(4,1)},
+ 'LagLead_Bad_Input_Shape': {"grainA": np.array(["one", "one", "one", "one"]).reshape(4,1),
+ "colA": np.array([[1.0, 2.0, 3.0, 4.0],[1.0, 2.0, 3.0, 4.0]]).reshape(4,2)},
+ 'ShortDrop_Bad_Input_Type': {"grainA": np.array(["one", "one", "one", "two"]).reshape(4,1),
+ "colA": np.array([1, 2, 3, "4"]).reshape(4,1)},
+ 'ShortDrop_Bad_Input_Shape': {"grainA": np.array(["one", "one", "one", "two"]).reshape(4,1),
+ "colA": np.array([[1.0, 2.0, 3.0, 4.0],[1.0, 2.0, 3.0, 4.0]]).reshape(4,2)},
+ 'ShortDrop_Drop_All': {"grainA": np.array(["one", "one", "one", "two"]).reshape(4,1),
+ "colA": np.array([[1.0, 2.0, 3.0, 4.0],[1.0, 2.0, 3.0, 4.0]]).reshape(4,2)}
+}
+
+def get_file_size(file_path):
+ file_size = 0
+ try:
+ file_size = os.path.getsize(file_path)
+ except:
+ pass
+ return file_size
+
+def get_tmp_file(suffix=None):
+ fd, file_name = tempfile.mkstemp(suffix=suffix)
+ fl = os.fdopen(fd, 'w')
+ fl.close()
+ return file_name
+
+class CaptureOutputContext():
+ """
+ Context which can be used for
+ capturing stdout and stderr.
+ """
+ def __enter__(self):
+ self.orig_stdout = sys.stdout
+ self.orig_stderr = sys.stderr
+ self.stdout_capturer = io.StringIO()
+ self.stderr_capturer = io.StringIO()
+ sys.stdout = self.stdout_capturer
+ sys.stderr = self.stderr_capturer
+ return self
+
+ def __exit__(self, *args):
+ sys.stdout = self.orig_stdout
+ sys.stderr = self.orig_stderr
+ self.stdout = self.stdout_capturer.getvalue()
+ self.stderr = self.stderr_capturer.getvalue()
+
+ if self.stdout:
+ print(self.stdout)
+
+ if self.stderr:
+ print(self.stderr)
+
+ # free up some memory
+ del self.stdout_capturer
+ del self.stderr_capturer
+
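+# Exercise one bad-input case end to end: fit and export the estimator, then
+# confirm that onnxruntime raises for the matching invalid inference feed.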
+def validate_bad_input(self, estimator, test_case):
+ onnx_path = get_tmp_file('.onnx')
+ onnx_json_path = get_tmp_file('.onnx.json')
+
+ exported = False
+ throw_expected_error = False
+ try:
+ dataset = TRAINING_DATASETS_FOR_INVALID_INPUT.get(test_case)
+
+ estimator.fit(dataset)
+
+ with CaptureOutputContext() as output:
+ estimator.export_to_onnx(onnx_path,
+ 'com.microsoft.ml',
+ dst_json=onnx_json_path,
+ onnx_version='Stable')
+ except Exception as e:
+ print(e)
+
+ onnx_file_size = get_file_size(onnx_path)
+ onnx_json_file_size = get_file_size(onnx_json_path)
+
+    if (output and
+        (onnx_file_size != 0) and
+        (onnx_json_file_size != 0) and
+        ('cannot save itself as ONNX' not in output.stdout) and
+        ('Warning: We do not know how to save the predictor as ONNX' not in output.stdout)):
+
+ exported = True
+
+ sess = rt.InferenceSession(onnx_path)
+
+ with self.assertRaisesRegex(Exception, "ONNXRuntimeError"):
+ invalid_data = INFERENCE_DATASETS_FOR_INVALID_INPUT.get(test_case)
+ pred = sess.run(None, invalid_data)
+
+ throw_expected_error = True
+
+ os.remove(onnx_path)
+ os.remove(onnx_json_path)
+ return {'exported': exported, 'throw_expected_error': throw_expected_error}
+
+class TestOnnxExport(unittest.TestCase):
+    # This is a static method of the class because pytest fixture
+    # issues arose when the method lived in the global scope.
+ @staticmethod
+ def generate_test_method_for_bad(test_case):
+ def method(self):
+ estimator = INSTANCES_FOR_INVALID_INPUT[test_case]
+
+ result = validate_bad_input(self, estimator, test_case)
+ assert result['exported']
+ assert result['throw_expected_error']
+
+ return method
+
+ def test_pivot_bad_input_type(self):
+
+ df = pd.DataFrame(data=dict(
+ colA=[1.0],
+ grainA=["one"]
+ ))
+
+ xf0 = LagLeadOperator(columns={'colA1': 'colA'},
+ grain_columns=['grainA'],
+ offsets=[-2, -1, 1],
+ horizon=4)
+ binarydata = xf0.fit_transform(df, as_binary_data_stream=True)
+
+ xf1 = ForecastingPivot(columns_to_pivot=['colA1'])
+
+ xf1.fit(binarydata)
+ onnx_path = get_tmp_file('.onnx')
+ onnx_json_path = get_tmp_file('.onnx.json')
+ xf1.export_to_onnx(onnx_path,
+ 'com.microsoft.ml',
+ dst_json=onnx_json_path,
+ onnx_version='Stable')
+ sess = rt.InferenceSession(onnx_path)
+
+ with self.assertRaisesRegex(Exception, "ONNXRuntimeError"):
+ grainA = np.array(["one"]).reshape(1,1)
+ colA = np.array([1]).astype(np.double).reshape(1,1)
+ colA1 = np.array([1, 4, "6", float("NaN"), 2, 5, float("NaN"), float("NaN"), 3, float("NaN"), float("NaN"), 7]).reshape(1,3,4)
+ pred = sess.run(None, {"grainA":grainA,"colA":colA, "colA1":colA1 })
+
+ def test_pivot_bad_shape(self):
+
+ df = pd.DataFrame(data=dict(
+ colA=[1.0],
+ grainA=["one"]
+ ))
+
+ xf0 = LagLeadOperator(columns={'colA1': 'colA'},
+ grain_columns=['grainA'],
+ offsets=[-2, -1, 1],
+ horizon=4)
+ binarydata = xf0.fit_transform(df, as_binary_data_stream=True)
+
+ xf1 = ForecastingPivot(columns_to_pivot=['colA1'])
+
+ xf1.fit(binarydata)
+ onnx_path = get_tmp_file('.onnx')
+ onnx_json_path = get_tmp_file('.onnx.json')
+ xf1.export_to_onnx(onnx_path,
+ 'com.microsoft.ml',
+ dst_json=onnx_json_path,
+ onnx_version='Stable')
+ sess = rt.InferenceSession(onnx_path)
+
+ with self.assertRaisesRegex(Exception, "ONNXRuntimeError"):
+ grainA = np.array(["one"]).reshape(1,1)
+ colA = np.array([1]).astype(np.double).reshape(1,1)
+ colA1 = np.array([1, 4, 6, float("NaN"), 2, 5, float("NaN"), float("NaN"), 3, float("NaN"), float("NaN"), 7]).reshape(1,2,6)
+ pred = sess.run(None, {"grainA":grainA,"colA":colA, "colA1":colA1 })
+
+
+for test_case_invalid_input in TEST_CASES_FOR_INVALID_INPUT:
+ test_name = 'test_%s' % test_case_invalid_input.replace('(', '_').replace(')', '').lower()
+
+    # The following test uses negative time points. On Windows it throws as
+    # expected; on macOS and Linux negative time points are valid input.
+    if test_name == 'test_datetimesplitter_bad_input_data' and platform.system() in ('Darwin', 'Linux'):
+ continue
+
+ method = TestOnnxExport.generate_test_method_for_bad(test_case_invalid_input)
+ setattr(TestOnnxExport, test_name, method)
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/python/tests_extended/test_timeseries_automl.py b/src/python/tests_extended/test_timeseries_automl.py
new file mode 100644
index 00000000..a6f6d73c
--- /dev/null
+++ b/src/python/tests_extended/test_timeseries_automl.py
@@ -0,0 +1,203 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+
+import os
+import platform
+import tempfile
+import unittest
+
+import numpy as np
+import pandas as pd
+from nimbusml import Pipeline
+from nimbusml.preprocessing import ToString, ToKeyImputer, DateTimeSplitter
+from nimbusml.preprocessing.schema import ColumnSelector
+from nimbusml.timeseries import TimeSeriesImputer, LagLeadOperator, RollingWindow, ForecastingPivot, ShortDrop
+
+def get_tmp_file(suffix=None):
+ fd, file_name = tempfile.mkstemp(suffix=suffix)
+ fl = os.fdopen(fd, 'w')
+ fl.close()
+ return file_name
+
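+# Export-only smoke tests: each transform is fit on a small frame and must
+# export to ONNX without raising.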
+class TestAutoMLTransforms(unittest.TestCase):
+
+ def test_tostring(self):
+ data={'f0': [4, 4, -1, 9],
+ 'f1': [5, 5, 3.1, -0.23],
+ 'f2': [6, 6.7, np.nan, np.nan]}
+ data = pd.DataFrame(data).astype({'f0': np.int32,
+ 'f1': np.float32,
+ 'f2': np.float64})
+
+ xf = ToString(columns={'f0.out': 'f0',
+ 'f1.out': 'f1',
+ 'f2.out': 'f2'})
+ xf.fit(data)
+
+ onnx_path = get_tmp_file('.onnx')
+ onnx_json_path = get_tmp_file('.onnx.json')
+ xf.export_to_onnx(onnx_path,
+ 'com.microsoft.ml',
+ dst_json=onnx_json_path,
+ onnx_version='Stable')
+
+ def test_tokeyimputer(self):
+ text_df = pd.DataFrame(
+ data=dict(
+ text=[
+ "cat",
+ "dog",
+ "fish",
+ "orange",
+ "cat orange",
+ "dog",
+ "fish",
+ None,
+ "spider"]))
+
+ xf = ToKeyImputer() << 'text'
+ xf.fit(text_df)
+
+ onnx_path = get_tmp_file('.onnx')
+ onnx_json_path = get_tmp_file('.onnx.json')
+ xf.export_to_onnx(onnx_path,
+ 'com.microsoft.ml',
+ dst_json=onnx_json_path,
+ onnx_version='Stable')
+
+ def test_datetimesplitter(self):
+ df = pd.DataFrame(data=dict(
+ tokens1=[1, 2, 3, 157161600],
+ tokens2=[10, 11, 12, 13]
+ ))
+
+ cols_to_drop = [
+ 'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter',
+ 'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear',
+ 'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel',
+ 'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff'
+ ]
+
+ dts = DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1'
+ xf = Pipeline([dts, ColumnSelector(drop_columns=cols_to_drop)])
+ xf.fit(df)
+
+ onnx_path = get_tmp_file('.onnx')
+ onnx_json_path = get_tmp_file('.onnx.json')
+ xf.export_to_onnx(onnx_path,
+ 'com.microsoft.ml',
+ dst_json=onnx_json_path,
+ onnx_version='Stable')
+
+ def test_timeseriesimputer(self):
+
+ df = pd.DataFrame(data=dict(
+ ts=[1, 2, 3, 5],
+ grain=[1970, 1970, 1970, 1970],
+ c3=[10, 13, 15, 20],
+ c4=[19, 12, 16, 19]
+ ))
+
+ xf = TimeSeriesImputer(time_series_column='ts',
+ grain_columns=['grain'],
+ filter_columns=['c3', 'c4'],
+ impute_mode='ForwardFill',
+ filter_mode='Include')
+ xf.fit(df)
+
+ onnx_path = get_tmp_file('.onnx')
+ onnx_json_path = get_tmp_file('.onnx.json')
+ xf.export_to_onnx(onnx_path,
+ 'com.microsoft.ml',
+ dst_json=onnx_json_path,
+ onnx_version='Stable')
+
+ def test_shortdrop(self):
+
+ df = pd.DataFrame(data=dict(
+ ts=[1.0, 3.0, 5.0, 7.0],
+ grain=['1970', '1970', '1970', '1970'],
+ ))
+
+ xf = ShortDrop(grain_columns=['grain'], min_rows=4) << 'ts'
+ xf.fit(df)
+
+ onnx_path = get_tmp_file('.onnx')
+ onnx_json_path = get_tmp_file('.onnx.json')
+ xf.export_to_onnx(onnx_path,
+ 'com.microsoft.ml',
+ dst_json=onnx_json_path,
+ onnx_version='Stable')
+
+ def test_rolling_window(self):
+
+ df = pd.DataFrame(data=dict(
+ ts=[1.0, 3.0, 5.0, 7.0],
+ grain=['1970', '1970', '1970', '1970'],
+ ))
+
+ xf = RollingWindow(columns={'ts_r': 'ts'},
+ grain_columns=['grain'],
+ window_calculation='Mean',
+ max_window_size=1,
+ horizon=2)
+ xf.fit(df)
+
+ onnx_path = get_tmp_file('.onnx')
+ onnx_json_path = get_tmp_file('.onnx.json')
+ xf.export_to_onnx(onnx_path,
+ 'com.microsoft.ml',
+ dst_json=onnx_json_path,
+ onnx_version='Stable')
+
+ def test_pivot(self):
+
+ df = pd.DataFrame(data=dict(
+ ts=[1.0, 3.0, 5.0, 7.0],
+ grain=['1970', '1970', '1970', '1970'],
+ ))
+
+ xf0 = RollingWindow(columns={'ts_r': 'ts'},
+ grain_columns=['grain'],
+ window_calculation='Mean',
+ max_window_size=1,
+ horizon=1)
+
+ xf1 = ForecastingPivot(columns_to_pivot=['ts_r'])
+
+ xf = Pipeline([xf0, xf1])
+
+ xf.fit(df)
+
+ onnx_path = get_tmp_file('.onnx')
+ onnx_json_path = get_tmp_file('.onnx.json')
+ xf.export_to_onnx(onnx_path,
+ 'com.microsoft.ml',
+ dst_json=onnx_json_path,
+ onnx_version='Stable')
+
+ def test_lag(self):
+
+ df = pd.DataFrame(data=dict(
+ ts=[1.0, 3.0, 5.0, 7.0],
+ grain=['1970', '1970', '1970', '1970'],
+ ))
+
+ xf = LagLeadOperator(columns={'ts_r': 'ts'},
+ grain_columns=['grain'],
+ offsets=[-1, 1],
+ horizon=1)
+
+ xf.fit(df)
+ onnx_path = get_tmp_file('.onnx')
+ onnx_json_path = get_tmp_file('.onnx.json')
+ xf.export_to_onnx(onnx_path,
+ 'com.microsoft.ml',
+ dst_json=onnx_json_path,
+ onnx_version='Stable')
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/python/tools/manifest.json b/src/python/tools/manifest.json
index fd7f7950..48947d1a 100644
--- a/src/python/tools/manifest.json
+++ b/src/python/tools/manifest.json
@@ -19237,6 +19237,66 @@
"ITransformOutput"
]
},
+ {
+ "Name": "Transforms.ForecastingPivot",
+    "Desc": "Pivots the input columns and drops any rows with N/A",
+ "FriendlyName": "ForecastingPivot",
+ "ShortName": "fpivot",
+ "Inputs": [
+ {
+ "Name": "ColumnsToPivot",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "List of columns to pivot",
+ "Aliases": [
+ "cols"
+ ],
+ "Required": true,
+ "SortOrder": 0.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "HorizonColumnName",
+ "Type": "String",
+ "Desc": "Name of the horizon column generated.",
+ "Aliases": [
+ "hor"
+ ],
+ "Required": false,
+ "SortOrder": 1.0,
+ "IsNullable": false,
+ "Default": "Horizon"
+ },
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
{
"Name": "Transforms.GlobalContrastNormalizer",
"Desc": "Performs a global contrast normalization on input values: Y = (s * X - M) / D, where s is a scale, M is mean and D is either L2 norm or standard deviation.",
@@ -20466,6 +20526,122 @@
"ITransformOutput"
]
},
+ {
+ "Name": "Transforms.LagLeadOperator",
+    "Desc": "Uses the offset list together with the horizon to create lag and lead features",
+ "FriendlyName": "LagLeadOperator",
+ "ShortName": "LagLead",
+ "Inputs": [
+ {
+ "Name": "GrainColumns",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "List of grain columns",
+ "Aliases": [
+ "grains"
+ ],
+ "Required": true,
+ "SortOrder": 0.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Column",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": {
+ "Kind": "Struct",
+ "Fields": [
+ {
+ "Name": "Name",
+ "Type": "String",
+ "Desc": "Name of the new column",
+ "Aliases": [
+ "name"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Source",
+ "Type": "String",
+ "Desc": "Name of the source column",
+ "Aliases": [
+ "src"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ }
+ ]
+ }
+ },
+ "Desc": "New column definition (optional form: name:src)",
+ "Aliases": [
+ "col"
+ ],
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Horizon",
+ "Type": "UInt",
+ "Desc": "Maximum horizon value",
+ "Aliases": [
+ "hor"
+ ],
+ "Required": true,
+ "SortOrder": 2.0,
+ "IsNullable": false,
+ "Default": 0
+ },
+ {
+ "Name": "Offsets",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "Int"
+ },
+ "Desc": "Lag and Lead offset to use. A negative number is a lag, positive is a lead",
+ "Aliases": [
+ "off"
+ ],
+ "Required": true,
+ "SortOrder": 3.0,
+ "IsNullable": false
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
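The Column struct above is what the Python wrapper's columns dict populates: the dict key becomes Name and the value becomes Source. A short sketch under that assumption, with illustrative column names:

# Sketch: two lags over a two-step horizon. Each offset contributes a row
# to the output vector and 'horizon' sets its width, so 'sales_lags'
# should hold a 2x2 grid (offsets x horizon) of lagged values per row.
from nimbusml.timeseries import LagLeadOperator

xf = LagLeadOperator(columns={'sales_lags': 'sales'},  # Name <- Source
                     grain_columns=['store'],
                     offsets=[-2, -1],  # two lags; positive values would be leads
                     horizon=2)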
{
"Name": "Transforms.LightLda",
"Desc": "The LDA transform implements LightLDA, a state-of-the-art implementation of Latent Dirichlet Allocation.",
@@ -22840,6 +23016,151 @@
"ITransformOutput"
]
},
+ {
+ "Name": "Transforms.RollingWindow",
+ "Desc": "Performs a calculation over a rolling timeseries window",
+ "FriendlyName": "Rolling Window Featurizer",
+ "ShortName": "RollingWindow",
+ "Inputs": [
+ {
+ "Name": "GrainColumns",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "List of grain columns",
+ "Aliases": [
+ "grains"
+ ],
+ "Required": true,
+ "SortOrder": 0.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Column",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": {
+ "Kind": "Struct",
+ "Fields": [
+ {
+ "Name": "Name",
+ "Type": "String",
+ "Desc": "Name of the new column",
+ "Aliases": [
+ "name"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Source",
+ "Type": "String",
+ "Desc": "Name of the source column",
+ "Aliases": [
+ "src"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ }
+ ]
+ }
+ },
+ "Desc": "New column definition (optional form: name:src)",
+ "Aliases": [
+ "col"
+ ],
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Horizon",
+ "Type": "UInt",
+ "Desc": "Maximum horizon value",
+ "Aliases": [
+ "hor"
+ ],
+ "Required": true,
+ "SortOrder": 2.0,
+ "IsNullable": false,
+ "Default": 0
+ },
+ {
+ "Name": "MaxWindowSize",
+ "Type": "UInt",
+ "Desc": "Maximum window size",
+ "Aliases": [
+ "maxsize"
+ ],
+ "Required": true,
+ "SortOrder": 3.0,
+ "IsNullable": false,
+ "Default": 0
+ },
+ {
+ "Name": "MinWindowSize",
+ "Type": "UInt",
+ "Desc": "Minimum window size",
+ "Aliases": [
+ "minsize"
+ ],
+ "Required": true,
+ "SortOrder": 4.0,
+ "IsNullable": false,
+ "Default": 1
+ },
+ {
+ "Name": "WindowCalculation",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "Mean",
+ "Min",
+ "Max"
+ ]
+ },
+ "Desc": "What window calculation to use",
+ "Aliases": [
+ "calc"
+ ],
+ "Required": true,
+ "SortOrder": 5.0,
+ "IsNullable": false,
+ "Default": "0"
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
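RollingWindow mirrors LagLeadOperator's grain/column/horizon inputs and adds the window bounds and the calculation. A brief sketch, with keyword names assumed from the manifest naming convention (MinWindowSize -> min_window_size, and so on):

# Sketch: a rolling maximum computed per grain over windows of
# 2 to 3 preceding values, for a single-step horizon.
from nimbusml.timeseries import RollingWindow

xf = RollingWindow(columns={'temp_max3': 'temp'},
                   grain_columns=['city'],
                   window_calculation='Max',  # one of Mean, Min, Max
                   min_window_size=2,
                   max_window_size=3,
                   horizon=1)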
{
"Name": "Transforms.RowRangeFilter",
"Desc": "Filters a dataview on a column of type Single, Double or Key (contiguous). Keeps the values that are in the specified min/max range. NaNs are always filtered out. If the input is a Key type, the min/max are considered percentages of the number of values.",
@@ -23279,6 +23600,66 @@
"ITransformOutput"
]
},
+ {
+ "Name": "Transforms.ShortDrop",
+ "Desc": "Drops rows if there aren't enough values per grain.",
+ "FriendlyName": "ShortDrop",
+ "ShortName": "sgd",
+ "Inputs": [
+ {
+ "Name": "GrainColumns",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "List of grain columns",
+ "Aliases": [
+ "grains"
+ ],
+ "Required": true,
+ "SortOrder": 0.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "MinRows",
+ "Type": "UInt",
+ "Desc": "Minimum number of values required",
+ "Aliases": [
+ "minr"
+ ],
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false,
+ "Default": 0
+ },
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
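ShortDrop is a row filter rather than a column transform: it takes no Column struct, only the grain columns and a row-count threshold. A runnable sketch, assuming the Python keywords follow the manifest (MinRows -> min_rows):

# Sketch: drop every grain that contributes fewer than 3 rows.
import pandas as pd
from nimbusml.timeseries import ShortDrop

df = pd.DataFrame(data=dict(
    sales=[1.0, 2.0, 3.0, 4.0],
    store=['a', 'a', 'a', 'b'],
))

xf = ShortDrop(grain_columns=['store'], min_rows=3)
print(xf.fit_transform(df))  # store 'b' has only one row, so it is removed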
{
"Name": "Transforms.TensorFlowScorer",
"Desc": "Transforms the data using the TensorFlow model.",
diff --git a/src/python/tools/manifest_diff.json b/src/python/tools/manifest_diff.json
index a70489ee..761ea70c 100644
--- a/src/python/tools/manifest_diff.json
+++ b/src/python/tools/manifest_diff.json
@@ -226,6 +226,30 @@
"XGBoost.TrainRegression"
],
"EntryPoints": [
+ {
+ "Name": "Transforms.RollingWindow",
+ "NewName": "RollingWindow",
+ "Module": "timeseries",
+ "Type": "Transform"
+ },
+ {
+ "Name": "Transforms.LagLeadOperator",
+ "NewName": "LagLeadOperator",
+ "Module": "timeseries",
+ "Type": "Transform"
+ },
+ {
+ "Name": "Transforms.ShortDrop",
+ "NewName": "ShortDrop",
+ "Module": "timeseries",
+ "Type": "Transform"
+ },
+ {
+ "Name": "Transforms.ForecastingPivot",
+ "NewName": "ForecastingPivot",
+ "Module": "timeseries",
+ "Type": "Transform"
+ },
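These four manifest_diff mappings are what place the generated wrappers in the timeseries module, so after a build the new transforms import as:

# Import paths produced by the manifest_diff entries above.
from nimbusml.timeseries import (ForecastingPivot, LagLeadOperator,
                                 RollingWindow, ShortDrop)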
{
"Name": "Trainers.AveragedPerceptronBinaryClassifier",
"NewName": "AveragedPerceptronBinaryClassifier",