diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/LogisticRegression.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/LogisticRegression.cs
index 866348c1b3..8e53da2143 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/LogisticRegression.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/LogisticRegression.cs
@@ -57,7 +57,7 @@ public static void Example()
IDataView data = loader.Load(dataFilePath);
- var split = ml.BinaryClassification.TrainTestSplit(data, testFraction: 0.2);
+ var split = ml.Data.TrainTestSplit(data, testFraction: 0.2);
var pipeline = ml.Transforms.Concatenate("Text", "workclass", "education", "marital-status",
"relationship", "ethnicity", "sex", "native-country")
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptron.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptron.cs
index e9702a8e5a..fd1ff9457a 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptron.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptron.cs
@@ -16,7 +16,7 @@ public static void Example()
var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
// Leave out 10% of data for testing.
- var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+ var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
// Create data training pipeline.
var pipeline = mlContext.BinaryClassification.Trainers.AveragedPerceptron(numIterations: 10);
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptronWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptronWithOptions.cs
index fb1dfacf50..6c2693ef8f 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptronWithOptions.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptronWithOptions.cs
@@ -18,7 +18,7 @@ public static void Example()
var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
// Leave out 10% of data for testing.
- var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+ var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
// Define the trainer options.
var options = new AveragedPerceptronTrainer.Options()
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/FixedPlatt.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/FixedPlatt.cs
index 9164eaaca0..52fe41cc4b 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/FixedPlatt.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/FixedPlatt.cs
@@ -15,7 +15,7 @@ public static void Example()
// Download and featurize the dataset.
var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
// Leave out 10% of data for testing.
- var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.3);
+ var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.3);
// Create data training pipeline for non calibrated trainer and train Naive calibrator on top of it.
var pipeline = mlContext.BinaryClassification.Trainers.AveragedPerceptron();
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Isotonic.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Isotonic.cs
index 91d1586869..9c856d1455 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Isotonic.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Isotonic.cs
@@ -15,7 +15,7 @@ public static void Example()
// Download and featurize the dataset.
var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
// Leave out 10% of data for testing.
- var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.3);
+ var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.3);
// Create data training pipeline for non calibrated trainer and train Naive calibrator on top of it.
var pipeline = mlContext.BinaryClassification.Trainers.AveragedPerceptron();
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Naive.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Naive.cs
index 9cb46e1677..edb38b5cc5 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Naive.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Naive.cs
@@ -15,7 +15,7 @@ public static void Example()
// Download and featurize the dataset.
var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
// Leave out 10% of data for testing.
- var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.3);
+ var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.3);
// Create data training pipeline for non calibrated trainer and train Naive calibrator on top of it.
var pipeline = mlContext.BinaryClassification.Trainers.AveragedPerceptron();
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Platt.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Platt.cs
index 8f15d0c138..12ff762d14 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Platt.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Platt.cs
@@ -15,7 +15,7 @@ public static void Example()
// Download and featurize the dataset.
var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
// Leave out 10% of data for testing.
- var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.3);
+ var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.3);
// Create data training pipeline for non calibrated trainer and train Naive calibrator on top of it.
var pipeline = mlContext.BinaryClassification.Trainers.AveragedPerceptron();
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbm.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbm.cs
index 1d9e485b31..a5785495f3 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbm.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbm.cs
@@ -14,7 +14,7 @@ public static void Example()
var dataview = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
// Leave out 10% of data for testing.
- var split = mlContext.BinaryClassification.TrainTestSplit(dataview, testFraction: 0.1);
+ var split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1);
// Create the Estimator.
var pipeline = mlContext.BinaryClassification.Trainers.LightGbm();
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbmWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbmWithOptions.cs
index 7b0e21fed9..1323d765cf 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbmWithOptions.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbmWithOptions.cs
@@ -15,7 +15,7 @@ public static void Example()
var dataview = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
// Leave out 10% of data for testing.
- var split = mlContext.BinaryClassification.TrainTestSplit(dataview, testFraction: 0.1);
+ var split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1);
// Create the pipeline with LightGbm Estimator using advanced options.
var pipeline = mlContext.BinaryClassification.Trainers.LightGbm(
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticDualCoordinateAscentWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticDualCoordinateAscentWithOptions.cs
index c8cb69676d..4c19c5bd71 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticDualCoordinateAscentWithOptions.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticDualCoordinateAscentWithOptions.cs
@@ -19,7 +19,7 @@ public static void Example()
var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
// Leave out 10% of data for testing.
- var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+ var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
// Define the trainer options.
var options = new SdcaBinaryTrainer.Options()
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescent.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescent.cs
index bbebc47d9a..3175695d5f 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescent.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescent.cs
@@ -18,7 +18,7 @@ public static void Example()
var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
// Leave out 10% of data for testing.
- var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+ var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
// Create data training pipeline.
var pipeline = mlContext.BinaryClassification.Trainers.StochasticGradientDescent();
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentNonCalibrated.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentNonCalibrated.cs
index 4c4097ad38..018fa0524e 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentNonCalibrated.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentNonCalibrated.cs
@@ -20,7 +20,7 @@ public static void Example()
var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
// Leave out 10% of data for testing.
- var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+ var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
// Create data training pipeline.
var pipeline = mlContext.BinaryClassification.Trainers.StochasticGradientDescentNonCalibrated();
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentNonCalibratedWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentNonCalibratedWithOptions.cs
index 4703fa87c7..ff09800688 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentNonCalibratedWithOptions.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentNonCalibratedWithOptions.cs
@@ -21,7 +21,7 @@ public static void Example()
var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
// Leave out 10% of data for testing.
- var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+ var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
// Create data training pipeline.
var pipeline = mlContext.BinaryClassification
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentWithOptions.cs
index 1d74daba1b..1d7d4d15b3 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentWithOptions.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentWithOptions.cs
@@ -19,7 +19,7 @@ public static void Example()
var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
// Leave out 10% of data for testing.
- var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+ var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
// Define the trainer options.
var options = new SgdBinaryTrainer.Options()
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescent.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescent.cs
index c3f1e1508e..10496e4b25 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescent.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescent.cs
@@ -17,7 +17,7 @@ public static void Example()
var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
// Leave out 10% of data for testing.
- var split = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+ var split = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
// Create data training pipeline.
var pipeline = mlContext.BinaryClassification.Trainers.SymbolicStochasticGradientDescent(labelColumnName: "IsOver50K", numberOfIterations: 25);
var model = pipeline.Fit(split.TrainSet);
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescentWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescentWithOptions.cs
index de4f4ff386..99ea7f1460 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescentWithOptions.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescentWithOptions.cs
@@ -17,7 +17,7 @@ public static void Example()
var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
// Leave out 10% of data for testing.
- var split = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+ var split = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
// Create data training pipeline
var pipeline = mlContext.BinaryClassification.Trainers.SymbolicStochasticGradientDescent(
new ML.Trainers.SymbolicStochasticGradientDescentClassificationTrainer.Options()
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbm.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbm.cs
index 399ddda16f..e7604ee5bf 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbm.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbm.cs
@@ -37,7 +37,7 @@ public static void Example()
// Split the static-typed data into training and test sets. Only training set is used in fitting
// the created pipeline. Metrics are computed on the test.
- var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.5);
+ var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.5);
// Train the model.
var model = pipeline.Fit(split.TrainSet);
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbmWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbmWithOptions.cs
index 5af98034bf..cebdf2f4bd 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbmWithOptions.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbmWithOptions.cs
@@ -48,7 +48,7 @@ public static void Example()
// Split the static-typed data into training and test sets. Only training set is used in fitting
// the created pipeline. Metrics are computed on the test.
- var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.5);
+ var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.5);
// Train the model.
var model = pipeline.Fit(split.TrainSet);
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/StochasticDualCoordinateAscent.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/StochasticDualCoordinateAscent.cs
index d99d3368d7..b436fe502e 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/StochasticDualCoordinateAscent.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/StochasticDualCoordinateAscent.cs
@@ -34,7 +34,7 @@ public static void Example()
// Split the data into training and test sets. Only training set is used in fitting
// the created pipeline. Metrics are computed on the test.
- var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.1);
+ var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.1);
// Train the model.
var model = pipeline.Fit(split.TrainSet);
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/StochasticDualCoordinateAscentWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/StochasticDualCoordinateAscentWithOptions.cs
index 600d642365..453617cc51 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/StochasticDualCoordinateAscentWithOptions.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/StochasticDualCoordinateAscentWithOptions.cs
@@ -45,7 +45,7 @@ public static void Example()
// Split the data into training and test sets. Only training set is used in fitting
// the created pipeline. Metrics are computed on the test.
- var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.1);
+ var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.1);
// Train the model.
var model = pipeline.Fit(split.TrainSet);
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbm.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbm.cs
index 2e616dfee1..d4fca60320 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbm.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbm.cs
@@ -15,8 +15,8 @@ public static void Example()
// Leave out 10% of the dataset for testing. Since this is a ranking problem, we must ensure that the split
// respects the GroupId column, i.e. rows with the same GroupId are either all in the train split or all in
- // the test split. The samplingKeyColumn parameter in Ranking.TrainTestSplit is used for this purpose.
- var split = mlContext.Ranking.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumn: "GroupId");
+ // the test split. The samplingKeyColumn parameter in Data.TrainTestSplit is used for this purpose.
+ var split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumn: "GroupId");
// Create the Estimator pipeline. For simplicity, we will train a small tree with 4 leaves and 2 boosting iterations.
var pipeline = mlContext.Ranking.Trainers.LightGbm(
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs
index 101d08ec13..e0bc29a0af 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs
@@ -16,8 +16,8 @@ public static void Example()
// Leave out 10% of the dataset for testing. Since this is a ranking problem, we must ensure that the split
// respects the GroupId column, i.e. rows with the same GroupId are either all in the train split or all in
- // the test split. The samplingKeyColumn parameter in Ranking.TrainTestSplit is used for this purpose.
- var split = mlContext.Ranking.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumn: "GroupId");
+ // the test split. The samplingKeyColumn parameter in Data.TrainTestSplit is used for this purpose.
+ var split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumn: "GroupId");
// Create the Estimator pipeline. For simplicity, we will train a small tree with 4 leaves and 2 boosting iterations.
var pipeline = mlContext.Ranking.Trainers.LightGbm(
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbm.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbm.cs
index d23aebf141..816b980c9f 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbm.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbm.cs
@@ -23,7 +23,7 @@ public static void Example()
// 21.60 0.02731 00.00 7.070 0 0.4690 6.4210 78.90 ...
// 34.70 0.02729 00.00 7.070 0 0.4690 7.1850 61.10 ...
- var split = mlContext.Regression.TrainTestSplit(dataView, testFraction: 0.1);
+ var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.1);
// Create the estimator, here we only need LightGbm trainer
// as data is already processed in a form consumable by the trainer.
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbmWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbmWithOptions.cs
index 260c546e7f..86c37af30b 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbmWithOptions.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbmWithOptions.cs
@@ -25,7 +25,7 @@ public static void Example()
// 21.60 0.02731 00.00 7.070 0 0.4690 6.4210 78.90 ...
// 34.70 0.02729 00.00 7.070 0 0.4690 7.1850 61.10 ...
- var split = mlContext.Regression.TrainTestSplit(dataView, testFraction: 0.1);
+ var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.1);
// Create a pipeline with LightGbm estimator with advanced options.
// Here we only need LightGbm trainer as data is already processed
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquares.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquares.cs
index 7fe79c9a43..204322cec7 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquares.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquares.cs
@@ -39,7 +39,7 @@ public static void Example()
// 21.60 0.02731 00.00 7.070 0 0.4690 6.4210 78.90
// 34.70 0.02729 00.00 7.070 0 0.4690 7.1850 61.10
- var split = mlContext.Regression.TrainTestSplit(dataView, testFraction: 0.2);
+ var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.2);
// Create the estimator, here we only need OrdinaryLeastSquares trainer
// as data is already processed in a form consumable by the trainer
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquaresWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquaresWithOptions.cs
index cbbd09342e..ccfa66aeb7 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquaresWithOptions.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquaresWithOptions.cs
@@ -40,7 +40,7 @@ public static void Example()
// 21.60 0.02731 00.00 7.070 0 0.4690 6.4210 78.90
// 34.70 0.02729 00.00 7.070 0 0.4690 7.1850 61.10
- var split = mlContext.Regression.TrainTestSplit(dataView, testFraction: 0.2);
+ var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.2);
// Create the estimator, here we only need OrdinaryLeastSquares trainer
// as data is already processed in a form consumable by the trainer
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/StochasticDualCoordinateAscent.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/StochasticDualCoordinateAscent.cs
index 7ce8122f78..c97c0e7be1 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/StochasticDualCoordinateAscent.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/StochasticDualCoordinateAscent.cs
@@ -19,7 +19,7 @@ public static void Example()
// Split the data into training and test sets. Only training set is used in fitting
// the created pipeline. Metrics are computed on the test.
- var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.1);
+ var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.1);
// Train the model.
var pipeline = mlContext.Regression.Trainers.StochasticDualCoordinateAscent();
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/StochasticDualCoordinateAscentWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/StochasticDualCoordinateAscentWithOptions.cs
index fc743d8f41..08677d21c6 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/StochasticDualCoordinateAscentWithOptions.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/StochasticDualCoordinateAscentWithOptions.cs
@@ -18,7 +18,7 @@ public static void Example()
// Split the data into training and test sets. Only training set is used in fitting
// the created pipeline. Metrics are computed on the test.
- var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.1);
+ var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.1);
// Create trainer options.
var options = new SdcaRegressionTrainer.Options
diff --git a/docs/samples/Microsoft.ML.Samples/Static/AveragedPerceptronBinaryClassification.cs b/docs/samples/Microsoft.ML.Samples/Static/AveragedPerceptronBinaryClassification.cs
index 1f471917c2..34b380d086 100644
--- a/docs/samples/Microsoft.ML.Samples/Static/AveragedPerceptronBinaryClassification.cs
+++ b/docs/samples/Microsoft.ML.Samples/Static/AveragedPerceptronBinaryClassification.cs
@@ -55,7 +55,7 @@ public static void AveragedPerceptronBinaryClassification()
// Load the data, and leave 10% out, so we can use them for testing
var data = loader.Load(dataFilePath);
- var (trainData, testData) = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+ var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
// Create the Estimator
var learningPipeline = loader.MakeNewEstimator()
diff --git a/docs/samples/Microsoft.ML.Samples/Static/FastTreeBinaryClassification.cs b/docs/samples/Microsoft.ML.Samples/Static/FastTreeBinaryClassification.cs
index 0480ec5015..5caa465b18 100644
--- a/docs/samples/Microsoft.ML.Samples/Static/FastTreeBinaryClassification.cs
+++ b/docs/samples/Microsoft.ML.Samples/Static/FastTreeBinaryClassification.cs
@@ -57,7 +57,7 @@ public static void FastTreeBinaryClassification()
// Loader the data, and leave 10% out, so we can use them for testing
var data = loader.Load(dataFilePath);
- var (trainData, testData) = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+ var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
// Create the Estimator
var learningPipeline = loader.MakeNewEstimator()
diff --git a/docs/samples/Microsoft.ML.Samples/Static/LightGBMBinaryClassification.cs b/docs/samples/Microsoft.ML.Samples/Static/LightGBMBinaryClassification.cs
index 5228c356dc..ea2d238a46 100644
--- a/docs/samples/Microsoft.ML.Samples/Static/LightGBMBinaryClassification.cs
+++ b/docs/samples/Microsoft.ML.Samples/Static/LightGBMBinaryClassification.cs
@@ -57,7 +57,7 @@ public static void LightGbmBinaryClassification()
// Load the data, and leave 10% out, so we can use them for testing
var data = loader.Load(dataFilePath);
- var (trainData, testData) = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+ var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
// Create the Estimator
var learningPipeline = loader.MakeNewEstimator()
diff --git a/docs/samples/Microsoft.ML.Samples/Static/LightGBMMulticlassWithInMemoryData.cs b/docs/samples/Microsoft.ML.Samples/Static/LightGBMMulticlassWithInMemoryData.cs
index e6415b9e19..ef4eea908d 100644
--- a/docs/samples/Microsoft.ML.Samples/Static/LightGBMMulticlassWithInMemoryData.cs
+++ b/docs/samples/Microsoft.ML.Samples/Static/LightGBMMulticlassWithInMemoryData.cs
@@ -53,7 +53,7 @@ public void MultiClassLightGbmStaticPipelineWithInMemoryData()
// Split the static-typed data into training and test sets. Only training set is used in fitting
// the created pipeline. Metrics are computed on the test.
- var (trainingData, testingData) = mlContext.MulticlassClassification.TrainTestSplit(staticDataView, testFraction: 0.5);
+ var (trainingData, testingData) = mlContext.Data.TrainTestSplit(staticDataView, testFraction: 0.5);
// Train the model.
var model = pipe.Fit(trainingData);
diff --git a/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs b/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs
index cab1700636..344bf8bbc3 100644
--- a/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs
+++ b/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs
@@ -28,7 +28,7 @@ public static void LightGbmRegression()
// Load the data, and leave 10% out, so we can use them for testing
var data = loader.Load(new MultiFileSource(dataFile));
- var (trainData, testData) = mlContext.Regression.TrainTestSplit(data, testFraction: 0.1);
+ var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
// The predictor that gets produced out of training
LightGbmRegressionModelParameters pred = null;
diff --git a/docs/samples/Microsoft.ML.Samples/Static/SDCABinaryClassification.cs b/docs/samples/Microsoft.ML.Samples/Static/SDCABinaryClassification.cs
index 8150b6400b..8caf039a3d 100644
--- a/docs/samples/Microsoft.ML.Samples/Static/SDCABinaryClassification.cs
+++ b/docs/samples/Microsoft.ML.Samples/Static/SDCABinaryClassification.cs
@@ -55,7 +55,7 @@ public static void SdcaBinaryClassification()
// Load the data, and leave 10% out, so we can use them for testing
var data = loader.Load(dataFilePath);
- var (trainData, testData) = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+ var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
// Create the Estimator
var learningPipeline = loader.MakeNewEstimator()
diff --git a/docs/samples/Microsoft.ML.Samples/Static/SDCARegression.cs b/docs/samples/Microsoft.ML.Samples/Static/SDCARegression.cs
index 602a3a5473..28f7e7f185 100644
--- a/docs/samples/Microsoft.ML.Samples/Static/SDCARegression.cs
+++ b/docs/samples/Microsoft.ML.Samples/Static/SDCARegression.cs
@@ -26,7 +26,7 @@ public static void SdcaRegression()
// Load the data, and leave 10% out, so we can use them for testing
var data = loader.Load(dataFile);
- var (trainData, testData) = mlContext.Regression.TrainTestSplit(data, testFraction: 0.1);
+ var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
// The predictor that gets produced out of training
LinearRegressionModelParameters pred = null;
diff --git a/src/Microsoft.ML.Data/DataLoadSave/DataOperationsCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/DataOperationsCatalog.cs
index 5692260bab..d9106d6482 100644
--- a/src/Microsoft.ML.Data/DataLoadSave/DataOperationsCatalog.cs
+++ b/src/Microsoft.ML.Data/DataLoadSave/DataOperationsCatalog.cs
@@ -19,6 +19,31 @@ public sealed class DataOperationsCatalog : IInternalCatalog
IHostEnvironment IInternalCatalog.Environment => _env;
private readonly IHostEnvironment _env;
+ /// <summary>
+ /// A pair of datasets, for the train and test set.
+ /// </summary>
+ public struct TrainTestData
+ {
+ /// <summary>
+ /// Training set.
+ /// </summary>
+ public readonly IDataView TrainSet;
+ /// <summary>
+ /// Testing set.
+ /// </summary>
+ public readonly IDataView TestSet;
+ /// <summary>
+ /// Create pair of datasets.
+ /// </summary>
+ /// <param name="trainSet">Training set.</param>
+ /// <param name="testSet">Testing set.</param>
+ internal TrainTestData(IDataView trainSet, IDataView testSet)
+ {
+ TrainSet = trainSet;
+ TestSet = testSet;
+ }
+ }
+
internal DataOperationsCatalog(IHostEnvironment env)
{
Contracts.AssertValue(env);
@@ -337,5 +362,83 @@ public IDataView TakeRows(IDataView input, long count)
return new SkipTakeFilter(_env, options, input);
}
+
+ /// <summary>
+ /// Split the dataset into the train set and test set according to the given fraction.
+ /// Respects the <paramref name="samplingKeyColumn"/> if provided.
+ /// </summary>
+ /// <param name="data">The dataset to split.</param>
+ /// <param name="testFraction">The fraction of data to go into the test set.</param>
+ /// <param name="samplingKeyColumn">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="samplingKeyColumn"/>,
+ /// they are guaranteed to appear in the same subset (train or test). This can be used to ensure no label leakage from the train to the test set.
+ /// If <see langword="null"/> no row grouping will be performed.</param>
+ /// <param name="seed">Seed for the random number generator used to select rows for the train-test split.</param>
+ public TrainTestData TrainTestSplit(IDataView data, double testFraction = 0.1, string samplingKeyColumn = null, uint? seed = null)
+ {
+ _env.CheckValue(data, nameof(data));
+ _env.CheckParam(0 < testFraction && testFraction < 1, nameof(testFraction), "Must be between 0 and 1 exclusive");
+ _env.CheckValueOrNull(samplingKeyColumn);
+
+ EnsureGroupPreservationColumn(_env, ref data, ref samplingKeyColumn, seed);
+
+ var trainFilter = new RangeFilter(_env, new RangeFilter.Options()
+ {
+ Column = samplingKeyColumn,
+ Min = 0,
+ Max = testFraction,
+ Complement = true
+ }, data);
+ var testFilter = new RangeFilter(_env, new RangeFilter.Options()
+ {
+ Column = samplingKeyColumn,
+ Min = 0,
+ Max = testFraction,
+ Complement = false
+ }, data);
+
+ return new TrainTestData(trainFilter, testFilter);
+ }
+
+ /// <summary>
+ /// Ensures the provided <paramref name="samplingKeyColumn"/> is valid for <see cref="RangeFilter"/>, hashing it if necessary, or creates a new column if <paramref name="samplingKeyColumn"/> is null.
+ /// </summary>
+ internal static void EnsureGroupPreservationColumn(IHostEnvironment env, ref IDataView data, ref string samplingKeyColumn, uint? seed = null)
+ {
+ // We need to handle two cases: if samplingKeyColumn is provided, we use hashJoin to
+ // build a single hash of it. If it is not, we generate a random number.
+
+ if (samplingKeyColumn == null)
+ {
+ samplingKeyColumn = data.Schema.GetTempColumnName("SamplingKeyColumn");
+ data = new GenerateNumberTransform(env, data, samplingKeyColumn, seed);
+ }
+ else
+ {
+ if (!data.Schema.TryGetColumnIndex(samplingKeyColumn, out int stratCol))
+ throw env.ExceptSchemaMismatch(nameof(samplingKeyColumn), "SamplingKeyColumn", samplingKeyColumn);
+
+ var type = data.Schema[stratCol].Type;
+ if (!RangeFilter.IsValidRangeFilterColumnType(env, type))
+ {
+ // Hash the samplingKeyColumn.
+ // REVIEW: this could currently crash, since Hash only accepts a limited set
+ // of column types. It used to be HashJoin, but we should probably extend Hash
+ // instead of having two hash transformations.
+ var origStratCol = samplingKeyColumn;
+ int tmp;
+ int inc = 0;
+
+ // Generate a new column with the hashed samplingKeyColumn.
+ while (data.Schema.TryGetColumnIndex(samplingKeyColumn, out tmp))
+ samplingKeyColumn = string.Format("{0}_{1:000}", origStratCol, ++inc);
+ HashingEstimator.ColumnOptions columnOptions;
+ if (seed.HasValue)
+ columnOptions = new HashingEstimator.ColumnOptions(samplingKeyColumn, origStratCol, 30, seed.Value);
+ else
+ columnOptions = new HashingEstimator.ColumnOptions(samplingKeyColumn, origStratCol, 30);
+ data = new HashingEstimator(env, columnOptions).Fit(data).Transform(data);
+ }
+ }
+ }
}
}
diff --git a/src/Microsoft.ML.Data/TrainCatalog.cs b/src/Microsoft.ML.Data/TrainCatalog.cs
index 4f3effa036..79b53bcfbc 100644
--- a/src/Microsoft.ML.Data/TrainCatalog.cs
+++ b/src/Microsoft.ML.Data/TrainCatalog.cs
@@ -23,67 +23,6 @@ public abstract class TrainCatalogBase : IInternalCatalog
[BestFriend]
private protected IHostEnvironment Environment { get; }
- /// <summary>
- /// A pair of datasets, for the train and test set.
- /// </summary>
- public struct TrainTestData
- {
- /// <summary>
- /// Training set.
- /// </summary>
- public readonly IDataView TrainSet;
- /// <summary>
- /// Testing set.
- /// </summary>
- public readonly IDataView TestSet;
- /// <summary>
- /// Create pair of datasets.
- /// </summary>
- /// <param name="trainSet">Training set.</param>
- /// <param name="testSet">Testing set.</param>
- internal TrainTestData(IDataView trainSet, IDataView testSet)
- {
- TrainSet = trainSet;
- TestSet = testSet;
- }
- }
-
- /// <summary>
- /// Split the dataset into the train set and test set according to the given fraction.
- /// Respects the <paramref name="samplingKeyColumn"/> if provided.
- /// </summary>
- /// <param name="data">The dataset to split.</param>
- /// <param name="testFraction">The fraction of data to go into the test set.</param>
- /// <param name="samplingKeyColumn">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="samplingKeyColumn"/>,
- /// they are guaranteed to appear in the same subset (train or test). This can be used to ensure no label leakage from the train to the test set.
- /// If <see langword="null"/> no row grouping will be performed.</param>
- /// <param name="seed">Seed for the random number generator used to select rows for the train-test split.</param>
- public TrainTestData TrainTestSplit(IDataView data, double testFraction = 0.1, string samplingKeyColumn = null, uint? seed = null)
- {
- Environment.CheckValue(data, nameof(data));
- Environment.CheckParam(0 < testFraction && testFraction < 1, nameof(testFraction), "Must be between 0 and 1 exclusive");
- Environment.CheckValueOrNull(samplingKeyColumn);
-
- EnsureGroupPreservationColumn(ref data, ref samplingKeyColumn, seed);
-
- var trainFilter = new RangeFilter(Environment, new RangeFilter.Options()
- {
- Column = samplingKeyColumn,
- Min = 0,
- Max = testFraction,
- Complement = true
- }, data);
- var testFilter = new RangeFilter(Environment, new RangeFilter.Options()
- {
- Column = samplingKeyColumn,
- Min = 0,
- Max = testFraction,
- Complement = false
- }, data);
-
- return new TrainTestData(trainFilter, testFilter);
- }
-
/// <summary>
/// Results for specific cross-validation fold.
/// </summary>
@@ -155,7 +94,7 @@ private protected CrossValidationResult[] CrossValidateTrain(IDataView data, IEs
Environment.CheckParam(numFolds > 1, nameof(numFolds), "Must be more than 1");
Environment.CheckValueOrNull(samplingKeyColumn);
- EnsureGroupPreservationColumn(ref data, ref samplingKeyColumn, seed);
+ DataOperationsCatalog.EnsureGroupPreservationColumn(Environment, ref data, ref samplingKeyColumn, seed);
Func foldFunction =
fold =>
@@ -198,48 +137,6 @@ private protected TrainCatalogBase(IHostEnvironment env, string registrationName
Environment = env;
}
- /// <summary>
- /// Ensures the provided <paramref name="samplingKeyColumn"/> is valid for <see cref="RangeFilter"/>, hashing it if necessary, or creates a new column if <paramref name="samplingKeyColumn"/> is null.
- /// </summary>
- private void EnsureGroupPreservationColumn(ref IDataView data, ref string samplingKeyColumn, uint? seed = null)
- {
- // We need to handle two cases: if samplingKeyColumn is provided, we use hashJoin to
- // build a single hash of it. If it is not, we generate a random number.
-
- if (samplingKeyColumn == null)
- {
- samplingKeyColumn = data.Schema.GetTempColumnName("SamplingKeyColumn");
- data = new GenerateNumberTransform(Environment, data, samplingKeyColumn, seed);
- }
- else
- {
- if (!data.Schema.TryGetColumnIndex(samplingKeyColumn, out int stratCol))
- throw Environment.ExceptSchemaMismatch(nameof(samplingKeyColumn), "SamplingKeyColumn", samplingKeyColumn);
-
- var type = data.Schema[stratCol].Type;
- if (!RangeFilter.IsValidRangeFilterColumnType(Environment, type))
- {
- // Hash the samplingKeyColumn.
- // REVIEW: this could currently crash, since Hash only accepts a limited set
- // of column types. It used to be HashJoin, but we should probably extend Hash
- // instead of having two hash transformations.
- var origStratCol = samplingKeyColumn;
- int tmp;
- int inc = 0;
-
- // Generate a new column with the hashed samplingKeyColumn.
- while (data.Schema.TryGetColumnIndex(samplingKeyColumn, out tmp))
- samplingKeyColumn = string.Format("{0}_{1:000}", origStratCol, ++inc);
- HashingEstimator.ColumnOptions columnOptions;
- if (seed.HasValue)
- columnOptions = new HashingEstimator.ColumnOptions(samplingKeyColumn, origStratCol, 30, seed.Value);
- else
- columnOptions = new HashingEstimator.ColumnOptions(samplingKeyColumn, origStratCol, 30);
- data = new HashingEstimator(Environment, columnOptions).Fit(data).Transform(data);
- }
- }
- }
-
/// <summary>
/// Subclasses of <see cref="TrainCatalogBase"/> will provide little "extension method" hookable objects
/// (for example, something like ). User code will only
diff --git a/src/Microsoft.ML.StaticPipe/TrainingStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/TrainingStaticExtensions.cs
index 577a4f22e4..4b09545d78 100644
--- a/src/Microsoft.ML.StaticPipe/TrainingStaticExtensions.cs
+++ b/src/Microsoft.ML.StaticPipe/TrainingStaticExtensions.cs
@@ -29,7 +29,7 @@ public static class TrainingStaticExtensions
/// If the <paramref name="stratificationColumn"/> is not provided, the random numbers generated to create it, will use this seed as value.
/// And if it is not provided, the default value will be used.
/// A pair of datasets, for the train and test set.
- public static (DataView<T> trainSet, DataView<T> testSet) TrainTestSplit<T>(this TrainCatalogBase catalog,
+ public static (DataView<T> trainSet, DataView<T> testSet) TrainTestSplit<T>(this DataOperationsCatalog catalog,
DataView<T> data, double testFraction = 0.1, Func<T, PipelineColumn> stratificationColumn = null, uint? seed = null)
{
var env = StaticPipeUtils.GetEnvironment(data);
diff --git a/test/Microsoft.ML.Functional.Tests/Prediction.cs b/test/Microsoft.ML.Functional.Tests/Prediction.cs
index 9bad405571..b0d1fee62a 100644
--- a/test/Microsoft.ML.Functional.Tests/Prediction.cs
+++ b/test/Microsoft.ML.Functional.Tests/Prediction.cs
@@ -25,7 +25,7 @@ public void ReconfigurablePrediction()
var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(),
hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator)
.Load(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename));
- var split = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.2);
+ var split = mlContext.Data.TrainTestSplit(data, testFraction: 0.2);
// Create a pipeline to train on the housing data
var pipeline = mlContext.Transforms.Concatenate("Features", new string[] {
diff --git a/test/Microsoft.ML.Functional.Tests/Validation.cs b/test/Microsoft.ML.Functional.Tests/Validation.cs
index 49a5db6693..86157e2626 100644
--- a/test/Microsoft.ML.Functional.Tests/Validation.cs
+++ b/test/Microsoft.ML.Functional.Tests/Validation.cs
@@ -65,7 +65,7 @@ public void TrainWithValidationSet()
var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true);
// Create the train and validation set.
- var dataSplit = mlContext.Regression.TrainTestSplit(data, testFraction: 0.2);
+ var dataSplit = mlContext.Data.TrainTestSplit(data, testFraction: 0.2);
var trainData = dataSplit.TrainSet;
var validData = dataSplit.TestSet;
diff --git a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs
index 0c6fb3c510..383be10382 100644
--- a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs
+++ b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs
@@ -721,13 +721,11 @@ public void TrainTestSplit()
var dataPath = GetDataPath(TestDatasets.iris.trainFilename);
var dataSource = new MultiFileSource(dataPath);
- var ctx = new BinaryClassificationCatalog(env);
-
var reader = TextLoaderStatic.CreateLoader(env,
c => (label: c.LoadFloat(0), features: c.LoadFloat(1, 4)));
var data = reader.Load(dataSource);
- var (train, test) = ctx.TrainTestSplit(data, 0.5);
+ var (train, test) = env.Data.TrainTestSplit(data, 0.5);
// Just make sure that the train is about the same size as the test set.
var trainCount = train.GetColumn(r => r.label).Count();
@@ -736,7 +734,7 @@ public void TrainTestSplit()
Assert.InRange(trainCount * 1.0 / testCount, 0.8, 1.2);
// Now stratify by label. Silly thing to do.
- (train, test) = ctx.TrainTestSplit(data, 0.5, stratificationColumn: r => r.label);
+ (train, test) = env.Data.TrainTestSplit(data, 0.5, stratificationColumn: r => r.label);
var trainLabels = train.GetColumn(r => r.label).Distinct();
var testLabels = test.GetColumn(r => r.label).Distinct();
Assert.True(trainLabels.Count() > 0);
diff --git a/test/Microsoft.ML.StaticPipelineTesting/Training.cs b/test/Microsoft.ML.StaticPipelineTesting/Training.cs
index 26d2741040..d1b48877b4 100644
--- a/test/Microsoft.ML.StaticPipelineTesting/Training.cs
+++ b/test/Microsoft.ML.StaticPipelineTesting/Training.cs
@@ -1232,7 +1232,7 @@ public void MultiClassLightGbmStaticPipelineWithInMemoryData()
// Split the static-typed data into training and test sets. Only training set is used in fitting
// the created pipeline. Metrics are computed on the test.
- var (trainingData, testingData) = mlContext.MulticlassClassification.TrainTestSplit(staticDataView, testFraction: 0.5);
+ var (trainingData, testingData) = mlContext.Data.TrainTestSplit(staticDataView, testFraction: 0.5);
// Train the model.
var model = pipe.Fit(trainingData);
diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs
index 8e7e97145c..03a36249cb 100644
--- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs
+++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs
@@ -601,7 +601,7 @@ private void CrossValidationOn(string dataPath)
Predictions: mlContext.MulticlassClassification.Trainers.Sdca(r.Label, r.Features)));
// Split the data 90:10 into train and test sets, train and evaluate.
- var (trainData, testData) = mlContext.MulticlassClassification.TrainTestSplit(data, testFraction: 0.1);
+ var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
// Train the model.
var model = pipeline.Fit(trainData);
diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs
index 77c6145849..bf7ab04d51 100644
--- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs
+++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs
@@ -426,7 +426,7 @@ private void CrossValidationOn(string dataPath)
.Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent());
// Split the data 90:10 into train and test sets, train and evaluate.
- var split = mlContext.MulticlassClassification.TrainTestSplit(data, testFraction: 0.1);
+ var split = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
// Train the model.
var model = pipeline.Fit(split.TrainSet);
diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/TestApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/TestApi.cs
index 5428dd8dba..c316aca93e 100644
--- a/test/Microsoft.ML.Tests/Scenarios/Api/TestApi.cs
+++ b/test/Microsoft.ML.Tests/Scenarios/Api/TestApi.cs
@@ -312,8 +312,8 @@ public void TestTrainTestSplit()
// Let's test what train test properly works with seed.
// In order to do that, let's split same dataset, but in one case we will use default seed value,
// and in other case we set seed to be specific value.
- var simpleSplit = mlContext.BinaryClassification.TrainTestSplit(input);
- var splitWithSeed = mlContext.BinaryClassification.TrainTestSplit(input, seed: 10);
+ var simpleSplit = mlContext.Data.TrainTestSplit(input);
+ var splitWithSeed = mlContext.Data.TrainTestSplit(input, seed: 10);
// Since test fraction is 0.1, it's much faster to compare test subsets of split.
var simpleTestWorkClass = getWorkclass(simpleSplit.TestSet);
@@ -325,7 +325,7 @@ public void TestTrainTestSplit()
// Now let's do same thing but with presence of stratificationColumn.
// Rows with same values in this stratificationColumn should end up in same subset (train or test).
// So let's break dataset by "Workclass" column.
- var stratSplit = mlContext.BinaryClassification.TrainTestSplit(input, samplingKeyColumn: "Workclass");
+ var stratSplit = mlContext.Data.TrainTestSplit(input, samplingKeyColumn: "Workclass");
var stratTrainWorkclass = getWorkclass(stratSplit.TrainSet);
var stratTestWorkClass = getWorkclass(stratSplit.TestSet);
// Let's get unique values for "Workclass" column from train subset.
@@ -337,7 +337,7 @@ public void TestTrainTestSplit()
// Let's do same thing, but this time we will choose different seed.
// Stratification column should still break dataset properly without same values in both subsets.
- var stratSeed = mlContext.BinaryClassification.TrainTestSplit(input, samplingKeyColumn:"Workclass", seed: 1000000);
+ var stratSeed = mlContext.Data.TrainTestSplit(input, samplingKeyColumn:"Workclass", seed: 1000000);
var stratTrainWithSeedWorkclass = getWorkclass(stratSeed.TrainSet);
var stratTestWithSeedWorkClass = getWorkclass(stratSeed.TestSet);
// Let's get unique values for "Workclass" column from train subset.