@@ -443,10 +443,24 @@ var reader = mlContext.Data.TextReader(ctx => (
443
443
// Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed).
444
444
var trainData = reader .Read (trainDataPath );
445
445
446
+ // Sometimes, caching data in-memory after its first access can save some loading time when the data is going to be used
447
+ // several times somewhere. The caching mechanism is also lazy; it only caches things after they are used.
448
+ // Users can replace all the subsequent uses of "trainData" with "cachedTrainData". We still use "trainData" because
449
+ // a caching step, which provides the same caching function, will be inserted in the considered "learningPipeline."
450
+ var cachedTrainData = trainData .Cache ();
451
+
446
452
// Step two: define the learning pipeline.
447
453
448
454
// We 'start' the pipeline with the output of the reader.
449
455
var learningPipeline = reader .MakeNewEstimator ()
456
+ // We add a step for caching data in memory so that the downstream iterative training
457
+ // algorithm can efficiently scan through the data multiple times. Otherwise, the following
458
+ // trainer will read data from disk multiple times. The caching mechanism uses an on-demand strategy.
459
+ // The data accessed in any downstream step will be cached from its first use. In general, you only
460
+ // need to add a caching step before a trainable step, because caching is not helpful if the data is
461
+ // only scanned once. This step can be removed if the user doesn't have enough memory to store the whole
462
+ // data set.
463
+ .AppendCacheCheckpoint ()
450
464
// Now we can add any 'training steps' to it. In our case we want to 'normalize' the data (rescale to be
451
465
// between -1 and 1 for all examples)
452
466
.Append (r => (
@@ -486,13 +500,28 @@ var reader = mlContext.Data.TextReader(new TextLoader.Arguments
486
500
// Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed).
487
501
var trainData = reader .Read (trainDataPath );
488
502
503
+ // Sometimes, caching data in-memory after its first access can save some loading time when the data is going to be used
504
+ // several times somewhere. The caching mechanism is also lazy; it only caches things after they are used.
505
+ // Users can replace all the subsequent uses of "trainData" with "cachedTrainData". We still use "trainData" because
506
+ // a caching step, which provides the same caching function, will be inserted in the considered "dynamicPipeline."
507
+ var cachedTrainData = mlContext .Data .Cache (trainData );
508
+
489
509
// Step two: define the learning pipeline.
490
510
491
511
// We 'start' the pipeline with the output of the reader.
492
512
var dynamicPipeline =
493
513
// First 'normalize' the data (rescale to be
494
514
// between -1 and 1 for all examples)
495
515
mlContext .Transforms .Normalize (" FeatureVector" )
516
+ // We add a step for caching data in memory so that the downstream iterative training
517
+ // algorithm can efficiently scan through the data multiple times. Otherwise, the following
518
+ // trainer will read data from disk multiple times. The caching mechanism uses an on-demand strategy.
519
+ // The data accessed in any downstream step will be cached from its first use. In general, you only
520
+ // need to add a caching step before a trainable step, because caching is not helpful if the data is
521
+ // only scanned once. This step can be removed if the user doesn't have enough memory to store the whole
522
+ // data set. Notice that in the upstream Transforms.Normalize step, we only scan through the data
523
+ // once so adding a caching step before it is not helpful.
524
+ .AppendCacheCheckpoint (mlContext )
496
525
// Add the SDCA regression trainer.
497
526
.Append (mlContext .Regression .Trainers .StochasticDualCoordinateAscent (label : " Target" , features : " FeatureVector" ));
498
527
@@ -595,6 +624,13 @@ var learningPipeline = reader.MakeNewEstimator()
595
624
r .Label ,
596
625
// Concatenate all the features together into one column 'Features'.
597
626
Features : r .SepalLength .ConcatWith (r .SepalWidth , r .PetalLength , r .PetalWidth )))
627
+ // We add a step for caching data in memory so that the downstream iterative training
628
+ // algorithm can efficiently scan through the data multiple times. Otherwise, the following
629
+ // trainer will read data from disk multiple times. The caching mechanism uses an on-demand strategy.
630
+ // The data accessed in any downstream step will be cached from its first use. In general, you only
631
+ // need to add a caching step before a trainable step, because caching is not helpful if the data is
632
+ // only scanned once.
633
+ .AppendCacheCheckpoint ()
598
634
.Append (r => (
599
635
r .Label ,
600
636
// Train the multi-class SDCA model to predict the label using features.
@@ -640,6 +676,8 @@ var dynamicPipeline =
640
676
mlContext .Transforms .Concatenate (" Features" , " SepalLength" , " SepalWidth" , " PetalLength" , " PetalWidth" )
641
677
// Note that the label is text, so it needs to be converted to key.
642
678
.Append (mlContext .Transforms .Categorical .MapValueToKey (" Label" ), TransformerScope .TrainTest )
679
+ // Cache data in memory for steps after the cache checkpoint stage.
680
+ .AppendCacheCheckpoint (mlContext )
643
681
// Use the multi-class SDCA model to predict the label using features.
644
682
.Append (mlContext .MulticlassClassification .Trainers .StochasticDualCoordinateAscent ())
645
683
// Apply the inverse conversion from 'PredictedLabel' column back to string value.
@@ -741,6 +779,7 @@ var trainData = mlContext.CreateStreamingDataView(churnData);
741
779
742
780
var dynamicLearningPipeline = mlContext .Transforms .Categorical .OneHotEncoding (" DemographicCategory" )
743
781
.Append (mlContext .Transforms .Concatenate (" Features" , " DemographicCategory" , " LastVisits" ))
782
+ .AppendCacheCheckpoint (mlContext ) // FastTree will benefit from caching data in memory.
744
783
.Append (mlContext .BinaryClassification .Trainers .FastTree (" HasChurned" , " Features" , numTrees : 20 ));
745
784
746
785
var dynamicModel = dynamicLearningPipeline .Fit (trainData );
@@ -757,6 +796,7 @@ var staticLearningPipeline = staticData.MakeNewEstimator()
757
796
.Append (r => (
758
797
r .HasChurned ,
759
798
Features : r .DemographicCategory .OneHotEncoding ().ConcatWith (r .LastVisits )))
799
+ .AppendCacheCheckpoint () // FastTree will benefit from caching data in memory.
760
800
.Append (r => mlContext .BinaryClassification .Trainers .FastTree (r .HasChurned , r .Features , numTrees : 20 ));
761
801
762
802
var staticModel = staticLearningPipeline .Fit (staticData );
@@ -813,6 +853,8 @@ var learningPipeline = reader.MakeNewEstimator()
813
853
// When the normalizer is trained, the below delegate is going to be called.
814
854
// We use it to memorize the scales.
815
855
onFit : (scales , offsets ) => normScales = scales )))
856
+ // Cache the data in memory because the subsequent trainer needs to access the data multiple times.
857
+ .AppendCacheCheckpoint ()
816
858
.Append (r => (
817
859
r .Label ,
818
860
// Train the multi-class SDCA model to predict the label using features.
@@ -987,6 +1029,10 @@ var catColumns = data.GetColumn(r => r.CategoricalFeatures).Take(10).ToArray();
987
1029
988
1030
// Build several alternative featurization pipelines.
989
1031
var learningPipeline = reader .MakeNewEstimator ()
1032
+ // Cache data in memory in an on-demand manner. Columns used in any downstream step will be
1033
+ // cached in memory on their first use. This step can be removed if the user's machine doesn't
1034
+ // have enough memory.
1035
+ .AppendCacheCheckpoint ()
990
1036
.Append (r => (
991
1037
r .Label ,
992
1038
r .NumericalFeatures ,
@@ -1070,6 +1116,9 @@ var workclasses = transformedData.GetColumn<float[]>(mlContext, "WorkclassOneHot
1070
1116
var fullLearningPipeline = dynamicPipeline
1071
1117
// Concatenate two of the 3 categorical pipelines, and the numeric features.
1072
1118
.Append (mlContext .Transforms .Concatenate (" Features" , " NumericalFeatures" , " CategoricalBag" , " WorkclassOneHotTrimmed" ))
1119
+ // Cache data in memory so that the following trainer will be able to access training examples without
1120
+ // reading them from disk multiple times.
1121
+ .AppendCacheCheckpoint (mlContext )
1073
1122
// Now we're ready to train. We chose our FastTree trainer for this classification task.
1074
1123
.Append (mlContext .BinaryClassification .Trainers .FastTree (numTrees : 50 ));
1075
1124
@@ -1121,6 +1170,10 @@ var messageTexts = data.GetColumn(x => x.Message).Take(20).ToArray();
1121
1170
1122
1171
// Apply various kinds of text operations supported by ML.NET.
1123
1172
var learningPipeline = reader .MakeNewEstimator ()
1173
+ // Cache data in memory in an on-demand manner. Columns used in any downstream step will be
1174
+ // cached in memory on their first use. This step can be removed if the user's machine doesn't
1175
+ // have enough memory.
1176
+ .AppendCacheCheckpoint ()
1124
1177
.Append (r => (
1125
1178
// One-stop shop to run the full text featurization.
1126
1179
TextFeatures : r .Message .FeaturizeText (),
@@ -1243,6 +1296,9 @@ var learningPipeline = reader.MakeNewEstimator()
1243
1296
Label : r .Label .ToKey (),
1244
1297
// Concatenate all the features together into one column 'Features'.
1245
1298
Features : r .SepalLength .ConcatWith (r .SepalWidth , r .PetalLength , r .PetalWidth )))
1299
+ // Add a step for caching data in memory so that the downstream iterative training
1300
+ // algorithm can efficiently scan through the data multiple times.
1301
+ .AppendCacheCheckpoint ()
1246
1302
.Append (r => (
1247
1303
r .Label ,
1248
1304
// Train the multi-class SDCA model to predict the label using features.
@@ -1298,6 +1354,10 @@ var dynamicPipeline =
1298
1354
mlContext .Transforms .Concatenate (" Features" , " SepalLength" , " SepalWidth" , " PetalLength" , " PetalWidth" )
1299
1355
// Note that the label is text, so it needs to be converted to key.
1300
1356
.Append (mlContext .Transforms .Conversions .MapValueToKey (" Label" ), TransformerScope .TrainTest )
1357
+ // Cache data in memory so that SDCA trainer will be able to randomly access training examples without
1358
+ // reading data from disk multiple times. Data will be cached at its first use in any downstream step.
1359
+ // Notice that unused parts of the data may not be cached.
1360
+ .AppendCacheCheckpoint (mlContext )
1301
1361
// Use the multi-class SDCA model to predict the label using features.
1302
1362
.Append (mlContext .MulticlassClassification .Trainers .StochasticDualCoordinateAscent ());
1303
1363
@@ -1439,6 +1499,7 @@ public static ITransformer TrainModel(MLContext mlContext, IDataView trainData)
1439
1499
Action < InputRow , OutputRow > mapping = (input , output ) => output .Label = input .Income > 50000 ;
1440
1500
// Construct the learning pipeline.
1441
1501
var estimator = mlContext .Transforms .CustomMapping (mapping , null )
1502
+ .AppendCacheCheckpoint (mlContext )
1442
1503
.Append (mlContext .BinaryClassification .Trainers .FastTree (label : " Label" ));
1443
1504
1444
1505
return estimator .Fit (trainData );
@@ -1480,8 +1541,12 @@ public class CustomMappings
1480
1541
var estimator = mlContext .Transforms .CustomMapping <InputRow , OutputRow >(CustomMappings .IncomeMapping , nameof (CustomMappings .IncomeMapping ))
1481
1542
.Append (mlContext .BinaryClassification .Trainers .FastTree (label : " Label" ));
1482
1543
1544
+ // If there is enough memory, we can cache the data in memory to avoid reading it from the file
1545
+ // when it is accessed multiple times.
1546
+ var cachedTrainData = mlContext .Data .Cache (trainData );
1547
+
1483
1548
// Train the model.
1484
- var model = estimator .Fit (trainData );
1549
+ var model = estimator .Fit (cachedTrainData );
1485
1550
1486
1551
// Save the model.
1487
1552
using (var fs = File .Create (modelPath ))
0 commit comments