Skip to content

Commit 3053f3d

Browse files
authored
Fix column purpose for PipelineSweeperMacro (#461)
* Adding arguments to PipelineSweeperMacro
* Updating the unit tests
* Taking care of review comments; adding validations for Label, Weight, GroupId and Name columns
* Taking care of some review comments
* Some code cleanup
* Addressing PR comments
* API changes to use RoleMappedData
* Taking care of review comments
* Using pipeline.UniqueId
* Taking care of review comments; updating ColumnPurpose 'Group' so it is consistent with Role 'Group'
1 parent 669f4fa commit 3053f3d

File tree

13 files changed

+401
-37
lines changed

13 files changed

+401
-37
lines changed

src/Microsoft.ML.PipelineInference/AutoInference.cs

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,7 @@ public sealed class AutoMlMlState : IMlState
208208
private TransformInference.SuggestedTransform[] _availableTransforms;
209209
private RecipeInference.SuggestedRecipe.SuggestedLearner[] _availableLearners;
210210
private DependencyMap _dependencyMapping;
211+
private RoleMappedData _dataRoles;
211212
public IPipelineOptimizer AutoMlEngine { get; set; }
212213
public PipelinePattern[] BatchCandidates { get; set; }
213214
public SupportedMetric Metric { get; }
@@ -313,7 +314,7 @@ private void MainLearningLoop(int batchSize, int numOfTrainingRows)
313314
var currentBatchSize = batchSize;
314315
if (_terminator is IterationTerminator itr)
315316
currentBatchSize = Math.Min(itr.RemainingIterations(_history), batchSize);
316-
var candidates = AutoMlEngine.GetNextCandidates(_sortedSampledElements.Values, currentBatchSize);
317+
var candidates = AutoMlEngine.GetNextCandidates(_sortedSampledElements.Values, currentBatchSize, _dataRoles);
317318

318319
// Break if no candidates returned, means no valid pipeline available.
319320
if (candidates.Length == 0)
@@ -370,19 +371,21 @@ private TransformInference.SuggestedTransform[] InferAndFilter(IDataView data, T
370371
TransformInference.SuggestedTransform[] existingTransforms = null)
371372
{
372373
// Infer transforms using experts
373-
var levelTransforms = TransformInference.InferTransforms(_env, data, args);
374+
var levelTransforms = TransformInference.InferTransforms(_env, data, args, _dataRoles);
374375

375376
// Retain only those transforms inferred which were also passed in.
376377
if (existingTransforms != null)
377378
return levelTransforms.Where(t => existingTransforms.Any(t2 => t2.Equals(t))).ToArray();
378379
return levelTransforms;
379380
}
380381

381-
public void InferSearchSpace(int numTransformLevels)
382+
public void InferSearchSpace(int numTransformLevels, RoleMappedData dataRoles = null)
382383
{
383384
var learners = RecipeInference.AllowedLearners(_env, TrainerKind).ToArray();
384385
if (_requestedLearners != null && _requestedLearners.Length > 0)
385386
learners = learners.Where(l => _requestedLearners.Contains(l.LearnerName)).ToArray();
387+
388+
_dataRoles = dataRoles;
386389
ComputeSearchSpace(numTransformLevels, learners, (b, c) => InferAndFilter(b, c));
387390
}
388391

@@ -536,7 +539,21 @@ public PipelinePattern[] GetNextCandidates(int numberOfCandidates)
536539
var currentBatchSize = numberOfCandidates;
537540
if (_terminator is IterationTerminator itr)
538541
currentBatchSize = Math.Min(itr.RemainingIterations(_history), numberOfCandidates);
539-
BatchCandidates = AutoMlEngine.GetNextCandidates(_sortedSampledElements.Select(kvp => kvp.Value), currentBatchSize);
542+
BatchCandidates = AutoMlEngine.GetNextCandidates(_sortedSampledElements.Select(kvp => kvp.Value), currentBatchSize, _dataRoles);
543+
544+
using (var ch = _host.Start("Suggested Pipeline"))
545+
{
546+
foreach (var pipeline in BatchCandidates)
547+
{
548+
ch.Info($"AutoInference Pipeline Id : {pipeline.UniqueId}");
549+
foreach (var transform in pipeline.Transforms)
550+
{
551+
ch.Info($"AutoInference Transform : {transform.Transform}");
552+
}
553+
ch.Info($"AutoInference Learner : {pipeline.Learner}");
554+
}
555+
}
556+
540557
return BatchCandidates;
541558
}
542559

src/Microsoft.ML.PipelineInference/AutoMlEngines/DefaultsEngine.cs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using System;
66
using System.Collections.Generic;
77
using System.Linq;
8+
using Microsoft.ML.Runtime.Data;
89
using Microsoft.ML.Runtime.EntryPoints;
910
using Microsoft.ML.Runtime.PipelineInference;
1011

@@ -33,9 +34,10 @@ public DefaultsEngine(IHostEnvironment env, Arguments args)
3334
_currentLearnerIndex = 0;
3435
}
3536

36-
public override PipelinePattern[] GetNextCandidates(IEnumerable<PipelinePattern> history, int numCandidates)
37+
public override PipelinePattern[] GetNextCandidates(IEnumerable<PipelinePattern> history, int numCandidates, RoleMappedData dataRoles)
3738
{
3839
var candidates = new List<PipelinePattern>();
40+
DataRoles = dataRoles;
3941

4042
while (candidates.Count < numCandidates)
4143
{
@@ -53,7 +55,8 @@ public override PipelinePattern[] GetNextCandidates(IEnumerable<PipelinePattern>
5355

5456
do
5557
{ // Make sure transforms set is valid. Repeat until passes verifier.
56-
pipeline = new PipelinePattern(SampleTransforms(out var transformsBitMask), learner, "", Env);
58+
pipeline = new PipelinePattern(SampleTransforms(out var transformsBitMask),
59+
learner, "", Env);
5760
valid = PipelineVerifier(pipeline, transformsBitMask);
5861
count++;
5962
} while (!valid && count <= 1000);
@@ -77,7 +80,7 @@ private TransformInference.SuggestedTransform[] SampleTransforms(out long transf
7780

7881
// Add final features concat transform.
7982
sampledTransforms.AddRange(AutoMlUtils.GetFinalFeatureConcat(Env, FullyTransformedData,
80-
DependencyMapping, sampledTransforms.ToArray(), AvailableTransforms));
83+
DependencyMapping, sampledTransforms.ToArray(), AvailableTransforms, DataRoles));
8184

8285
return sampledTransforms.ToArray();
8386
}

src/Microsoft.ML.PipelineInference/AutoMlEngines/RocketEngine.cs

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ private TransformInference.SuggestedTransform[] SampleTransforms(RecipeInference
187187
// cause an error in verification, since it isn't included in the original
188188
// dependency mapping (i.e., its level isn't in the dictionary).
189189
sampledTransforms.AddRange(AutoMlUtils.GetFinalFeatureConcat(Env, FullyTransformedData,
190-
DependencyMapping, sampledTransforms.ToArray(), AvailableTransforms));
190+
DependencyMapping, sampledTransforms.ToArray(), AvailableTransforms, DataRoles));
191191
transformsBitMask = mask;
192192

193193
return sampledTransforms.ToArray();
@@ -202,9 +202,10 @@ private RecipeInference.SuggestedRecipe.SuggestedLearner[] GetTopLearners(IEnume
202202
.Select(t=>AvailableLearners[t.Index]).ToArray();
203203
}
204204

205-
public override PipelinePattern[] GetNextCandidates(IEnumerable<PipelinePattern> history, int numCandidates)
205+
public override PipelinePattern[] GetNextCandidates(IEnumerable<PipelinePattern> history, int numCandidates, RoleMappedData dataRoles)
206206
{
207207
var prevCandidates = history.ToArray();
208+
DataRoles = dataRoles;
208209

209210
switch (_currentStage)
210211
{
@@ -220,7 +221,7 @@ public override PipelinePattern[] GetNextCandidates(IEnumerable<PipelinePattern>
220221
// number of candidates, using second stage logic.
221222
UpdateLearners(GetTopLearners(prevCandidates));
222223
_currentStage++;
223-
return GetNextCandidates(prevCandidates, numCandidates);
224+
return GetNextCandidates(prevCandidates, numCandidates, DataRoles);
224225
}
225226
else
226227
return GetInitialPipelines(prevCandidates, remainingNum);
@@ -252,9 +253,11 @@ public override PipelinePattern[] GetNextCandidates(IEnumerable<PipelinePattern>
252253
}
253254
}
254255

255-
private PipelinePattern[] GetInitialPipelines(IEnumerable<PipelinePattern> history, int numCandidates) =>
256-
_secondaryEngines[_randomInit ? nameof(UniformRandomEngine) : nameof(DefaultsEngine)]
257-
.GetNextCandidates(history, numCandidates);
256+
private PipelinePattern[] GetInitialPipelines(IEnumerable<PipelinePattern> history, int numCandidates)
257+
{
258+
var engine = _secondaryEngines[_randomInit ? nameof(UniformRandomEngine) : nameof(DefaultsEngine)];
259+
return engine.GetNextCandidates(history, numCandidates, DataRoles);
260+
}
258261

259262
private PipelinePattern[] NextCandidates(PipelinePattern[] history, int numCandidates,
260263
bool defaultHyperParams = false, bool uniformRandomTransforms = false)
@@ -294,8 +297,9 @@ private PipelinePattern[] NextCandidates(PipelinePattern[] history, int numCandi
294297
do
295298
{ // Make sure transforms set is valid and have not seen pipeline before.
296299
// Repeat until passes or runs out of chances.
297-
pipeline = new PipelinePattern(SampleTransforms(learner, history,
298-
out var transformsBitMask, uniformRandomTransforms), learner, "", Env);
300+
pipeline = new PipelinePattern(
301+
SampleTransforms(learner, history, out var transformsBitMask, uniformRandomTransforms),
302+
learner, "", Env);
299303
hashKey = GetHashKey(transformsBitMask, learner);
300304
valid = PipelineVerifier(pipeline, transformsBitMask) && !VisitedPipelines.Contains(hashKey);
301305
count++;

src/Microsoft.ML.PipelineInference/AutoMlEngines/UniformRandomEngine.cs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using System;
66
using System.Collections.Generic;
77
using System.Linq;
8+
using Microsoft.ML.Runtime.Data;
89
using Microsoft.ML.Runtime.EntryPoints;
910
using Microsoft.ML.Runtime.PipelineInference;
1011

@@ -30,8 +31,9 @@ public UniformRandomEngine(IHostEnvironment env)
3031
: base(env, env.Register("UniformRandomEngine(AutoML)"))
3132
{}
3233

33-
public override PipelinePattern[] GetNextCandidates(IEnumerable<PipelinePattern> history, int numberOfCandidates)
34+
public override PipelinePattern[] GetNextCandidates(IEnumerable<PipelinePattern> history, int numberOfCandidates, RoleMappedData dataRoles)
3435
{
36+
DataRoles = dataRoles;
3537
return GetRandomPipelines(numberOfCandidates);
3638
}
3739

@@ -66,7 +68,7 @@ private PipelinePattern[] GetRandomPipelines(int numOfPipelines)
6668

6769
// Always include features concat transform
6870
selectedTransforms.AddRange(AutoMlUtils.GetFinalFeatureConcat(Env, FullyTransformedData,
69-
DependencyMapping, selectedTransforms.ToArray(), AvailableTransforms));
71+
DependencyMapping, selectedTransforms.ToArray(), AvailableTransforms, DataRoles));
7072

7173
// Compute hash key for checking if we've already seen this pipeline.
7274
// However, if we keep missing, don't want to get stuck in infinite loop.

src/Microsoft.ML.PipelineInference/AutoMlUtils.cs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,7 @@ public static long TransformsToBitmask(TransformInference.SuggestedTransform[] t
257257
/// (In other words, if there would be nothing for that concatenate transform to do.)
258258
/// </summary>
259259
private static TransformInference.SuggestedTransform[] GetFinalFeatureConcat(IHostEnvironment env,
260-
IDataView dataSample, int[] excludedColumnIndices, int level, int atomicIdOffset)
260+
IDataView dataSample, int[] excludedColumnIndices, int level, int atomicIdOffset, RoleMappedData dataRoles)
261261
{
262262
var finalArgs = new TransformInference.Arguments
263263
{
@@ -266,7 +266,7 @@ private static TransformInference.SuggestedTransform[] GetFinalFeatureConcat(IHo
266266
ExcludedColumnIndices = excludedColumnIndices
267267
};
268268

269-
var featuresConcatTransforms = TransformInference.InferConcatNumericFeatures(env, dataSample, finalArgs);
269+
var featuresConcatTransforms = TransformInference.InferConcatNumericFeatures(env, dataSample, finalArgs, dataRoles);
270270

271271
for (int i = 0; i < featuresConcatTransforms.Length; i++)
272272
{
@@ -282,7 +282,7 @@ private static TransformInference.SuggestedTransform[] GetFinalFeatureConcat(IHo
282282
/// </summary>
283283
public static TransformInference.SuggestedTransform[] GetFinalFeatureConcat(IHostEnvironment env, IDataView data,
284284
AutoInference.DependencyMap dependencyMapping, TransformInference.SuggestedTransform[] selectedTransforms,
285-
TransformInference.SuggestedTransform[] allTransforms)
285+
TransformInference.SuggestedTransform[] allTransforms, RoleMappedData dataRoles)
286286
{
287287
int level = 1;
288288
int atomicGroupLimit = 0;
@@ -292,7 +292,7 @@ public static TransformInference.SuggestedTransform[] GetFinalFeatureConcat(IHos
292292
atomicGroupLimit = allTransforms.Max(t => t.AtomicGroupId) + 1;
293293
}
294294
var excludedColumnIndices = GetExcludedColumnIndices(selectedTransforms, data, dependencyMapping);
295-
return GetFinalFeatureConcat(env, data, excludedColumnIndices, level, atomicGroupLimit);
295+
return GetFinalFeatureConcat(env, data, excludedColumnIndices, level, atomicGroupLimit, dataRoles);
296296
}
297297

298298
public static IDataView ApplyTransformSet(IHostEnvironment env, IDataView data, TransformInference.SuggestedTransform[] transforms)

src/Microsoft.ML.PipelineInference/InferenceUtils.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ public static Type InferPredictorCategoryType(IDataView data, PurposeInference.C
8383
label.ItemKind == DataKind.TX ||
8484
data.Schema.GetColumnType(label.ColumnIndex).IsKey)
8585
{
86-
if (columns.Any(col => col.Purpose == ColumnPurpose.GroupId))
86+
if (columns.Any(col => col.Purpose == ColumnPurpose.Group))
8787
return typeof(SignatureRankerTrainer);
8888
else
8989
return typeof(SignatureMultiClassClassifierTrainer);
@@ -177,7 +177,7 @@ public enum ColumnPurpose
177177
CategoricalFeature = 4,
178178
TextFeature = 5,
179179
Weight = 6,
180-
GroupId = 7,
180+
Group = 7,
181181
ImagePath = 8
182182
}
183183
}

src/Microsoft.ML.PipelineInference/Interfaces/IPipelineOptimizer.cs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ namespace Microsoft.ML.Runtime.PipelineInference
2121
/// </summary>
2222
public interface IPipelineOptimizer
2323
{
24-
PipelinePattern[] GetNextCandidates(IEnumerable<PipelinePattern> history, int numberOfCandidates);
24+
PipelinePattern[] GetNextCandidates(IEnumerable<PipelinePattern> history, int numberOfCandidates, RoleMappedData dataRoles);
2525

2626
void SetSpace(TransformInference.SuggestedTransform[] availableTransforms,
2727
RecipeInference.SuggestedRecipe.SuggestedLearner[] availableLearners,
@@ -44,6 +44,7 @@ public abstract class PipelineOptimizerBase : IPipelineOptimizer
4444
protected IDataView OriginalData;
4545
protected IDataView FullyTransformedData;
4646
protected AutoInference.DependencyMap DependencyMapping;
47+
protected RoleMappedData DataRoles;
4748
protected readonly IHostEnvironment Env;
4849
protected readonly IHost Host;
4950
protected readonly Dictionary<long, bool> TransformsMaskValidity;
@@ -60,7 +61,7 @@ protected PipelineOptimizerBase(IHostEnvironment env, IHost host)
6061
ProbUtils = new SweeperProbabilityUtils(host);
6162
}
6263

63-
public abstract PipelinePattern[] GetNextCandidates(IEnumerable<PipelinePattern> history, int numberOfCandidates);
64+
public abstract PipelinePattern[] GetNextCandidates(IEnumerable<PipelinePattern> history, int numberOfCandidates, RoleMappedData dataRoles);
6465

6566
public virtual void SetSpace(TransformInference.SuggestedTransform[] availableTransforms,
6667
RecipeInference.SuggestedRecipe.SuggestedLearner[] availableLearners,

0 commit comments

Comments
 (0)