Skip to content

Debugging hanging AutoFitImageClassificationTrainTest #4893

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 28 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
f4a910e
Free Tensor objects in finally statement
mstfbl Mar 27, 2020
4f8475c
Update RunnerUtil.cs
mstfbl Mar 27, 2020
f7e1337
Re-enable AutoFitImageClassificationTrainTest after fix
mstfbl Mar 30, 2020
83ad312
Added IDisposable support to ModelContainer & corrected name of model…
mstfbl Apr 7, 2020
b366fbe
Corrected name of modelContainer in used cases
mstfbl Apr 7, 2020
cb22b21
Clean up Tensor objects through finalizer/destructor of ImageClassifi…
mstfbl Apr 9, 2020
eefa76f
Dispose ExperimentResult objects at the end
mstfbl Apr 9, 2020
45681b4
Dispose only Tensor objects in models
mstfbl Apr 10, 2020
fbd3fd9
Don't free BestModel models
mstfbl Apr 12, 2020
2816ced
Merge remote-tracking branch 'upstream/master' into AutoFitImageClass…
mstfbl Apr 12, 2020
7dad242
Throw Exception if model is trying to be accessed after disposal
mstfbl Apr 14, 2020
1488d0c
Initialize IsModelDisposed inside constructors
mstfbl Apr 14, 2020
78bba9c
Model always written to disk, no longer stored in memory, simplify mo…
mstfbl Apr 16, 2020
bf84823
Model always written to disk, no longer stored in memory, simplify mo…
mstfbl Apr 16, 2020
15b6135
Merge branch 'AutoFitImageClassificationTrainTest-memoryFix' of https…
mstfbl Apr 16, 2020
087c0d5
Update ModelContainer.cs
mstfbl Apr 16, 2020
d633c5c
Run AutoFitImageClassificationTrainTest 100 times with latest update
mstfbl Apr 16, 2020
bb4a8b2
Restart build
mstfbl Apr 16, 2020
43f5f2a
Test latest changes
mstfbl Apr 16, 2020
0350890
Merge branch 'AutoFitTests-Debugging' of https://github.com/mstfbl/ma…
mstfbl Apr 16, 2020
f9adbef
Update .vsts-dotnet-ci.yml
mstfbl Apr 16, 2020
de8a567
Dispose of models using "using", and free model after saving to disk
mstfbl Apr 16, 2020
eec4c42
Dispose model in RunnerUtil.cs
mstfbl Apr 16, 2020
dadc173
Test directly disposing models when models can still be in memory
mstfbl Apr 16, 2020
e0864a3
Test directly disposing models when models can still be in memory - 2
mstfbl Apr 16, 2020
675ee13
Add test case for .DisposeRunDetails and get memory info
mstfbl Apr 16, 2020
146bc40
get memory info
mstfbl Apr 16, 2020
1e3e985
Get memory info in Windows and UNIX builds
mstfbl Apr 16, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 12 additions & 16 deletions .vsts-dotnet-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,28 +27,31 @@ jobs:
_config_short: RI
_includeBenchmarkData: true
_targetFramework: netcoreapp3.1
innerLoop: true
innerLoop: false
pool:
name: NetCorePublic-Pool
queue: BuildPool.Ubuntu.1604.Amd64.Open
runSpecific: true

- template: /build/ci/job-template.yml
parameters:
name: Ubuntu_x64_NetCoreApp21
buildScript: ./build.sh
container: UbuntuContainer
innerLoop: true
innerLoop: false
pool:
name: NetCorePublic-Pool
queue: BuildPool.Ubuntu.1604.Amd64.Open
runSpecific: true

- template: /build/ci/job-template.yml
parameters:
name: MacOS_x64_NetCoreApp21
buildScript: ./build.sh
innerLoop: true
innerLoop: false
pool:
name: Hosted macOS
runSpecific: true

- template: /build/ci/job-template.yml
parameters:
Expand All @@ -65,19 +68,21 @@ jobs:
_config_short: RI
_includeBenchmarkData: true
_targetFramework: netcoreapp3.1
innerLoop: true
innerLoop: false
vsTestConfiguration: "/Framework:.NETCoreApp,Version=v3.0"
pool:
name: Hosted VS2017
runSpecific: true

- template: /build/ci/job-template.yml
parameters:
name: Windows_x64_NetCoreApp21
buildScript: build.cmd
innerLoop: true
innerLoop: false
vsTestConfiguration: "/Framework:.NETCoreApp,Version=v2.1"
pool:
name: Hosted VS2017
runSpecific: true

- template: /build/ci/job-template.yml
parameters:
Expand All @@ -94,17 +99,8 @@ jobs:
_config_short: RFX
_includeBenchmarkData: false
_targetFramework: win-x64
innerLoop: true
innerLoop: false
vsTestConfiguration: "/Framework:.NETCoreApp,Version=v4.0"
pool:
name: Hosted VS2017

- template: /build/ci/job-template.yml
parameters:
name: Windows_x86_NetCoreApp21
architecture: x86
buildScript: build.cmd
innerLoop: true
vsTestConfiguration: "/Framework:.NETCoreApp,Version=v2.1"
pool:
name: Hosted VS2017
runSpecific: true
2 changes: 1 addition & 1 deletion build/ci/job-template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ parameters:
codeCoverage: false
nightlyBuild: false
innerLoop: false
runSpecific: false
runSpecific: true
container: ''
useVSTestTask: false

Expand Down
3 changes: 3 additions & 0 deletions init-tools.cmd
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ if NOT [%AGENT_ID%] == [] (
reg add "HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows\Windows Error Reporting\LocalDumps" /f /v DumpFolder /t REG_SZ /d "%~dp0CrashDumps"
)

:: Temp - get total RAM size
powershell -Command "(systeminfo | Select-String 'Total Physical Memory:').ToString().Split(':')[1].Trim()"

:: install procdump.exe to take process dump when test crashes, hangs or fails
echo Installing procdump.exe
powershell -Command "Invoke-WebRequest https://download.sysinternals.com/files/Procdump.zip -UseBasicParsing -outfile procdump.zip | Out-Null"
Expand Down
3 changes: 3 additions & 0 deletions init-tools.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ if [ -e "$__BUILD_TOOLS_SEMAPHORE" ]; then
return #return instead of exit because this script is inlined in other scripts which we don't want to exit
fi

# Temp - Get total RAM size
# /proc/meminfo exists only on Linux. NOTE(review): the CI matrix also has a macOS job that
# uses ./build.sh, so this presumably runs there too - fall back to sysctl in that case.
# The diagnostic is best-effort: a failure here must never abort tool initialization.
if [ -r /proc/meminfo ]; then
    cat /proc/meminfo
elif command -v sysctl >/dev/null 2>&1; then
    sysctl -n hw.memsize || true
fi

if [ -e "$__TOOLRUNTIME_DIR" ]; then rm -rf -- "$__TOOLRUNTIME_DIR"; fi

if [ -d "${DotNetBuildToolsDir:-}" ]; then
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using Microsoft.ML.Data;

Expand All @@ -11,7 +12,7 @@ namespace Microsoft.ML.AutoML
/// Result of an AutoML experiment that includes cross validation details.
/// </summary>
/// <typeparam name="TMetrics">Metrics type for the experiment (like <see cref="BinaryClassificationMetrics"/>).</typeparam>
public class CrossValidationExperimentResult<TMetrics>
public class CrossValidationExperimentResult<TMetrics> : IDisposable
{
/// <summary>
/// Details of the cross validation runs in this experiment.
Expand All @@ -36,5 +37,44 @@ internal CrossValidationExperimentResult(IEnumerable<CrossValidationRunDetail<TM
RunDetails = runDetails;
BestRun = bestRun;
}

#region IDisposable Support
private bool _disposed;
private bool _disposedRunDetails;

/// <summary>
/// Disposes the run details of this experiment (unless they were already released through
/// <c>DisposeRunDetails()</c>) and then <c>BestRun</c>, when it implements <see cref="IDisposable"/>.
/// </summary>
/// <remarks>
/// Intended to clean up the remaining C library Tensor objects held by the models and so
/// avoid a memory leak. Only the first call does any work; later calls return immediately.
/// </remarks>
public void Dispose()
{
    if (!_disposed)
    {
        // Run details may already have been released separately by the caller.
        if (!_disposedRunDetails)
            DisposeRunDetails();

        if (BestRun is IDisposable disposableBestRun)
            disposableBestRun.Dispose();

        _disposed = true;
    }
}

/// <summary>
/// Disposes the individual runs in <c>RunDetails</c>, leaving <c>BestRun</c> untouched.
/// </summary>
/// <remarks>
/// Intended to clean up the remaining C library Tensor objects held by the models of the
/// experimental runs and so avoid a memory leak. Compared to <c>Dispose()</c>, this method
/// skips the run referenced by <c>BestRun</c>, so that the best determined model can have a
/// different lifetime than the models of the other runs. Calling it again, or calling it
/// after <c>Dispose()</c>, has no effect.
/// </remarks>
public void DisposeRunDetails()
{
    if (_disposedRunDetails || _disposed)
        return;

    if (RunDetails != null)
    {
        // RunDetails is supplied to the constructor as an IEnumerable; ordinary collections
        // do not implement IDisposable, so casting only the sequence would release nothing.
        // Each run is therefore disposed individually (a no-op for runs that are not IDisposable).
        // NOTE(review): this enumerates RunDetails once; assumes the sequence is already
        // materialized rather than a lazy query - confirm against the callers.
        foreach (var runDetail in RunDetails)
        {
            // BestRun is presumably one of the runs; it is released by Dispose() instead.
            if (ReferenceEquals(runDetail, BestRun))
                continue;
            (runDetail as IDisposable)?.Dispose();
        }
    }

    // Keep the original behavior for a sequence that is itself disposable.
    (RunDetails as IDisposable)?.Dispose();
    _disposedRunDetails = true;
}
#endregion
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using Microsoft.ML.Data;

Expand All @@ -11,7 +12,7 @@ namespace Microsoft.ML.AutoML
/// Result of an AutoML experiment.
/// </summary>
/// <typeparam name="TMetrics">Metrics type for the experiment (like <see cref="BinaryClassificationMetrics"/>).</typeparam>
public class ExperimentResult<TMetrics>
public class ExperimentResult<TMetrics> : IDisposable
{
/// <summary>
/// Details of the runs in this experiment.
Expand All @@ -36,5 +37,44 @@ internal ExperimentResult(IEnumerable<RunDetail<TMetrics>> runDetails,
RunDetails = runDetails;
BestRun = bestRun;
}

#region IDisposable Support
private bool _disposed;
private bool _disposedRunDetails;

/// <summary>
/// Disposes the run details of this experiment (unless they were already released through
/// <c>DisposeRunDetails()</c>) and then <c>BestRun</c>, when it implements <see cref="IDisposable"/>.
/// </summary>
/// <remarks>
/// Intended to clean up the remaining C library Tensor objects held by the models and so
/// avoid a memory leak. Only the first call does any work; later calls return immediately.
/// </remarks>
public void Dispose()
{
    if (!_disposed)
    {
        // Run details may already have been released separately by the caller.
        if (!_disposedRunDetails)
            DisposeRunDetails();

        if (BestRun is IDisposable disposableBestRun)
            disposableBestRun.Dispose();

        _disposed = true;
    }
}

/// <summary>
/// Disposes the individual runs in <c>RunDetails</c>, leaving <c>BestRun</c> untouched.
/// </summary>
/// <remarks>
/// Intended to clean up the remaining C library Tensor objects held by the models of the
/// experimental runs and so avoid a memory leak. Compared to <c>Dispose()</c>, this method
/// skips the run referenced by <c>BestRun</c>, so that the best determined model can have a
/// different lifetime than the models of the other runs. Calling it again, or calling it
/// after <c>Dispose()</c>, has no effect.
/// </remarks>
public void DisposeRunDetails()
{
    if (_disposedRunDetails || _disposed)
        return;

    if (RunDetails != null)
    {
        // RunDetails is supplied to the constructor as an IEnumerable; ordinary collections
        // do not implement IDisposable, so casting only the sequence would release nothing.
        // Each run is therefore disposed individually (a no-op for runs that are not IDisposable).
        // NOTE(review): this enumerates RunDetails once; assumes the sequence is already
        // materialized rather than a lazy query - confirm against the callers.
        foreach (var runDetail in RunDetails)
        {
            // BestRun is presumably one of the runs; it is released by Dispose() instead.
            if (ReferenceEquals(runDetail, BestRun))
                continue;
            (runDetail as IDisposable)?.Dispose();
        }
    }

    // Keep the original behavior for a sequence that is itself disposable.
    (RunDetails as IDisposable)?.Dispose();
    _disposedRunDetails = true;
}
#endregion
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ public CrossValRunner(MLContext context,
var modelFileInfo = RunnerUtil.GetModelFileInfo(modelDirectory, iterationNum, i + 1);
var trainResult = RunnerUtil.TrainAndScorePipeline(_context, pipeline, _trainDatasets[i], _validDatasets[i],
_labelColumn, _metricsAgent, _preprocessorTransforms?[i], modelFileInfo, _modelInputSchema, _logger);
trainResults.Add(new SuggestedPipelineTrainResult<TMetrics>(trainResult.model, trainResult.metrics, trainResult.exception, trainResult.score));
trainResults.Add(new SuggestedPipelineTrainResult<TMetrics>(trainResult.modelContainer, trainResult.metrics, trainResult.exception, trainResult.score));
}

var avgScore = CalcAverageScore(trainResults.Select(r => r.Score));
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.AutoML/Experiment/Runners/RunnerUtil.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ namespace Microsoft.ML.AutoML
{
internal static class RunnerUtil
{
public static (ModelContainer model, TMetrics metrics, Exception exception, double score)
public static (ModelContainer modelContainer, TMetrics metrics, Exception exception, double score)
TrainAndScorePipeline<TMetrics>(MLContext context,
SuggestedPipeline pipeline,
IDataView trainData,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ public TrainValidateRunner(MLContext context,
trainResult.score,
trainResult.exception == null,
trainResult.metrics,
trainResult.model,
trainResult.modelContainer,
trainResult.exception);
var runDetail = suggestedPipelineRunDetail.ToIterationResult(_preFeaturizer);
return (suggestedPipelineRunDetail, runDetail);
Expand Down
46 changes: 36 additions & 10 deletions test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
using static Microsoft.ML.DataOperationsCatalog;
using Microsoft.ML.TestFramework;
using Xunit.Abstractions;
using Microsoft.ML.TestFrameworkCommon.Attributes;
using System.Diagnostics;

namespace Microsoft.ML.AutoML.Test
{
Expand All @@ -29,7 +31,7 @@ public void AutoFitBinaryTest()
var columnInference = context.Auto().InferColumns(dataPath, DatasetUtil.UciAdultLabel);
var textLoader = context.Data.CreateTextLoader(columnInference.TextLoaderOptions);
var trainData = textLoader.Load(dataPath);
var result = context.Auto()
using var result = context.Auto()
.CreateBinaryClassificationExperiment(0)
.Execute(trainData, new ColumnInformation() { LabelColumnName = DatasetUtil.UciAdultLabel });
Assert.True(result.BestRun.ValidationMetrics.Accuracy > 0.70);
Expand All @@ -38,26 +40,50 @@ public void AutoFitBinaryTest()
Assert.NotNull(result.BestRun.TrainerName);
}

/// <summary>
/// Runs a 15 second binary classification experiment on the UCI Adult dataset, releases the
/// run details through DisposeRunDetails(), and checks that the best run is still usable.
/// </summary>
[Fact]
public void AutoFitBinaryTestDisposeNonBestModels()
{
    var mlContext = new MLContext(1);
    var adultDataPath = DatasetUtil.GetUciAdultDataset();
    var inferredColumns = mlContext.Auto().InferColumns(adultDataPath, DatasetUtil.UciAdultLabel);
    var loader = mlContext.Data.CreateTextLoader(inferredColumns.TextLoaderOptions);
    var trainData = loader.Load(adultDataPath);

    // A total experiment time of 15 seconds is expected to yield more than one model.
    var experiment = mlContext.Auto().CreateBinaryClassificationExperiment(15);
    var columnInformation = new ColumnInformation() { LabelColumnName = DatasetUtil.UciAdultLabel };
    using var result = experiment.Execute(trainData, columnInformation);

    Assert.True(result.RunDetails.Count() > 1);

    // Release the models of the runs that did not yield the best result.
    result.DisposeRunDetails();

    var bestRun = result.BestRun;
    Assert.True(bestRun.ValidationMetrics.Accuracy > 0.70);
    Assert.NotNull(bestRun.Estimator);
    Assert.NotNull(bestRun.Model);
    Assert.NotNull(bestRun.TrainerName);
}

[Fact]
public void AutoFitMultiTest()
{
var context = new MLContext(42);
var columnInference = context.Auto().InferColumns(DatasetUtil.TrivialMulticlassDatasetPath, DatasetUtil.TrivialMulticlassDatasetLabel);
var textLoader = context.Data.CreateTextLoader(columnInference.TextLoaderOptions);
var trainData = textLoader.Load(DatasetUtil.TrivialMulticlassDatasetPath);
var result = context.Auto()
using var result = context.Auto()
.CreateMulticlassClassificationExperiment(0)
.Execute(trainData, 5, DatasetUtil.TrivialMulticlassDatasetLabel);
Assert.True(result.BestRun.Results.First().ValidationMetrics.MicroAccuracy >= 0.7);
var scoredData = result.BestRun.Results.First().Model.Transform(trainData);
Assert.Equal(NumberDataViewType.Single, scoredData.Schema[DefaultColumnNames.PredictedLabel].Type);
}

[TensorFlowFact]
//Skipping test temporarily. This test will be re-enabled once the cause of failures has been determined
[Trait("Category", "SkipInCI")]
public void AutoFitImageClassificationTrainTest()
[Theory, TestCategory("RunSpecificTest"), IterationData(100)]
public void AutoFitImageClassificationTrainTest(int iterations)
{
Console.WriteLine(String.Format("AutoFitImageClassificationTrainTest Iteration: {0}", iterations));
Process proc = Process.GetCurrentProcess();
Console.WriteLine(String.Format("Iteration {0} - Total memory usage in GBs (proc): {1}", iterations, proc.PrivateMemorySize64 / (1024.0 * 1024.0 * 1024.0)));
proc.Dispose();
Console.WriteLine(String.Format("Iteration {0} - Total memory usage in GBs (GC): {1}", iterations, GC.GetTotalMemory(false) / (1024.0 * 1024.0 * 1024.0)));
var context = new MLContext(seed: 1);
var datasetPath = DatasetUtil.GetFlowersDataset();
var columnInference = context.Auto().InferColumns(datasetPath, "Label");
Expand All @@ -67,7 +93,7 @@ public void AutoFitImageClassificationTrainTest()
TrainTestData trainTestData = context.Data.TrainTestSplit(trainData, testFraction: 0.2, seed: 1);
IDataView trainDataset = SplitUtil.DropAllColumnsExcept(context, trainTestData.TrainSet, originalColumnNames);
IDataView testDataset = SplitUtil.DropAllColumnsExcept(context, trainTestData.TestSet, originalColumnNames);
var result = context.Auto()
using var result = context.Auto()
.CreateMulticlassClassificationExperiment(0)
.Execute(trainDataset, testDataset, columnInference.ColumnInformation);

Expand All @@ -89,7 +115,7 @@ public void AutoFitImageClassification()
var columnInference = context.Auto().InferColumns(datasetPath, "Label");
var textLoader = context.Data.CreateTextLoader(columnInference.TextLoaderOptions);
var trainData = textLoader.Load(datasetPath);
var result = context.Auto()
using var result = context.Auto()
.CreateMulticlassClassificationExperiment(0)
.Execute(trainData, columnInference.ColumnInformation);

Expand All @@ -113,7 +139,7 @@ public void AutoFitRegressionTest()
var trainData = textLoader.Load(dataPath);
var validationData = context.Data.TakeRows(trainData, 20);
trainData = context.Data.SkipRows(trainData, 20);
var result = context.Auto()
using var result = context.Auto()
.CreateRegressionExperiment(0)
.Execute(trainData, validationData,
new ColumnInformation() { LabelColumnName = DatasetUtil.MlNetGeneratedRegressionLabel });
Expand All @@ -137,7 +163,7 @@ public void AutoFitRecommendationTest()
var testDataView = reader.Load(new MultiFileSource(GetDataPath(TestDatasets.trivialMatrixFactorization.testFilename)));

// STEP 2: Run AutoML experiment
ExperimentResult<RegressionMetrics> experimentResult = mlContext.Auto()
using ExperimentResult<RegressionMetrics> experimentResult = mlContext.Auto()
.CreateRecommendationExperiment(5)
.Execute(trainDataView, testDataView,
new ColumnInformation()
Expand Down