-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Adding Debugging Scenario tests for V1 APIs #2937
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c312ed5
ff593be
2006be7
82b5729
1a10d27
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,214 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
using System.Collections.Generic; | ||
using Microsoft.ML.Data; | ||
using Microsoft.ML.Functional.Tests.Datasets; | ||
using Microsoft.ML.RunTests; | ||
using Microsoft.ML.TestFramework; | ||
using Microsoft.ML.Trainers; | ||
using Microsoft.ML.Transforms.Text; | ||
using Xunit; | ||
using Xunit.Abstractions; | ||
|
||
namespace Microsoft.ML.Functional.Tests | ||
{ | ||
public class Debugging : BaseTestClass | ||
{ | ||
public Debugging(ITestOutputHelper output) : base(output) | ||
{ | ||
} | ||
|
||
/// <summary> | ||
/// Debugging: The individual pipeline steps can be inspected to see what is happening to | ||
/// data as it flows through. | ||
/// </summary> | ||
/// <remarks> | ||
/// It should, possibly through the debugger, be not such a pain to actually | ||
/// see what is happening to your data when you apply this or that transform. For example, if I | ||
/// were to have the text "Help I'm a bug!" I should be able to see the steps where it is | ||
/// normalized to "help i'm a bug" then tokenized into ["help", "i'm", "a", "bug"] then | ||
/// mapped into term numbers [203, 25, 3, 511] then projected into the sparse | ||
/// float vector {3:1, 25:1, 203:1, 511:1}, etc. etc. | ||
/// </remarks> | ||
[Fact] | ||
void InspectIntermediatePipelineSteps() | ||
{ | ||
var mlContext = new MLContext(seed: 1); | ||
|
||
var data = mlContext.Data.LoadFromEnumerable<TweetSentiment>( | ||
new TweetSentiment[] | ||
{ | ||
new TweetSentiment { Sentiment = true, SentimentText = "I love ML.NET." }, | ||
new TweetSentiment { Sentiment = true, SentimentText = "I love TLC." }, | ||
new TweetSentiment { Sentiment = false, SentimentText = "I dislike fika." } | ||
}); | ||
|
||
// create a training pipeline. | ||
var pipeline = mlContext.Transforms.Text.FeaturizeText( | ||
"Features", | ||
new TextFeaturizingEstimator.Options | ||
{ | ||
KeepPunctuations = false, | ||
OutputTokens = true, | ||
CharFeatureExtractor = null, // new WordBagEstimator.Options { NgramLength = 0, SkipLength = -1 }, | ||
WordFeatureExtractor = new WordBagEstimator.Options { NgramLength = 1}, | ||
Norm = TextFeaturizingEstimator.NormFunction.None | ||
}, | ||
"SentimentText"); | ||
|
||
// Fit the pipeline to the data. | ||
var model = pipeline.Fit(data); | ||
|
||
// Transform the data. | ||
var transformedData = model.Transform(data); | ||
|
||
var preview = transformedData.Preview(); | ||
|
||
// Verify that columns can be inspected. | ||
// Validate the tokens column. | ||
var tokensColumn = transformedData.GetColumn<string[]>(transformedData.Schema["Features_TransformedText"]); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
where is this magic string coming from? #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This magic string is the tokenized column. It takes the name you give it, and returns In reply to: 265701391 [](ancestors = 265701391) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
var expectedTokens = new string[3][] | ||
{ | ||
new string[] {"i", "love", "mlnet"}, | ||
new string[] {"i", "love", "tlc"}, | ||
new string[] {"i", "dislike", "fika"}, | ||
}; | ||
int i = 0; | ||
foreach (var rowTokens in tokensColumn) | ||
Assert.Equal(expectedTokens[i++], rowTokens); | ||
|
||
// Validate the Features column. | ||
var featuresColumn = transformedData.GetColumn<float[]>(transformedData.Schema["Features"]); | ||
var expectedFeatures = new float[3][] | ||
{ | ||
new float[6] { 1, 1, 1, 0, 0 ,0 }, | ||
new float[6] { 1, 1, 0, 1, 0, 0 }, | ||
new float[6] { 1, 0, 0, 0, 1, 1 } | ||
}; | ||
i = 0; | ||
foreach (var rowFeatures in featuresColumn) | ||
Assert.Equal(expectedFeatures[i++], rowFeatures); | ||
} | ||
|
||
/// <summary> | ||
/// Debugging: The schema of the pipeline can be inspected. | ||
/// </summary> | ||
[Fact] | ||
public void InspectPipelineSchema() | ||
{ | ||
var mlContext = new MLContext(seed: 1); | ||
|
||
// Get the dataset. | ||
var data = mlContext.Data.LoadFromTextFile<HousingRegression>(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true); | ||
|
||
// Define a pipeline | ||
var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features) | ||
.Append(mlContext.Transforms.Normalize()) | ||
.AppendCacheCheckpoint(mlContext) | ||
.Append(mlContext.Regression.Trainers.Sdca( | ||
new SdcaRegressionTrainer.Options { NumberOfThreads = 1, MaximumNumberOfIterations = 20 })); | ||
|
||
// Fit the pipeline to the data. | ||
var model = pipeline.Fit(data); | ||
|
||
// Inspect the model schema, and verify that a Score column is produced. | ||
var outputSchema = model.GetOutputSchema(data.Schema); | ||
var columnNames = new string[outputSchema.Count]; | ||
int i = 0; | ||
foreach (var column in outputSchema) | ||
columnNames[i++] = column.Name; | ||
Assert.Contains("Score", columnNames); | ||
} | ||
|
||
/// <summary> | ||
/// Debugging: The schema read in can be verified by inspecting the data. | ||
/// </summary> | ||
[Fact] | ||
public void InspectSchemaUponLoadingData() | ||
{ | ||
var mlContext = new MLContext(seed: 1); | ||
|
||
// Get the dataset. | ||
var data = mlContext.Data.LoadFromTextFile<HousingRegression>(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true); | ||
|
||
// Verify the column names. | ||
int i = 0; | ||
foreach (var column in data.Schema) | ||
{ | ||
if (i == 0) | ||
Assert.Equal("Label", column.Name); | ||
else | ||
Assert.Equal(HousingRegression.Features[i-1], column.Name); | ||
i++; | ||
} | ||
|
||
// Verify that I can cast it to the right schema by inspecting the first row. | ||
foreach (var row in mlContext.Data.CreateEnumerable<HousingRegression>(mlContext.Data.TakeRows(data, 1), true)) | ||
{ | ||
// Validate there was data in the row by checking that some values were not zero since zero is the default. | ||
var rowSum = row.MedianHomeValue; | ||
foreach (var property in HousingRegression.Features) | ||
rowSum += (float) row.GetType().GetProperty(property).GetValue(row, null); | ||
|
||
Assert.NotEqual(0, rowSum); | ||
} | ||
} | ||
|
||
/// <summary> | ||
/// Debugging: The progress of training can be accessed. | ||
/// </summary> | ||
[Fact] | ||
public void ViewTrainingOutput() | ||
{ | ||
var mlContext = new MLContext(seed: 1); | ||
|
||
// Attach a listener. | ||
var logWatcher = new LogWatcher(); | ||
mlContext.Log += logWatcher.ObserveEvent; | ||
|
||
// Get the dataset. | ||
var data = mlContext.Data.LoadFromTextFile<HousingRegression>(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true); | ||
|
||
// Define a pipeline | ||
var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features) | ||
.Append(mlContext.Transforms.Normalize()) | ||
.AppendCacheCheckpoint(mlContext) | ||
.Append(mlContext.Regression.Trainers.Sdca( | ||
new SdcaRegressionTrainer.Options { NumberOfThreads = 1, MaximumNumberOfIterations = 20 })); | ||
|
||
// Fit the pipeline to the data. | ||
var model = pipeline.Fit(data); | ||
|
||
// Validate that we can read lines from the file. | ||
var expectedLines = new string[3] { | ||
@"[Source=SdcaTrainerBase; Training, Kind=Info] Auto-tuning parameters: L2 = 0.001.", | ||
@"[Source=SdcaTrainerBase; Training, Kind=Info] Auto-tuning parameters: L1Threshold (L1/L2) = 0.", | ||
@"[Source=SdcaTrainerBase; Training, Kind=Info] Using best model from iteration 7."}; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this text guaranteed to be constant, across runs and OSs? #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good question! Fixed seed gives consistency across runs; tests pass across OS signifies that that is guaranteed too. In reply to: 265703890 [](ancestors = 265703890) |
||
foreach (var line in expectedLines) | ||
{ | ||
Assert.Contains(line, logWatcher.Lines); | ||
Assert.Equal(1, logWatcher.Lines[line]); | ||
} | ||
} | ||
|
||
internal class LogWatcher { | ||
|
||
public readonly IDictionary<string, int> Lines; | ||
|
||
public LogWatcher() | ||
{ | ||
Lines = new Dictionary<string, int>(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
do we need a dictionary for this ? #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good question. I want to hold all the lines, keep a count of how many times I see a unique set of characters, and have the lookup be quick. A hash seems like the way to go, and a What do you think? I could also reverse the lookup, so that we have a dictionary with the lines of choice in them, and use In reply to: 265409282 [](ancestors = 265409282) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll keep as is because perf tradeoff is nominal. In reply to: 265410723 [](ancestors = 265410723,265409282) |
||
} | ||
|
||
public void ObserveEvent(object sender, LoggingEventArgs e) | ||
{ | ||
if (Lines.ContainsKey(e.Message)) | ||
Lines[e.Message]++; | ||
else | ||
Lines[e.Message] = 1; | ||
} | ||
} | ||
} | ||
} |
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
? #Resolved
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Tiny little cakes?
In reply to: 265409347 [](ancestors = 265409347)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
lol, Tender Loving Care
In reply to: 265411100 [](ancestors = 265411100,265409347)