From 95b34efcd3f66221f1ed1e8cb0e193f2899e8de1 Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Wed, 17 Oct 2018 16:36:44 -0700 Subject: [PATCH 01/13] Placeholder methods for all P1 transforms, InputClass for the SentimentData and sample for the text transform. --- .../DynamicTransformers.cs | 71 +++++++++++++++++++ docs/samples/Microsoft.ML.Samples/Program.cs | 5 +- .../SamplesDatasetUtils.cs | 16 +++++ 3 files changed, 90 insertions(+), 2 deletions(-) create mode 100644 docs/samples/Microsoft.ML.Samples/DynamicTransformers.cs diff --git a/docs/samples/Microsoft.ML.Samples/DynamicTransformers.cs b/docs/samples/Microsoft.ML.Samples/DynamicTransformers.cs new file mode 100644 index 0000000000..128abf630c --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/DynamicTransformers.cs @@ -0,0 +1,71 @@ +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Api; +using Microsoft.ML.Data; +using System; +using System.Linq; +using System.Collections.Generic; +using Microsoft.ML.StaticPipe; + +namespace Microsoft.ML.Samples +{ + public class DynamicTransformers + { + + public static void KeyToValue() + { + + } + + public static void Concat() + { + + } + + public static void Term() + { + + } + + public static void TextTransform() + { + // Create a new environment for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var env = new LocalEnvironment(); + + IEnumerable data = SamplesUtils.DatasetUtils.GetSentimentData(); + + // Preview of the data. + // Sentiment SentimentText + // true This is the best game I've ever played. + // false ==RUDE== Dude, you are rude upload that picture back, or else. 
+ // true Until the next game comes out, this game is undisputedly the best Xbox game of all time + + var trainData = env.CreateStreamingDataView(data); + + var learningPipeline = new TextTransform(env, "SentimentText", "TextFeatures"); + + var transformedData = learningPipeline.Fit(trainData).Transform(trainData); + + var textFeaturesColumn = transformedData.GetColumn>(env, "TextFeatures").ToArray(); + + // Preview of the transformedData. + Console.WriteLine("TextFeatures column obtained post-transformation."); + foreach (var featureRow in textFeaturesColumn) + { + foreach (var value in featureRow.Values) + Console.Write($"{value} "); + Console.WriteLine(""); + } + + //Transformed data + // 0.2581989 0.2581989 0.2581989 0.2581989 0.5163978 0.2581989 0.2581989 0.2581989 0.2581989 0.2581989 0.2581989 0.2581989 0.7071068 0.7071068 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 + // 0.2581989 0.2581989 0.2581989 0.2581989 0.5163978 0.2581989 0.2581989 0.2581989 0.2581989 0.2581989 0.2581989 0.2581989 0.7071068 0.7071068 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 + // 0 0.1230915 0.1230915 0.1230915 0.1230915 0.246183 0.246183 0.246183 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.3692745 0.246183 0.246183 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.246183 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.2886751 0 0 0 0 0 0 0.2886751 0.5773503 0.2886751 0.2886751 0.2886751 0.2886751 0.2886751 0.2886751 + } + + public static void MinMaxNormalizer() + { + + } + } +} diff --git 
a/docs/samples/Microsoft.ML.Samples/Program.cs b/docs/samples/Microsoft.ML.Samples/Program.cs index c83d9ee898..54a87cee09 100644 --- a/docs/samples/Microsoft.ML.Samples/Program.cs +++ b/docs/samples/Microsoft.ML.Samples/Program.cs @@ -8,8 +8,9 @@ internal static class Program { static void Main(string[] args) { - Trainers.SdcaRegression(); - Transformers.ConcatEstimator(); + //Trainers.SdcaRegression(); + //Transformers.ConcatEstimator(); + DynamicTransformers.TextTransform(); } } } diff --git a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs index 7a3a887524..9e36c4845c 100644 --- a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs @@ -43,5 +43,21 @@ public static IEnumerable GetInputData() return data; } + + public class SampleSentimentData + { + public bool Sentiment { get; set; } + public string SentimentText { get; set; } + } + + public static IEnumerable GetSentimentData() + { + var data = new List(); + data.Add(new SampleSentimentData { Sentiment = true, SentimentText = "Best game I've ever played." }); + data.Add(new SampleSentimentData { Sentiment = false, SentimentText = "==RUDE== Dude" }); + data.Add(new SampleSentimentData { Sentiment = true, SentimentText = "Until the next game, this is the best Xbox game!" }); + + return data; + } } } From f6578a655df56edfd00c2dd0e00dd532a9a762a6 Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Thu, 18 Oct 2018 16:07:39 -0700 Subject: [PATCH 02/13] adding airquality, infert datasets. Added samples for Term and KeyToVal estimators. 
--- .../ConvertCatalogTransformers.cs | 106 ++++++ .../DynamicTransformers.cs | 71 ---- docs/samples/Microsoft.ML.Samples/Program.cs | 2 +- .../TextCatalogTransformers.cs | 97 +++++ .../SamplesDatasetUtils.cs | 20 +- .../TextTransformCatalog.cs | 2 + test/data/gplv2/COPYING.txt | 345 ++++++++++++++++++ test/data/gplv2/airquality.csv | 154 ++++++++ test/data/gplv2/infert.csv | 249 +++++++++++++ test/data/topics.csv | 9 + 10 files changed, 982 insertions(+), 73 deletions(-) create mode 100644 docs/samples/Microsoft.ML.Samples/ConvertCatalogTransformers.cs delete mode 100644 docs/samples/Microsoft.ML.Samples/DynamicTransformers.cs create mode 100644 docs/samples/Microsoft.ML.Samples/TextCatalogTransformers.cs create mode 100644 test/data/gplv2/COPYING.txt create mode 100644 test/data/gplv2/airquality.csv create mode 100644 test/data/gplv2/infert.csv create mode 100644 test/data/topics.csv diff --git a/docs/samples/Microsoft.ML.Samples/ConvertCatalogTransformers.cs b/docs/samples/Microsoft.ML.Samples/ConvertCatalogTransformers.cs new file mode 100644 index 0000000000..cdc63f4ea0 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/ConvertCatalogTransformers.cs @@ -0,0 +1,106 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Api; +using Microsoft.ML.Data; +using System; +using System.Linq; +using System.Collections.Generic; +using Microsoft.ML.Transforms; + +namespace Microsoft.ML.Samples +{ + public class ConvertCatalogTransformers + { + public static void KeyToValue_Term() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var ml = new MLContext(seed: 1, conc: 1); + + // Get a small dataset as an IEnumerable. 
+ IEnumerable data = SamplesUtils.DatasetUtils.GetTopicsData(); + var trainData = ml.CreateStreamingDataView(data); + + // Preview of the data. + // Review ReviewReverse, Label + // "animals birds cats dogs fish horse", "radiation galaxy universe duck", 1 + // "horse birds house fish duck cats", "space galaxy universe radiation", 0 + // "car truck driver bus pickup", "bus pickup", 1 + // "car truck driver bus pickup horse", "car truck", 0 + + // A pipeline to convert the terms of the review_reverse column in + // making use of default settings. + string defaultColumnName = "DefaultKeys"; + // REVIEW create through the catalog extension + var default_pipeline = new WordTokenizer(ml, "ReviewReverse", "ReviewReverse") + .Append(new TermEstimator(ml, "ReviewReverse" , defaultColumnName)); + + // Another pipeline, that customizes the advanced settings of the FeaturizeText transformer. + string customizedColumnName = "CustomizedKeys"; + var customized_pipeline = new WordTokenizer(ml, "ReviewReverse", "ReviewReverse") + .Append(new TermEstimator(ml, "ReviewReverse", customizedColumnName, maxNumTerms: 3, sort:TermTransform.SortOrder.Value)); + + // The transformed data. + var transformedData_default = default_pipeline.Fit(trainData).Transform(trainData); + var transformedData_customized = customized_pipeline.Fit(trainData).Transform(trainData); + + // small helper to print the text inside the columns, in the console. + Action[]> printHelper = (columnName, column) => + { + Console.WriteLine($"{columnName} column obtained post-transformation."); + foreach (var row in column) + { + foreach (var value in row.Values) + Console.Write($"{value} "); + Console.WriteLine(""); + } + + Console.WriteLine("==================================================="); + }; + + // Preview of the TextFeatures column obtained after processing the input. 
+ var defaultColumn = transformedData_default.GetColumn>(ml, defaultColumnName).ToArray(); + printHelper(defaultColumnName, defaultColumn); + + // DefaultKeys column obtained post-transformation + // 8 9 3 1 + // 8 9 3 1 + // 8 9 3 1 + // 8 9 3 1 + + // Preview of the TextFeatures column obtained after processing the input. + var customizedColumn = transformedData_customized.GetColumn>(ml, customizedColumnName).ToArray(); + printHelper(customizedColumnName, customizedColumn); + + // CustomizedKeys column obtained post-transformation. + // 0 1 3 2 + // 0 1 3 2 + // 0 1 3 2 + // 0 1 3 2 + + // retrieve the original values, by appending the KeyToValue etimator to the existing pipelines + var pipeline = default_pipeline.Append(new KeyToValueEstimator(ml, defaultColumnName)); + + // The transformed data. + transformedData_default = pipeline.Fit(trainData).Transform(trainData); + + // Preview of the TextFeatures column obtained after processing the input. + var originalColumnBack = transformedData_default.GetColumn>>(ml, defaultColumnName).ToArray(); + + foreach (var row in originalColumnBack) + { + foreach (var value in row.Values) + Console.Write($"{value} "); + Console.WriteLine(""); + } + + // car truck universe radiation + // car truck universe radiation + // car truck universe radiation + // car truck universe radiation + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/DynamicTransformers.cs b/docs/samples/Microsoft.ML.Samples/DynamicTransformers.cs deleted file mode 100644 index 128abf630c..0000000000 --- a/docs/samples/Microsoft.ML.Samples/DynamicTransformers.cs +++ /dev/null @@ -1,71 +0,0 @@ -using Microsoft.ML.Runtime.Data; -using Microsoft.ML.Runtime.Api; -using Microsoft.ML.Data; -using System; -using System.Linq; -using System.Collections.Generic; -using Microsoft.ML.StaticPipe; - -namespace Microsoft.ML.Samples -{ - public class DynamicTransformers - { - - public static void KeyToValue() - { - - } - - public static void Concat() - { - - } - - public 
static void Term() - { - - } - - public static void TextTransform() - { - // Create a new environment for ML.NET operations. It can be used for exception tracking and logging, - // as well as the source of randomness. - var env = new LocalEnvironment(); - - IEnumerable data = SamplesUtils.DatasetUtils.GetSentimentData(); - - // Preview of the data. - // Sentiment SentimentText - // true This is the best game I've ever played. - // false ==RUDE== Dude, you are rude upload that picture back, or else. - // true Until the next game comes out, this game is undisputedly the best Xbox game of all time - - var trainData = env.CreateStreamingDataView(data); - - var learningPipeline = new TextTransform(env, "SentimentText", "TextFeatures"); - - var transformedData = learningPipeline.Fit(trainData).Transform(trainData); - - var textFeaturesColumn = transformedData.GetColumn>(env, "TextFeatures").ToArray(); - - // Preview of the transformedData. - Console.WriteLine("TextFeatures column obtained post-transformation."); - foreach (var featureRow in textFeaturesColumn) - { - foreach (var value in featureRow.Values) - Console.Write($"{value} "); - Console.WriteLine(""); - } - - //Transformed data - // 0.2581989 0.2581989 0.2581989 0.2581989 0.5163978 0.2581989 0.2581989 0.2581989 0.2581989 0.2581989 0.2581989 0.2581989 0.7071068 0.7071068 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 - // 0.2581989 0.2581989 0.2581989 0.2581989 0.5163978 0.2581989 0.2581989 0.2581989 0.2581989 0.2581989 0.2581989 0.2581989 0.7071068 0.7071068 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 - // 0 0.1230915 0.1230915 0.1230915 0.1230915 0.246183 0.246183 0.246183 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.1230915 
0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.3692745 0.246183 0.246183 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.246183 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.2886751 0 0 0 0 0 0 0.2886751 0.5773503 0.2886751 0.2886751 0.2886751 0.2886751 0.2886751 0.2886751 - } - - public static void MinMaxNormalizer() - { - - } - } -} diff --git a/docs/samples/Microsoft.ML.Samples/Program.cs b/docs/samples/Microsoft.ML.Samples/Program.cs index 54a87cee09..7881a11bbc 100644 --- a/docs/samples/Microsoft.ML.Samples/Program.cs +++ b/docs/samples/Microsoft.ML.Samples/Program.cs @@ -10,7 +10,7 @@ static void Main(string[] args) { //Trainers.SdcaRegression(); //Transformers.ConcatEstimator(); - DynamicTransformers.TextTransform(); + ConvertCatalogTransformers.KeyToValue_Term(); } } } diff --git a/docs/samples/Microsoft.ML.Samples/TextCatalogTransformers.cs b/docs/samples/Microsoft.ML.Samples/TextCatalogTransformers.cs new file mode 100644 index 0000000000..7e5498cd54 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/TextCatalogTransformers.cs @@ -0,0 +1,97 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Api; +using Microsoft.ML.Data; +using System; +using System.Linq; +using System.Collections.Generic; + +namespace Microsoft.ML.Samples +{ + public class TextCatalogTransformers + { + + + + public static void Concat() + { + + } + + public static void TextTransform() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var ml = new MLContext(seed: 1, conc: 1); + + // Get a small dataset as an IEnumerable. 
+ IEnumerable data = SamplesUtils.DatasetUtils.GetSentimentData(); + var trainData = ml.CreateStreamingDataView(data); + + // Preview of the data. + // Sentiment SentimentText + // true Best game I've ever played. + // false ==RUDE== Dude, 2. + // true Until the next game, this is the best Xbox game! + + // A pipeline for featurization of the "SentimentText" column, and placing the output in a new column named "TextFeatures" + // making use of default settings. + string defaultColumnName = "DefaultTextFeatures"; + var default_pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", defaultColumnName); + + // Another pipeline, that customizes the advanced settings of the FeaturizeText transformer. + string customizedColumnName = "CustomizedTextFeatures"; + var customized_pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", customizedColumnName, s=> + { + s.KeepPunctuations = false; + s.KeepNumbers = false; + s.OutputTokens = true; + s.TextLanguage = Runtime.Data.TextTransform.Language.English; // supports English, French, German, Dutch, Italian, Spanish, Japanese + }); + + // The transformed data. + var transformedData_default = default_pipeline.Fit(trainData).Transform(trainData); + var transformedData_customized = customized_pipeline.Fit(trainData).Transform(trainData); + + // small helper to print the text inside the columns, in the console. + Action[]> printHelper = (columnName, column) => + { + Console.WriteLine($"{columnName} column obtained post-transformation."); + foreach (var featureRow in column) + { + foreach (var value in featureRow.Values) + Console.Write($"{value} "); + Console.WriteLine(""); + } + + Console.WriteLine("==================================================="); + }; + + // Preview of the TextFeatures column obtained after processing the input. 
+ var defaultColumn = transformedData_default.GetColumn>(ml, defaultColumnName).ToArray(); + printHelper(defaultColumnName, defaultColumn); + + // Transformed data REVIEW: why are the first two lines identical? Log a bug. + // 0.2581989 0.2581989 0.2581989 0.2581989 0.5163978 0.2581989 0.2581989 0.2581989 0.2581989 0.2581989 0.2581989 0.2581989 0.7071068 0.7071068 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 + // 0.2581989 0.2581989 0.2581989 0.2581989 0.5163978 0.2581989 0.2581989 0.2581989 0.2581989 0.2581989 0.2581989 0.2581989 0.7071068 0.7071068 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 + // 0 0.1230915 0.1230915 0.1230915 0.1230915 0.246183 0.246183 0.246183 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.3692745 0.246183 0.246183 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.246183 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.2886751 0 0 0 0 0 0 0.2886751 0.5773503 0.2886751 0.2886751 0.2886751 0.2886751 0.2886751 0.2886751 + + // Preview of the TextFeatures column obtained after processing the input. 
+ var customizedColumn = transformedData_customized.GetColumn>(ml, customizedColumnName).ToArray(); + printHelper(customizedColumnName, customizedColumn); + + // Transformed data + // 0.25 0.25 0.25 0.25 0.5 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.7071068 0.7071068 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 + // 0.25 0.25 0.25 0.25 0.5 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.7071068 0.7071068 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 + // 0 0.125 0.125 0.125 0.125 0.25 0.25 0.25 0.125 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.125 0.125 0.125 0.125 0.125 0.125 0.375 0.25 0.25 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.25 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.2672612 0.5345225 0 0 0 0 0 0.2672612 0.5345225 0.2672612 0.2672612 0.2672612 0.2672612 + } + + public static void MinMaxNormalizer() + { + + } + } +} diff --git a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs index 9e36c4845c..c5edebd7a7 100644 --- a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs @@ -54,10 +54,28 @@ public static IEnumerable GetSentimentData() { var data = new List(); data.Add(new SampleSentimentData { Sentiment = true, SentimentText = "Best game I've ever played." }); - data.Add(new SampleSentimentData { Sentiment = false, SentimentText = "==RUDE== Dude" }); + data.Add(new SampleSentimentData { Sentiment = false, SentimentText = "==RUDE== Dude, 2" }); data.Add(new SampleSentimentData { Sentiment = true, SentimentText = "Until the next game, this is the best Xbox game!" 
}); return data; } + + public class SampleTopicsData + { + public string Review { get; set; } + public string ReviewReverse { get; set; } + public bool Label { get; set; } + } + + public static IEnumerable GetTopicsData() + { + var data = new List(); + data.Add(new SampleTopicsData { Review = "animals birds cats dogs fish horse", ReviewReverse = "radiation galaxy universe duck", Label = true }); + data.Add(new SampleTopicsData { Review = "horse birds house fish duck cats", ReviewReverse = "space galaxy universe radiation", Label = false }); + data.Add(new SampleTopicsData { Review = "car truck driver bus pickup", ReviewReverse = "bus pickup", Label = true}); + data.Add(new SampleTopicsData { Review = "car truck driver bus pickup horse", ReviewReverse = "car truck", Label = false }); + + return data; + } } } diff --git a/src/Microsoft.ML.Transforms/TextTransformCatalog.cs b/src/Microsoft.ML.Transforms/TextTransformCatalog.cs index 11557c82c3..b06a3faa73 100644 --- a/src/Microsoft.ML.Transforms/TextTransformCatalog.cs +++ b/src/Microsoft.ML.Transforms/TextTransformCatalog.cs @@ -18,6 +18,8 @@ public static class TextTransformCatalog /// The input column /// The output column /// Advanced transform settings + /// + /// public static TextTransform FeaturizeText(this TransformsCatalog.TextTransforms catalog, string inputColumn, string outputColumn = null, Action advancedSettings = null) diff --git a/test/data/gplv2/COPYING.txt b/test/data/gplv2/COPYING.txt new file mode 100644 index 0000000000..c4aa571b70 --- /dev/null +++ b/test/data/gplv2/COPYING.txt @@ -0,0 +1,345 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. 
By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. 
If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. 
Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. \ No newline at end of file diff --git a/test/data/gplv2/airquality.csv b/test/data/gplv2/airquality.csv new file mode 100644 index 0000000000..5448cc6a05 --- /dev/null +++ b/test/data/gplv2/airquality.csv @@ -0,0 +1,154 @@ +"","Ozone","Solar_R","Wind","Temp","Month","Day" +"1",41,190,7.4,67,5,1 +"2",36,118,8,72,5,2 +"3",12,149,12.6,74,5,3 +"4",18,313,11.5,62,5,4 +"5",NA,NA,14.3,56,5,5 +"6",28,NA,14.9,66,5,6 +"7",23,299,8.6,65,5,7 +"8",19,99,13.8,59,5,8 +"9",8,19,20.1,61,5,9 +"10",NA,194,8.6,69,5,10 +"11",7,NA,6.9,74,5,11 +"12",16,256,9.7,69,5,12 +"13",11,290,9.2,66,5,13 +"14",14,274,10.9,68,5,14 +"15",18,65,13.2,58,5,15 +"16",14,334,11.5,64,5,16 +"17",34,307,12,66,5,17 +"18",6,78,18.4,57,5,18 +"19",30,322,11.5,68,5,19 +"20",11,44,9.7,62,5,20 +"21",1,8,9.7,59,5,21 +"22",11,320,16.6,73,5,22 +"23",4,25,9.7,61,5,23 +"24",32,92,12,61,5,24 +"25",NA,66,16.6,57,5,25 +"26",NA,266,14.9,58,5,26 +"27",NA,NA,8,57,5,27 +"28",23,13,12,67,5,28 +"29",45,252,14.9,81,5,29 +"30",115,223,5.7,79,5,30 +"31",37,279,7.4,76,5,31 +"32",NA,286,8.6,78,6,1 +"33",NA,287,9.7,74,6,2 +"34",NA,242,16.1,67,6,3 
+"35",NA,186,9.2,84,6,4 +"36",NA,220,8.6,85,6,5 +"37",NA,264,14.3,79,6,6 +"38",29,127,9.7,82,6,7 +"39",NA,273,6.9,87,6,8 +"40",71,291,13.8,90,6,9 +"41",39,323,11.5,87,6,10 +"42",NA,259,10.9,93,6,11 +"43",NA,250,9.2,92,6,12 +"44",23,148,8,82,6,13 +"45",NA,332,13.8,80,6,14 +"46",NA,322,11.5,79,6,15 +"47",21,191,14.9,77,6,16 +"48",37,284,20.7,72,6,17 +"49",20,37,9.2,65,6,18 +"50",12,120,11.5,73,6,19 +"51",13,137,10.3,76,6,20 +"52",NA,150,6.3,77,6,21 +"53",NA,59,1.7,76,6,22 +"54",NA,91,4.6,76,6,23 +"55",NA,250,6.3,76,6,24 +"56",NA,135,8,75,6,25 +"57",NA,127,8,78,6,26 +"58",NA,47,10.3,73,6,27 +"59",NA,98,11.5,80,6,28 +"60",NA,31,14.9,77,6,29 +"61",NA,138,8,83,6,30 +"62",135,269,4.1,84,7,1 +"63",49,248,9.2,85,7,2 +"64",32,236,9.2,81,7,3 +"65",NA,101,10.9,84,7,4 +"66",64,175,4.6,83,7,5 +"67",40,314,10.9,83,7,6 +"68",77,276,5.1,88,7,7 +"69",97,267,6.3,92,7,8 +"70",97,272,5.7,92,7,9 +"71",85,175,7.4,89,7,10 +"72",NA,139,8.6,82,7,11 +"73",10,264,14.3,73,7,12 +"74",27,175,14.9,81,7,13 +"75",NA,291,14.9,91,7,14 +"76",7,48,14.3,80,7,15 +"77",48,260,6.9,81,7,16 +"78",35,274,10.3,82,7,17 +"79",61,285,6.3,84,7,18 +"80",79,187,5.1,87,7,19 +"81",63,220,11.5,85,7,20 +"82",16,7,6.9,74,7,21 +"83",NA,258,9.7,81,7,22 +"84",NA,295,11.5,82,7,23 +"85",80,294,8.6,86,7,24 +"86",108,223,8,85,7,25 +"87",20,81,8.6,82,7,26 +"88",52,82,12,86,7,27 +"89",82,213,7.4,88,7,28 +"90",50,275,7.4,86,7,29 +"91",64,253,7.4,83,7,30 +"92",59,254,9.2,81,7,31 +"93",39,83,6.9,81,8,1 +"94",9,24,13.8,81,8,2 +"95",16,77,7.4,82,8,3 +"96",78,NA,6.9,86,8,4 +"97",35,NA,7.4,85,8,5 +"98",66,NA,4.6,87,8,6 +"99",122,255,4,89,8,7 +"100",89,229,10.3,90,8,8 +"101",110,207,8,90,8,9 +"102",NA,222,8.6,92,8,10 +"103",NA,137,11.5,86,8,11 +"104",44,192,11.5,86,8,12 +"105",28,273,11.5,82,8,13 +"106",65,157,9.7,80,8,14 +"107",NA,64,11.5,79,8,15 +"108",22,71,10.3,77,8,16 +"109",59,51,6.3,79,8,17 +"110",23,115,7.4,76,8,18 +"111",31,244,10.9,78,8,19 +"112",44,190,10.3,78,8,20 +"113",21,259,15.5,77,8,21 +"114",9,36,14.3,72,8,22 
+"115",NA,255,12.6,75,8,23 +"116",45,212,9.7,79,8,24 +"117",168,238,3.4,81,8,25 +"118",73,215,8,86,8,26 +"119",NA,153,5.7,88,8,27 +"120",76,203,9.7,97,8,28 +"121",118,225,2.3,94,8,29 +"122",84,237,6.3,96,8,30 +"123",85,188,6.3,94,8,31 +"124",96,167,6.9,91,9,1 +"125",78,197,5.1,92,9,2 +"126",73,183,2.8,93,9,3 +"127",91,189,4.6,93,9,4 +"128",47,95,7.4,87,9,5 +"129",32,92,15.5,84,9,6 +"130",20,252,10.9,80,9,7 +"131",23,220,10.3,78,9,8 +"132",21,230,10.9,75,9,9 +"133",24,259,9.7,73,9,10 +"134",44,236,14.9,81,9,11 +"135",21,259,15.5,76,9,12 +"136",28,238,6.3,77,9,13 +"137",9,24,10.9,71,9,14 +"138",13,112,11.5,71,9,15 +"139",46,237,6.9,78,9,16 +"140",18,224,13.8,67,9,17 +"141",13,27,10.3,76,9,18 +"142",24,238,10.3,68,9,19 +"143",16,201,8,82,9,20 +"144",13,238,12.6,64,9,21 +"145",23,14,9.2,71,9,22 +"146",36,139,10.3,81,9,23 +"147",7,49,10.3,69,9,24 +"148",14,20,16.6,63,9,25 +"149",30,193,6.9,70,9,26 +"150",NA,145,13.2,77,9,27 +"151",14,191,14.3,75,9,28 +"152",18,131,8,76,9,29 +"153",20,223,11.5,68,9,30 \ No newline at end of file diff --git a/test/data/gplv2/infert.csv b/test/data/gplv2/infert.csv new file mode 100644 index 0000000000..4f4fc2ec51 --- /dev/null +++ b/test/data/gplv2/infert.csv @@ -0,0 +1,249 @@ +"row_num","education","age","parity","induced","case","spontaneous","stratum","pooled.stratum" +"1","0-5yrs",26,6,1,1,2,1,3 +"2","0-5yrs",42,1,1,1,0,2,1 +"3","0-5yrs",39,6,2,1,0,3,4 +"4","0-5yrs",34,4,2,1,0,4,2 +"5","6-11yrs",35,3,1,1,1,5,32 +"6","6-11yrs",36,4,2,1,1,6,36 +"7","6-11yrs",23,1,0,1,0,7,6 +"8","6-11yrs",32,2,0,1,0,8,22 +"9","6-11yrs",21,1,0,1,1,9,5 +"10","6-11yrs",28,2,0,1,0,10,19 +"11","6-11yrs",29,2,1,1,0,11,20 +"12","6-11yrs",37,4,2,1,1,12,37 +"13","6-11yrs",31,1,1,1,0,13,9 +"14","6-11yrs",29,3,2,1,0,14,29 +"15","6-11yrs",31,2,1,1,1,15,21 +"16","6-11yrs",27,2,2,1,0,16,18 +"17","6-11yrs",30,5,2,1,1,17,38 +"18","6-11yrs",26,1,0,1,1,18,7 +"19","6-11yrs",25,3,2,1,1,19,28 +"20","6-11yrs",44,1,0,1,1,20,17 +"21","6-11yrs",40,1,0,1,1,21,14 
+"22","6-11yrs",35,2,2,1,0,22,24 +"23","6-11yrs",28,2,0,1,2,23,19 +"24","6-11yrs",36,1,0,1,1,24,12 +"25","6-11yrs",27,2,1,1,1,25,18 +"26","6-11yrs",40,2,0,1,2,26,27 +"27","6-11yrs",38,2,0,1,2,27,26 +"28","6-11yrs",34,3,0,1,2,28,31 +"29","6-11yrs",28,4,1,1,2,29,34 +"30","6-11yrs",30,4,2,1,0,30,35 +"31","6-11yrs",32,1,0,1,1,31,10 +"32","6-11yrs",34,2,1,1,0,32,23 +"33","6-11yrs",42,1,1,1,0,33,16 +"34","6-11yrs",32,2,0,1,2,34,22 +"35","6-11yrs",39,1,1,1,0,35,13 +"36","6-11yrs",35,2,0,1,2,36,24 +"37","6-11yrs",36,1,0,1,1,37,12 +"38","6-11yrs",34,3,1,1,2,38,31 +"39","6-11yrs",30,3,0,1,0,39,30 +"40","6-11yrs",28,1,0,1,1,40,8 +"41","6-11yrs",39,3,0,1,2,41,33 +"42","6-11yrs",35,1,0,1,0,42,11 +"43","6-11yrs",41,1,0,1,0,43,15 +"44","6-11yrs",37,2,1,1,1,44,25 +"45","12+ yrs",30,1,0,1,0,45,44 +"46","12+ yrs",37,1,1,1,0,46,48 +"47","12+ yrs",28,2,0,1,2,47,51 +"48","12+ yrs",27,4,2,1,0,48,61 +"49","12+ yrs",26,2,2,1,0,49,49 +"50","12+ yrs",38,3,0,1,2,50,60 +"51","12+ yrs",24,3,1,1,2,51,56 +"52","12+ yrs",36,5,1,1,2,52,62 +"53","12+ yrs",27,3,1,1,1,53,57 +"54","12+ yrs",28,1,0,1,1,54,42 +"55","12+ yrs",29,2,0,1,2,55,52 +"56","12+ yrs",36,2,0,1,2,56,55 +"57","12+ yrs",28,2,1,1,0,57,51 +"58","12+ yrs",28,2,0,1,2,58,51 +"59","12+ yrs",28,1,0,1,1,59,42 +"60","12+ yrs",27,2,0,1,2,60,50 +"61","12+ yrs",35,2,0,1,2,61,54 +"62","12+ yrs",25,1,0,1,1,62,41 +"63","12+ yrs",34,1,0,1,1,63,47 +"64","12+ yrs",31,2,0,1,2,64,53 +"65","12+ yrs",26,2,1,1,0,65,49 +"66","12+ yrs",32,1,0,1,1,66,46 +"67","12+ yrs",21,1,0,1,1,67,39 +"68","12+ yrs",28,3,1,1,2,68,58 +"69","12+ yrs",37,3,0,1,2,69,59 +"70","12+ yrs",25,1,1,1,0,70,41 +"71","12+ yrs",32,1,1,1,0,71,46 +"72","12+ yrs",25,1,0,1,1,72,41 +"73","12+ yrs",31,1,0,1,1,73,45 +"74","12+ yrs",38,6,0,1,2,74,63 +"75","12+ yrs",26,2,0,1,2,75,49 +"76","12+ yrs",31,1,0,1,1,76,45 +"77","12+ yrs",31,2,0,1,1,77,53 +"78","12+ yrs",25,1,1,1,0,78,41 +"79","12+ yrs",31,1,0,1,1,79,45 +"80","12+ yrs",34,1,0,1,1,80,47 +"81","12+ yrs",35,2,2,1,0,81,54 +"82","12+ 
yrs",29,1,0,1,1,82,43 +"83","12+ yrs",23,1,0,1,1,83,40 +"84","0-5yrs",26,6,2,0,0,1,3 +"85","0-5yrs",42,1,0,0,0,2,1 +"86","0-5yrs",39,6,2,0,0,3,4 +"87","0-5yrs",34,4,0,0,1,4,2 +"88","6-11yrs",35,3,2,0,0,5,32 +"89","6-11yrs",36,4,1,0,1,6,36 +"90","6-11yrs",23,1,0,0,0,7,6 +"91","6-11yrs",32,2,2,0,0,8,22 +"92","6-11yrs",21,1,0,0,1,9,5 +"93","6-11yrs",28,2,0,0,1,10,19 +"94","6-11yrs",29,2,0,0,0,11,20 +"95","6-11yrs",37,4,1,0,1,12,37 +"96","6-11yrs",31,1,0,0,0,13,9 +"97","6-11yrs",29,3,0,0,1,14,29 +"98","6-11yrs",31,2,1,0,0,15,21 +"99","6-11yrs",27,2,1,0,0,16,18 +"100","6-11yrs",30,5,0,0,2,17,38 +"101","6-11yrs",26,1,0,0,0,18,7 +"102","6-11yrs",25,3,0,0,1,19,28 +"103","6-11yrs",44,1,0,0,0,20,17 +"104","6-11yrs",40,1,0,0,0,21,14 +"105","6-11yrs",35,2,0,0,0,22,24 +"106","6-11yrs",28,2,0,0,0,23,19 +"107","6-11yrs",36,1,0,0,0,24,12 +"108","6-11yrs",27,2,0,0,1,25,18 +"109","6-11yrs",40,2,0,0,0,26,27 +"110","6-11yrs",38,2,0,0,0,27,26 +"111","6-11yrs",34,3,0,0,0,28,31 +"112","6-11yrs",28,4,0,0,2,29,34 +"113","6-11yrs",30,4,1,0,1,30,35 +"114","6-11yrs",32,1,0,0,0,31,10 +"115","6-11yrs",34,2,1,0,0,32,23 +"116","6-11yrs",42,1,1,0,0,33,16 +"117","6-11yrs",32,2,0,0,0,34,22 +"118","6-11yrs",39,1,0,0,0,35,13 +"119","6-11yrs",35,2,0,0,0,36,24 +"120","6-11yrs",36,1,0,0,0,37,12 +"121","6-11yrs",34,3,2,0,0,38,31 +"122","6-11yrs",30,3,0,0,2,39,30 +"123","6-11yrs",28,1,1,0,0,40,8 +"124","6-11yrs",39,3,1,0,0,41,33 +"125","6-11yrs",35,1,0,0,0,42,11 +"126","6-11yrs",41,1,0,0,0,43,15 +"127","6-11yrs",37,2,0,0,0,44,25 +"128","12+ yrs",30,1,1,0,0,45,44 +"129","12+ yrs",37,1,0,0,0,46,48 +"130","12+ yrs",28,2,1,0,0,47,51 +"131","12+ yrs",27,4,2,0,1,48,61 +"132","12+ yrs",26,2,1,0,0,49,49 +"133","12+ yrs",38,3,1,0,0,50,60 +"134","12+ yrs",24,3,2,0,1,51,56 +"135","12+ yrs",36,5,1,0,1,52,62 +"136","12+ yrs",27,3,1,0,1,53,57 +"137","12+ yrs",28,1,1,0,0,54,42 +"138","12+ yrs",29,2,1,0,0,55,52 +"139","12+ yrs",36,2,1,0,0,56,55 +"140","12+ yrs",28,2,1,0,1,57,51 +"141","12+ yrs",28,2,2,0,0,58,51 
+"142","12+ yrs",28,1,1,0,0,59,42 +"143","12+ yrs",27,2,1,0,0,60,50 +"144","12+ yrs",35,2,2,0,0,61,54 +"145","12+ yrs",25,1,1,0,0,62,41 +"146","12+ yrs",34,1,0,0,0,63,47 +"147","12+ yrs",31,2,0,0,0,64,53 +"148","12+ yrs",26,2,0,0,1,65,49 +"149","12+ yrs",32,1,0,0,0,66,46 +"150","12+ yrs",21,1,0,0,1,67,39 +"151","12+ yrs",28,3,2,0,0,68,58 +"152","12+ yrs",37,3,1,0,1,69,59 +"153","12+ yrs",25,1,0,0,0,70,41 +"154","12+ yrs",32,1,1,0,0,71,46 +"155","12+ yrs",25,1,0,0,0,72,41 +"156","12+ yrs",31,1,0,0,1,73,45 +"157","12+ yrs",26,2,0,0,2,75,49 +"158","12+ yrs",31,1,0,0,0,76,45 +"159","12+ yrs",31,2,2,0,0,77,53 +"160","12+ yrs",25,1,0,0,0,78,41 +"161","12+ yrs",31,1,0,0,0,79,45 +"162","12+ yrs",34,1,0,0,0,80,47 +"163","12+ yrs",35,2,0,0,0,81,54 +"164","12+ yrs",29,1,0,0,1,82,43 +"165","12+ yrs",23,1,0,0,1,83,40 +"166","0-5yrs",26,6,2,0,0,1,3 +"167","0-5yrs",42,1,0,0,0,2,1 +"168","0-5yrs",39,6,2,0,0,3,4 +"169","0-5yrs",34,4,0,0,2,4,2 +"170","6-11yrs",35,3,0,0,0,5,32 +"171","6-11yrs",36,4,0,0,2,6,36 +"172","6-11yrs",23,1,0,0,0,7,6 +"173","6-11yrs",32,2,0,0,1,8,22 +"174","6-11yrs",21,1,1,0,0,9,5 +"175","6-11yrs",28,2,0,0,1,10,19 +"176","6-11yrs",29,2,0,0,1,11,20 +"177","6-11yrs",37,4,0,0,1,12,37 +"178","6-11yrs",31,1,0,0,0,13,9 +"179","6-11yrs",29,3,0,0,2,14,29 +"180","6-11yrs",31,2,1,0,0,15,21 +"181","6-11yrs",27,2,0,0,0,16,18 +"182","6-11yrs",30,5,1,0,2,17,38 +"183","6-11yrs",26,1,1,0,0,18,7 +"184","6-11yrs",25,3,1,0,1,19,28 +"185","6-11yrs",44,1,1,0,0,20,17 +"186","6-11yrs",40,1,0,0,0,21,14 +"187","6-11yrs",35,2,0,0,0,22,24 +"188","6-11yrs",28,2,2,0,0,23,19 +"189","6-11yrs",36,1,0,0,1,24,12 +"190","6-11yrs",27,2,0,0,2,25,18 +"191","6-11yrs",40,2,0,0,0,26,27 +"192","6-11yrs",38,2,0,0,0,27,26 +"193","6-11yrs",34,3,0,0,0,28,31 +"194","6-11yrs",28,4,2,0,1,29,34 +"195","6-11yrs",30,4,1,0,1,30,35 +"196","6-11yrs",32,1,0,0,0,31,10 +"197","6-11yrs",34,2,0,0,0,32,23 +"198","6-11yrs",42,1,0,0,0,33,16 +"199","6-11yrs",32,2,2,0,0,34,22 +"200","6-11yrs",39,1,0,0,0,35,13 
+"201","6-11yrs",35,2,0,0,0,36,24 +"202","6-11yrs",36,1,0,0,0,37,12 +"203","6-11yrs",34,3,2,0,0,38,31 +"204","6-11yrs",30,3,0,0,1,39,30 +"205","6-11yrs",28,1,0,0,0,40,8 +"206","6-11yrs",39,3,0,0,0,41,33 +"207","6-11yrs",35,1,0,0,0,42,11 +"208","6-11yrs",41,1,0,0,0,43,15 +"209","6-11yrs",37,2,0,0,0,44,25 +"210","12+ yrs",30,1,0,0,0,45,44 +"211","12+ yrs",37,1,0,0,1,46,48 +"212","12+ yrs",28,2,1,0,0,47,51 +"213","12+ yrs",27,4,2,0,0,48,61 +"214","12+ yrs",26,2,1,0,0,49,49 +"215","12+ yrs",38,3,1,0,0,50,60 +"216","12+ yrs",24,3,2,0,0,51,56 +"217","12+ yrs",36,5,2,0,1,52,62 +"218","12+ yrs",27,3,2,0,0,53,57 +"219","12+ yrs",28,1,0,0,1,54,42 +"220","12+ yrs",29,2,1,0,1,55,52 +"221","12+ yrs",36,2,0,0,1,56,55 +"222","12+ yrs",28,2,2,0,0,57,51 +"223","12+ yrs",28,2,1,0,0,58,51 +"224","12+ yrs",28,1,0,0,0,59,42 +"225","12+ yrs",27,2,1,0,0,60,50 +"226","12+ yrs",35,2,1,0,0,61,54 +"227","12+ yrs",25,1,1,0,0,62,41 +"228","12+ yrs",34,1,0,0,0,63,47 +"229","12+ yrs",31,2,1,0,0,64,53 +"230","12+ yrs",26,2,0,0,2,65,49 +"231","12+ yrs",32,1,1,0,0,66,46 +"232","12+ yrs",21,1,0,0,0,67,39 +"233","12+ yrs",28,3,2,0,0,68,58 +"234","12+ yrs",37,3,0,0,2,69,59 +"235","12+ yrs",25,1,1,0,0,70,41 +"236","12+ yrs",32,1,0,0,0,71,46 +"237","12+ yrs",25,1,1,0,0,72,41 +"238","12+ yrs",31,1,0,0,0,73,45 +"239","12+ yrs",38,6,0,0,2,74,63 +"240","12+ yrs",26,2,1,0,1,75,49 +"241","12+ yrs",31,1,1,0,0,76,45 +"242","12+ yrs",31,2,0,0,1,77,53 +"243","12+ yrs",25,1,0,0,1,78,41 +"244","12+ yrs",31,1,0,0,1,79,45 +"245","12+ yrs",34,1,0,0,0,80,47 +"246","12+ yrs",35,2,2,0,0,81,54 +"247","12+ yrs",29,1,0,0,1,82,43 +"248","12+ yrs",23,1,0,0,1,83,40 \ No newline at end of file diff --git a/test/data/topics.csv b/test/data/topics.csv new file mode 100644 index 0000000000..014612d2e8 --- /dev/null +++ b/test/data/topics.csv @@ -0,0 +1,9 @@ +review,review_reverse,label +"animals birds cats dogs fish horse","radiation galaxy universe duck",1 +"horse birds house fish duck cats","space galaxy universe radiation",0 
+"car truck driver bus pickup","bus pickup",1 +"car truck driver bus pickup horse","car truck",0 +"car truck","car truck driver bus pickup horse",1 +"bus pickup","car truck driver bus pickup",1 +"space galaxy universe radiation","horse birds house fish duck cats",1 +"radiation galaxy universe duck","animals birds cats dogs fish horse",1 \ No newline at end of file From 161c0d33bbd3a5966d5c217d54a03fc46b37131c Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Fri, 19 Oct 2018 13:35:04 -0700 Subject: [PATCH 03/13] Adding tests for the NormalizerCatalog and the TextTransformCatalog. Adding samples for the ConcatEstimator, and KeyToValue, Term that will need to get referenced from the respective catalogs when they happen. re-organized the samples based on static-dynamic. Renamed. --- .../Dynamic/ConcatTransform.cs | 66 ++++++++++++++ .../KeyToValue_Term.cs} | 61 ++++++------- .../Dynamic/MinMaxNormalizer.cs | 87 +++++++++++++++++++ .../TextTransform.cs} | 51 ++++------- docs/samples/Microsoft.ML.Samples/Program.cs | 6 +- .../ConcatEstimator.cs} | 8 +- .../{Trainers.cs => Static/SDCA.cs} | 4 +- .../Transforms/NormalizerCatalog.cs | 27 ++++++ .../SamplesDatasetUtils.cs | 30 +++++++ .../TextTransformCatalog.cs | 12 +++ 10 files changed, 280 insertions(+), 72 deletions(-) create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/ConcatTransform.cs rename docs/samples/Microsoft.ML.Samples/{ConvertCatalogTransformers.cs => Dynamic/KeyToValue_Term.cs} (74%) create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/MinMaxNormalizer.cs rename docs/samples/Microsoft.ML.Samples/{TextCatalogTransformers.cs => Dynamic/TextTransform.cs} (69%) rename docs/samples/Microsoft.ML.Samples/{Transformers.cs => Static/ConcatEstimator.cs} (95%) rename docs/samples/Microsoft.ML.Samples/{Trainers.cs => Static/SDCA.cs} (97%) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/ConcatTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/ConcatTransform.cs new file mode 100644 index 
0000000000..cf3d47d17c --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/ConcatTransform.cs @@ -0,0 +1,66 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + + // the alignment of the usings with the methods is intentional so they can display on the same level in the docs site. + using Microsoft.ML.Runtime.Data; + using Microsoft.ML.Runtime.Api; + using Microsoft.ML.Data; + using System; + using System.Linq; + using System.Collections.Generic; + +namespace Microsoft.ML.Samples.Dynamic +{ + public partial class TransformSamples + { + class SampleInfertDataWithFeatures + { + public VBuffer Features { get; set; } + } + + public static void ConcatTransform() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var ml = new MLContext(seed: 1, conc: 1); + + // Get a small dataset as an IEnumerable. + IEnumerable data = SamplesUtils.DatasetUtils.GetInfertData(); + var trainData = ml.CreateStreamingDataView(data); + + // Preview of the data. + // Age Case Education induced parity pooled.stratum row_num ... + // 26.0 1.0 0-5yrs 1.0 6.0 3.0 1.0 ... + // 42.0 1.0 0-5yrs 1.0 1.0 1.0 2.0 ... + // 39.0 1.0 0-5yrs 2.0 6.0 4.0 3.0 ... + // 34.0 1.0 0-5yrs 2.0 4.0 2.0 4.0 ... + // 35.0 1.0 6-11yrs 1.0 3.0 32.0 5.0 ... + + // A pipeline for concatenating the age, parity and induced columns together in the Features column + string outputColumnName = "Features"; + var pipeline = new ConcatEstimator(ml, outputColumnName, new[] { "Age", "Parity", "Induced"}); + + // The transformed data. 
+ var transformedData = pipeline.Fit(trainData).Transform(trainData); + + // Getting the data of the newly created column as an Array, and + var featuresColumn = transformedData.AsEnumerable(ml, reuseRowObject: false); + + Console.WriteLine($"{outputColumnName} column obtained post-transformation."); + foreach (var featureRow in featuresColumn) + { + foreach (var value in featureRow.Features.Values) + Console.Write($"{value} "); + Console.WriteLine(""); + } + + // Features + // 26 6 1 + // 42 1 1 + // 39 6 2 + // 34 4 2 + // 35 3 1 + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/ConvertCatalogTransformers.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs similarity index 74% rename from docs/samples/Microsoft.ML.Samples/ConvertCatalogTransformers.cs rename to docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs index cdc63f4ea0..bba0c977df 100644 --- a/docs/samples/Microsoft.ML.Samples/ConvertCatalogTransformers.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs @@ -2,17 +2,18 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -using Microsoft.ML.Runtime.Data; -using Microsoft.ML.Runtime.Api; -using Microsoft.ML.Data; -using System; -using System.Linq; -using System.Collections.Generic; -using Microsoft.ML.Transforms; - -namespace Microsoft.ML.Samples + // the alignment of the usings with the methods is intentional so they can display on the same level in the docs site. 
+ using Microsoft.ML.Data; + using Microsoft.ML.Runtime.Api; + using Microsoft.ML.Runtime.Data; + using Microsoft.ML.Transforms; + using System; + using System.Collections.Generic; + using System.Linq; + +namespace Microsoft.ML.Samples.Dynamic { - public class ConvertCatalogTransformers + public partial class TransformSamples { public static void KeyToValue_Term() { @@ -41,14 +42,14 @@ public static void KeyToValue_Term() // Another pipeline, that customizes the advanced settings of the FeaturizeText transformer. string customizedColumnName = "CustomizedKeys"; var customized_pipeline = new WordTokenizer(ml, "ReviewReverse", "ReviewReverse") - .Append(new TermEstimator(ml, "ReviewReverse", customizedColumnName, maxNumTerms: 3, sort:TermTransform.SortOrder.Value)); + .Append(new TermEstimator(ml, "ReviewReverse", customizedColumnName, maxNumTerms: 10, sort:TermTransform.SortOrder.Value)); // The transformed data. var transformedData_default = default_pipeline.Fit(trainData).Transform(trainData); var transformedData_customized = customized_pipeline.Fit(trainData).Transform(trainData); // small helper to print the text inside the columns, in the console. - Action[]> printHelper = (columnName, column) => + Action>> printHelper = (columnName, column) => { Console.WriteLine($"{columnName} column obtained post-transformation."); foreach (var row in column) @@ -62,33 +63,32 @@ public static void KeyToValue_Term() }; // Preview of the TextFeatures column obtained after processing the input. - var defaultColumn = transformedData_default.GetColumn>(ml, defaultColumnName).ToArray(); + var defaultColumn = transformedData_default.GetColumn>(ml, defaultColumnName); printHelper(defaultColumnName, defaultColumn); - // DefaultKeys column obtained post-transformation - // 8 9 3 1 - // 8 9 3 1 - // 8 9 3 1 + // DefaultKeys column obtained post-transformation. + // 1 2 3 4 + // 5 2 3 1 + // 6 7 3 1 // 8 9 3 1 - // Preview of the TextFeatures column obtained after processing the input. 
- var customizedColumn = transformedData_customized.GetColumn>(ml, customizedColumnName).ToArray(); + // Previewing the CustomizedKeys column obtained after processing the input. + var customizedColumn = transformedData_customized.GetColumn>(ml, customizedColumnName); printHelper(customizedColumnName, customizedColumn); // CustomizedKeys column obtained post-transformation. - // 0 1 3 2 - // 0 1 3 2 - // 0 1 3 2 - // 0 1 3 2 + // 6 4 9 3 + // 7 4 9 6 + // 1 5 9 6 + // 2 8 9 6 // retrieve the original values, by appending the KeyToValue etimator to the existing pipelines + // to convert the keys back to the strings var pipeline = default_pipeline.Append(new KeyToValueEstimator(ml, defaultColumnName)); - - // The transformed data. transformedData_default = pipeline.Fit(trainData).Transform(trainData); - // Preview of the TextFeatures column obtained after processing the input. - var originalColumnBack = transformedData_default.GetColumn>>(ml, defaultColumnName).ToArray(); + // Preview of the DefaultColumnName column obtained + var originalColumnBack = transformedData_default.GetColumn>>(ml, defaultColumnName); foreach (var row in originalColumnBack) { @@ -97,9 +97,10 @@ public static void KeyToValue_Term() Console.WriteLine(""); } - // car truck universe radiation - // car truck universe radiation - // car truck universe radiation + // DefaultColumnName column obtained post-transformation. + // radiation galaxy universe duck + // space galaxy universe radiation + // bus pickup universe radiation // car truck universe radiation } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/MinMaxNormalizer.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/MinMaxNormalizer.cs new file mode 100644 index 0000000000..372eec15c2 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/MinMaxNormalizer.cs @@ -0,0 +1,87 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+// See the LICENSE file in the project root for more information. + + // the alignment of the usings with the methods is intentional so they can display on the same level in the docs site. + using Microsoft.ML.Runtime.Data; + using Microsoft.ML.Runtime.Api; + using Microsoft.ML.Data; + using System; + using System.Collections.Generic; + +namespace Microsoft.ML.Samples.Dynamic +{ + public partial class TransformSamples + { + public static void MinMaxNormalizer() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var ml = new MLContext(seed: 1, conc: 1); + + // Get a small dataset as an IEnumerable. + IEnumerable data = SamplesUtils.DatasetUtils.GetInfertData(); + var trainData = ml.CreateStreamingDataView(data); + + // Preview of the data. + // Age Case Education Induced Parity PooledStratum RowNum ... + // 26 1 0-5yrs 1 6 3 1 ... + // 42 1 0-5yrs 1 1 1 2 ... + // 39 1 0-5yrs 2 6 4 3 ... + // 34 1 0-5yrs 2 4 2 4 ... + // 35 1 6-11yrs 1 3 32 5 ... + + // A pipeline for concatenating the age, parity and induced columns together in the Features column + var pipeline = ml.Transforms.Normalizer("Induced"); + // The transformed data. + var transformedData = pipeline.Fit(trainData).Transform(trainData); + // Getting the data of the newly created column as an Array, and + var normalizedColumn = transformedData.GetColumn(ml, "Induced"); + + // A small printing utility + Action> printHelper = (colName, column) => + { + Console.WriteLine($"{colName} column obtained post-transformation."); + foreach (var row in column) + Console.WriteLine($"{row} "); + }; + + printHelper("Induced", normalizedColumn); + // Induced + // 0.5 + // 0.5 + // 1 + // 1 + // 0.5 + + // Composing a different pipeline if we wanted to normalize more than one column at a time. 
+ // A pipeline for concatenating the age, parity and induced columns together in the new columns + // using log scale + var multiColPipeline = ml.Transforms.Normalizer(Normalizer.NormalizerMode.LogMeanVariance, new[] { ("Induced", "LogInduced"), ("Spontaneous", "LogSpontaneous") }); + // The transformed data. + var multiColtransformedData = multiColPipeline.Fit(trainData).Transform(trainData); + // Getting the data of the newly created column as an Array, and + var normalizedInduced = multiColtransformedData.GetColumn(ml, "LogInduced"); + var normalizedSpont = multiColtransformedData.GetColumn(ml, "LogSpontaneous"); + + printHelper("LogInduced", normalizedInduced); + + // LogInduced + // 0.2071445 + // 0.2071445 + // 0.889631 + // 0.889631 + // 0.2071445 + + printHelper("LogSpontaneous", normalizedSpont); + + // LogSpontaneous + // 0.8413026 + // 0 + // 0 + // 0 + // 0.1586974 + + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/TextCatalogTransformers.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs similarity index 69% rename from docs/samples/Microsoft.ML.Samples/TextCatalogTransformers.cs rename to docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs index 7e5498cd54..a5584e6187 100644 --- a/docs/samples/Microsoft.ML.Samples/TextCatalogTransformers.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs @@ -2,25 +2,17 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -using Microsoft.ML.Runtime.Data; -using Microsoft.ML.Runtime.Api; -using Microsoft.ML.Data; -using System; -using System.Linq; -using System.Collections.Generic; - -namespace Microsoft.ML.Samples + // the alignment of the usings with the methods is intentional so they can display on the same level in the docs site. 
+ using Microsoft.ML.Runtime.Data; + using Microsoft.ML.Runtime.Api; + using Microsoft.ML.Data; + using System; + using System.Collections.Generic; + +namespace Microsoft.ML.Samples.Dynamic { - public class TextCatalogTransformers + public partial class TransformSamples { - - - - public static void Concat() - { - - } - public static void TextTransform() { // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, @@ -44,7 +36,7 @@ public static void TextTransform() // Another pipeline, that customizes the advanced settings of the FeaturizeText transformer. string customizedColumnName = "CustomizedTextFeatures"; - var customized_pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", customizedColumnName, s=> + var customized_pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", customizedColumnName, s => { s.KeepPunctuations = false; s.KeepNumbers = false; @@ -57,7 +49,7 @@ public static void TextTransform() var transformedData_customized = customized_pipeline.Fit(trainData).Transform(trainData); // small helper to print the text inside the columns, in the console. - Action[]> printHelper = (columnName, column) => + Action>> printHelper = (columnName, column) => { Console.WriteLine($"{columnName} column obtained post-transformation."); foreach (var featureRow in column) @@ -71,27 +63,22 @@ public static void TextTransform() }; // Preview of the TextFeatures column obtained after processing the input. - var defaultColumn = transformedData_default.GetColumn>(ml, defaultColumnName).ToArray(); + var defaultColumn = transformedData_default.GetColumn>(ml, defaultColumnName); printHelper(defaultColumnName, defaultColumn); - // Transformed data REVIEW: why are the first two lines identical? Log a bug. 
- // 0.2581989 0.2581989 0.2581989 0.2581989 0.5163978 0.2581989 0.2581989 0.2581989 0.2581989 0.2581989 0.2581989 0.2581989 0.7071068 0.7071068 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 - // 0.2581989 0.2581989 0.2581989 0.2581989 0.5163978 0.2581989 0.2581989 0.2581989 0.2581989 0.2581989 0.2581989 0.2581989 0.7071068 0.7071068 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 - // 0 0.1230915 0.1230915 0.1230915 0.1230915 0.246183 0.246183 0.246183 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.3692745 0.246183 0.246183 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.246183 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.2886751 0 0 0 0 0 0 0.2886751 0.5773503 0.2886751 0.2886751 0.2886751 0.2886751 0.2886751 0.2886751 + // Transformed data + // 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 + // 0.2357023 0.2357023 0.2357023 0.2357023 0.4714046 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.5773503 0.5773503 0.5773503 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 + // 0 0.1230915 0.1230915 0.1230915 0.1230915 0.246183 0.246183 0.246183 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0.1230915 0 0 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.3692745 0.246183 0.246183 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.246183 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.2886751 0 0 0 0 0 0 0 0.2886751 0.5773503 0.2886751 0.2886751 0.2886751 0.2886751 0.2886751 0.2886751 // Preview of the TextFeatures column obtained after processing the input. - var customizedColumn = transformedData_customized.GetColumn>(ml, customizedColumnName).ToArray(); + var customizedColumn = transformedData_customized.GetColumn>(ml, customizedColumnName); printHelper(customizedColumnName, customizedColumn); // Transformed data + // 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 // 0.25 0.25 0.25 0.25 0.5 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.7071068 0.7071068 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 - // 0.25 0.25 0.25 0.25 0.5 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.7071068 0.7071068 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 - // 0 0.125 0.125 0.125 0.125 0.25 0.25 0.25 0.125 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.125 0.125 0.125 0.125 0.125 0.125 0.375 0.25 0.25 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.25 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.2672612 0.5345225 0 0 0 0 0 0.2672612 0.5345225 0.2672612 0.2672612 0.2672612 0.2672612 - } - - public static void MinMaxNormalizer() - { - + // 0 0.125 0.125 0.125 0.125 0.25 0.25 0.25 0.125 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.125 0.125 0.125 0.125 0.125 0.125 0.375 0.25 0.25 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.25 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 
0.125 0.125 0.125 0.125 0.2672612 0.5345225 0 0 0 0 0 0.2672612 0.5345225 0.2672612 0.2672612 0.2672612 0.2672612 } } } } diff --git a/docs/samples/Microsoft.ML.Samples/Program.cs b/docs/samples/Microsoft.ML.Samples/Program.cs index 7881a11bbc..e967de29cf 100644 --- a/docs/samples/Microsoft.ML.Samples/Program.cs +++ b/docs/samples/Microsoft.ML.Samples/Program.cs @@ -2,15 +2,13 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -namespace Microsoft.ML.Samples +namespace Microsoft.ML.Samples.Dynamic { internal static class Program { static void Main(string[] args) { - //Trainers.SdcaRegression(); - //Transformers.ConcatEstimator(); - ConvertCatalogTransformers.KeyToValue_Term(); + TransformSamples.MinMaxNormalizer(); } } } diff --git a/docs/samples/Microsoft.ML.Samples/Transformers.cs b/docs/samples/Microsoft.ML.Samples/Static/ConcatEstimator.cs similarity index 95% rename from docs/samples/Microsoft.ML.Samples/Transformers.cs rename to docs/samples/Microsoft.ML.Samples/Static/ConcatEstimator.cs index 67388754ca..297bc36153 100644 --- a/docs/samples/Microsoft.ML.Samples/Transformers.cs +++ b/docs/samples/Microsoft.ML.Samples/Static/ConcatEstimator.cs @@ -3,7 +3,7 @@ // See the LICENSE file in the project root for more information. // the alignment of the usings with the methods is intentional so they can display on the same level in the docs site. -using Microsoft.ML.Runtime.Api; + using Microsoft.ML.Runtime.Api; using Microsoft.ML.Runtime.Data; using Microsoft.ML.StaticPipe; using System; @@ -12,15 +12,15 @@ // NOTE: WHEN ADDING TO THE FILE, ALWAYS APPEND TO THE END OF IT. // If you change the existinc content, check that the files referencing it in the XML documentation are still correct, as they reference // line by line. 
-namespace Microsoft.ML.Samples +namespace Microsoft.ML.Samples.Static { - public static class Transformers + public partial class TransformSamples { /// /// The example for the statically typed concat estimator. /// - public static void ConcatEstimator() + public static void ConcatWith() { // Create a new environment for ML.NET operations. It can be used for exception tracking and logging, // as well as the source of randomness. diff --git a/docs/samples/Microsoft.ML.Samples/Trainers.cs b/docs/samples/Microsoft.ML.Samples/Static/SDCA.cs similarity index 97% rename from docs/samples/Microsoft.ML.Samples/Trainers.cs rename to docs/samples/Microsoft.ML.Samples/Static/SDCA.cs index b89361cbd1..9711aa6b79 100644 --- a/docs/samples/Microsoft.ML.Samples/Trainers.cs +++ b/docs/samples/Microsoft.ML.Samples/Static/SDCA.cs @@ -11,9 +11,9 @@ // NOTE: WHEN ADDING TO THE FILE, ALWAYS APPEND TO THE END OF IT. // If you change the existinc content, check that the files referencing it in the XML documentation are still correct, as they reference // line by line. -namespace Microsoft.ML.Samples +namespace Microsoft.ML.Samples.Static { - public static class Trainers + public partial class LearnersSamples { public static void SdcaRegression() diff --git a/src/Microsoft.ML.Data/Transforms/NormalizerCatalog.cs b/src/Microsoft.ML.Data/Transforms/NormalizerCatalog.cs index 5b13a71fa6..d9020d9a8f 100644 --- a/src/Microsoft.ML.Data/Transforms/NormalizerCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/NormalizerCatalog.cs @@ -21,6 +21,20 @@ public static class NormalizerCatalogExtensions /// The transform catalog /// The column name /// The normalization mode (). 
+ /// + /// + /// + /// + /// + /// + /// + /// + /// + /// public static Normalizer Normalizer(this TransformsCatalog catalog, string columnName, Normalizer.NormalizerMode mode = Runtime.Data.Normalizer.NormalizerMode.MinMax) => new Normalizer(CatalogUtils.GetEnvironment(catalog), columnName, mode); @@ -30,6 +44,19 @@ public static Normalizer Normalizer(this TransformsCatalog catalog, string colum /// The transform catalog /// The normalization mode (). /// The pairs of input and output columns. + /// + /// + /// + /// + /// + /// + /// + /// + /// public static Normalizer Normalizer(this TransformsCatalog catalog, Normalizer.NormalizerMode mode, params (string input, string output)[] columns) => new Normalizer(CatalogUtils.GetEnvironment(catalog), mode, columns); diff --git a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs index c5edebd7a7..725974fcd9 100644 --- a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs @@ -77,5 +77,35 @@ public static IEnumerable GetTopicsData() return data; } + + public class SampleInfertData + { + public int RowNum { get; set; } + public string Education { get; set; } + public float Age { get; set; } + public float Parity { get; set; } + public float Induced { get; set; } + public float Case { get; set; } + + public float Spontaneous { get; set; } + public float Stratum { get; set; } + public float PooledStratum { get; set; } + } + + public static IEnumerable GetInfertData() + { + var data = new List(); + data.Add(new SampleInfertData { + RowNum = 0, Education = "0-5yrs", Age = 26, Parity = 6, Induced = 1, Case = 1, Spontaneous = 2, Stratum = 1, PooledStratum = 3 }); + data.Add(new SampleInfertData { + RowNum = 1, Education = "0-5yrs", Age = 42, Parity = 1, Induced = 1, Case = 1, Spontaneous = 0, Stratum = 2, PooledStratum = 1 }); + data.Add(new SampleInfertData { + RowNum = 2, Education = "0-5yrs", Age = 39, 
Parity = 6, Induced = 2, Case = 1, Spontaneous = 0, Stratum = 3, PooledStratum = 4 }); + data.Add(new SampleInfertData { + RowNum = 3, Education = "0-5yrs", Age = 34, Parity = 4, Induced = 2, Case = 1, Spontaneous = 0, Stratum = 4, PooledStratum = 2 }); + data.Add(new SampleInfertData { + RowNum = 4, Education = "6-11yrs", Age = 35, Parity = 3, Induced = 1, Case = 1, Spontaneous = 1, Stratum = 5, PooledStratum = 32 }); + return data; + } } } diff --git a/src/Microsoft.ML.Transforms/TextTransformCatalog.cs b/src/Microsoft.ML.Transforms/TextTransformCatalog.cs index b06a3faa73..952f3e01c2 100644 --- a/src/Microsoft.ML.Transforms/TextTransformCatalog.cs +++ b/src/Microsoft.ML.Transforms/TextTransformCatalog.cs @@ -19,6 +19,18 @@ public static class TextTransformCatalog /// The output column /// Advanced transform settings /// + /// + /// + /// + /// + /// + /// + /// + /// /// public static TextTransform FeaturizeText(this TransformsCatalog.TextTransforms catalog, string inputColumn, string outputColumn = null, From f8664d90292c5d2c9985563f5239343bc0a23a2e Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Fri, 19 Oct 2018 13:48:03 -0700 Subject: [PATCH 04/13] fixing comments. --- .../Dynamic/KeyToValue_Term.cs | 20 +++++++++---------- .../Dynamic/MinMaxNormalizer.cs | 18 +++++++++-------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs index bba0c977df..b79f67f130 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs @@ -39,7 +39,7 @@ public static void KeyToValue_Term() var default_pipeline = new WordTokenizer(ml, "ReviewReverse", "ReviewReverse") .Append(new TermEstimator(ml, "ReviewReverse" , defaultColumnName)); - // Another pipeline, that customizes the advanced settings of the FeaturizeText transformer. 
+ // Another pipeline, that customizes the advanced settings of the TermEstimator. string customizedColumnName = "CustomizedKeys"; var customized_pipeline = new WordTokenizer(ml, "ReviewReverse", "ReviewReverse") .Append(new TermEstimator(ml, "ReviewReverse", customizedColumnName, maxNumTerms: 10, sort:TermTransform.SortOrder.Value)); @@ -48,7 +48,7 @@ public static void KeyToValue_Term() var transformedData_default = default_pipeline.Fit(trainData).Transform(trainData); var transformedData_customized = customized_pipeline.Fit(trainData).Transform(trainData); - // small helper to print the text inside the columns, in the console. + // Small helper to print the text inside the columns, in the console. Action>> printHelper = (columnName, column) => { Console.WriteLine($"{columnName} column obtained post-transformation."); @@ -62,9 +62,9 @@ public static void KeyToValue_Term() Console.WriteLine("==================================================="); }; - // Preview of the TextFeatures column obtained after processing the input. - var defaultColumn = transformedData_default.GetColumn>(ml, defaultColumnName); - printHelper(defaultColumnName, defaultColumn); + // Preview of the DefaultKeys column obtained after processing the input. + var defaultColumn = transformedData_default.GetColumn>(ml, defaultColumnName); + printHelper(defaultColumnName, defaultColumn); // DefaultKeys column obtained post-transformation. // 1 2 3 4 @@ -76,18 +76,18 @@ public static void KeyToValue_Term() var customizedColumn = transformedData_customized.GetColumn>(ml, customizedColumnName); printHelper(customizedColumnName, customizedColumn); - // CustomizedKeys column obtained post-transformation. 
+ // CustomizedKeys // 6 4 9 3 // 7 4 9 6 // 1 5 9 6 // 2 8 9 6 - // retrieve the original values, by appending the KeyToValue etimator to the existing pipelines - // to convert the keys back to the strings + // Retrieve the original values, by appending the KeyToValue estimator to the existing pipelines + // to convert the keys back to the strings. var pipeline = default_pipeline.Append(new KeyToValueEstimator(ml, defaultColumnName)); transformedData_default = pipeline.Fit(trainData).Transform(trainData); - // Preview of the DefaultColumnName column obtained + // Preview of the DefaultColumnName column obtained. var originalColumnBack = transformedData_default.GetColumn>>(ml, defaultColumnName); foreach (var row in originalColumnBack) @@ -97,7 +97,7 @@ Console.WriteLine(""); } - // DefaultColumnName column obtained post-transformation. + // DefaultKeys // radiation galaxy universe duck // space galaxy universe radiation // bus pickup universe radiation diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/MinMaxNormalizer.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/MinMaxNormalizer.cs index 372eec15c2..3a66d64d23 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/MinMaxNormalizer.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/MinMaxNormalizer.cs @@ -19,7 +19,7 @@ public static void MinMaxNormalizer() // as well as the source of randomness. var ml = new MLContext(seed: 1, conc: 1); - // Get a small dataset as an IEnumerable. + // Get a small dataset as an IEnumerable and convert it to an IDataView. IEnumerable data = SamplesUtils.DatasetUtils.GetInfertData(); var trainData = ml.CreateStreamingDataView(data); @@ -31,14 +31,14 @@ public static void MinMaxNormalizer() // 26 1 0-5yrs 1 6 3 1 ... // 42 1 0-5yrs 1 1 1 2 ... // 39 1 0-5yrs 2 6 4 3 ... // 34 1 0-5yrs 2 4 2 4 ... // 35 1 6-11yrs 1 3 32 5 ... - // A pipeline for concatenating the age, parity and induced columns together in the Features column + // A pipeline for normalizing the Induced column. 
var pipeline = ml.Transforms.Normalizer("Induced"); - // The transformed data. + // The transformed (normalized according to Normalizer.NormalizerMode.MinMax) data. var transformedData = pipeline.Fit(trainData).Transform(trainData); - // Getting the data of the newly created column as an Array, and + // Getting the data of the newly created column, so we can preview it. var normalizedColumn = transformedData.GetColumn(ml, "Induced"); - // A small printing utility + // A small printing utility. Action> printHelper = (colName, column) => { Console.WriteLine($"{colName} column obtained post-transformation."); @@ -47,6 +47,8 @@ public static void MinMaxNormalizer() }; printHelper("Induced", normalizedColumn); + + // Preview of the data. // Induced // 0.5 // 0.5 @@ -55,12 +57,12 @@ public static void MinMaxNormalizer() // 0.5 // Composing a different pipeline if we wanted to normalize more than one column at a time. - // A pipeline for concatenating the age, parity and induced columns together in the new columns - // using log scale + // Using log scale as the normalization mode. var multiColPipeline = ml.Transforms.Normalizer(Normalizer.NormalizerMode.LogMeanVariance, new[] { ("Induced", "LogInduced"), ("Spontaneous", "LogSpontaneous") }); // The transformed data. var multiColtransformedData = multiColPipeline.Fit(trainData).Transform(trainData); - // Getting the data of the newly created column as an Array, and + + // Getting the newly created columns. 
var normalizedInduced = multiColtransformedData.GetColumn(ml, "LogInduced"); var normalizedSpont = multiColtransformedData.GetColumn(ml, "LogSpontaneous"); From 8c8483db78e41e80ce2a6810995e5bb56a80d896 Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Fri, 19 Oct 2018 13:51:43 -0700 Subject: [PATCH 05/13] fit and finish --- .../Dynamic/ConcatTransform.cs | 4 ++-- .../Dynamic/MinMaxNormalizer.cs | 1 - .../Dynamic/TextTransform.cs | 18 +++++++++--------- .../Transforms/NormalizerCatalog.cs | 4 ++-- 4 files changed, 13 insertions(+), 14 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/ConcatTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/ConcatTransform.cs index cf3d47d17c..79de04ecc2 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/ConcatTransform.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/ConcatTransform.cs @@ -37,14 +37,14 @@ public static void ConcatTransform() // 34.0 1.0 0-5yrs 2.0 4.0 2.0 4.0 ... // 35.0 1.0 6-11yrs 1.0 3.0 32.0 5.0 ... - // A pipeline for concatenating the age, parity and induced columns together in the Features column + // A pipeline for concatenating the age, parity and induced columns together in the Features column. string outputColumnName = "Features"; var pipeline = new ConcatEstimator(ml, outputColumnName, new[] { "Age", "Parity", "Induced"}); // The transformed data. var transformedData = pipeline.Fit(trainData).Transform(trainData); - // Getting the data of the newly created column as an Array, and + // Getting the data of the newly created column as an IEnumerable of SampleInfertDataWithFeatures. 
var featuresColumn = transformedData.AsEnumerable(ml, reuseRowObject: false); Console.WriteLine($"{outputColumnName} column obtained post-transformation."); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/MinMaxNormalizer.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/MinMaxNormalizer.cs index 3a66d64d23..ae4814f54c 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/MinMaxNormalizer.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/MinMaxNormalizer.cs @@ -83,7 +83,6 @@ public static void MinMaxNormalizer() // 0 // 0 // 0.1586974 - } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs index a5584e6187..fb93c5565f 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs @@ -19,7 +19,7 @@ public static void TextTransform() // as well as the source of randomness. var ml = new MLContext(seed: 1, conc: 1); - // Get a small dataset as an IEnumerable. + // Get a small dataset as an IEnumerable and convert to IDataView. IEnumerable data = SamplesUtils.DatasetUtils.GetSentimentData(); var trainData = ml.CreateStreamingDataView(data); @@ -29,8 +29,8 @@ public static void TextTransform() // false ==RUDE== Dude, 2. // true Until the next game, this is the best Xbox game! - // A pipeline for featurization of the "SentimentText" column, and placing the output in a new column named "TextFeatures" - // making use of default settings. + // A pipeline for featurization of the "SentimentText" column, and placing the output in a new column named "DefaultTextFeatures" + // The pipeline uses the default settings to featurize. 
string defaultColumnName = "DefaultTextFeatures"; var default_pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", defaultColumnName); @@ -44,11 +44,11 @@ public static void TextTransform() s.TextLanguage = Runtime.Data.TextTransform.Language.English; // supports English, French, German, Dutch, Italian, Spanish, Japanese }); - // The transformed data. + // The transformed data for both pipelines. var transformedData_default = default_pipeline.Fit(trainData).Transform(trainData); var transformedData_customized = customized_pipeline.Fit(trainData).Transform(trainData); - // small helper to print the text inside the columns, in the console. + // Small helper to print the text inside the columns, in the console. Action>> printHelper = (columnName, column) => { Console.WriteLine($"{columnName} column obtained post-transformation."); @@ -62,20 +62,20 @@ public static void TextTransform() Console.WriteLine("==================================================="); }; - // Preview of the TextFeatures column obtained after processing the input. + // Preview of the DefaultTextFeatures column obtained after processing the input. 
var defaultColumn = transformedData_default.GetColumn>(ml, defaultColumnName); printHelper(defaultColumnName, defaultColumn); - // Transformed data + // DefaultTextFeatures // 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 // 0.2357023 0.2357023 0.2357023 0.2357023 0.4714046 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.5773503 0.5773503 0.5773503 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 // 0 0.1230915 0.1230915 0.1230915 0.1230915 0.246183 0.246183 0.246183 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.1230915 0 0 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.3692745 0.246183 0.246183 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.246183 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.2886751 0 0 0 0 0 0 0 0.2886751 0.5773503 0.2886751 0.2886751 0.2886751 0.2886751 0.2886751 0.2886751 - // Preview of the TextFeatures column obtained after processing the input. + // Preview of the CustomizedTextFeatures column obtained after processing the input. 
var customizedColumn = transformedData_customized.GetColumn>(ml, customizedColumnName); printHelper(customizedColumnName, customizedColumn); - // Transformed data + // CustomizedTextFeatures // 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 // 0.25 0.25 0.25 0.25 0.5 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.7071068 0.7071068 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 // 0 0.125 0.125 0.125 0.125 0.25 0.25 0.25 0.125 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.125 0.125 0.125 0.125 0.125 0.125 0.375 0.25 0.25 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.25 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.2672612 0.5345225 0 0 0 0 0 0.2672612 0.5345225 0.2672612 0.2672612 0.2672612 0.2672612 } diff --git a/src/Microsoft.ML.Data/Transforms/NormalizerCatalog.cs b/src/Microsoft.ML.Data/Transforms/NormalizerCatalog.cs index d9020d9a8f..af0e299e7a 100644 --- a/src/Microsoft.ML.Data/Transforms/NormalizerCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/NormalizerCatalog.cs @@ -31,7 +31,7 @@ public static class NormalizerCatalogExtensions /// /// /// /// /// @@ -53,7 +53,7 @@ public static Normalizer Normalizer(this TransformsCatalog catalog, string colum /// /// /// /// /// From 3a5490c14c7968118f64f346578c1877e35fb844 Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Fri, 19 Oct 2018 14:09:05 -0700 Subject: [PATCH 06/13] typo --- src/Microsoft.ML.Data/Transforms/NormalizerCatalog.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Microsoft.ML.Data/Transforms/NormalizerCatalog.cs b/src/Microsoft.ML.Data/Transforms/NormalizerCatalog.cs index af0e299e7a..0ebad5ced4 100644 --- a/src/Microsoft.ML.Data/Transforms/NormalizerCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/NormalizerCatalog.cs @@ -44,6 +44,7 @@ public static Normalizer Normalizer(this TransformsCatalog catalog, 
string colum /// The transform catalog /// The normalization mode (). /// The pairs of input and output columns. + /// /// /// Date: Mon, 22 Oct 2018 23:57:11 -0700 Subject: [PATCH 07/13] merging from master --- .../samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs index b79f67f130..c16612a86c 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs @@ -6,7 +6,7 @@ using Microsoft.ML.Data; using Microsoft.ML.Runtime.Api; using Microsoft.ML.Runtime.Data; - using Microsoft.ML.Transforms; + using Microsoft.ML.Transforms.Text; using System; using System.Collections.Generic; using System.Linq; @@ -36,12 +36,12 @@ public static void KeyToValue_Term() // making use of default settings. string defaultColumnName = "DefaultKeys"; // REVIEW create through the catalog extension - var default_pipeline = new WordTokenizer(ml, "ReviewReverse", "ReviewReverse") + var default_pipeline = new WordTokenizeEstimator(ml, "ReviewReverse", "ReviewReverse") .Append(new TermEstimator(ml, "ReviewReverse" , defaultColumnName)); // Another pipeline, that customizes the advanced settings of the TermEstimator. string customizedColumnName = "CustomizedKeys"; - var customized_pipeline = new WordTokenizer(ml, "ReviewReverse", "ReviewReverse") + var customized_pipeline = new WordTokenizeEstimator(ml, "ReviewReverse", "ReviewReverse") .Append(new TermEstimator(ml, "ReviewReverse", customizedColumnName, maxNumTerms: 10, sort:TermTransform.SortOrder.Value)); // The transformed data. 
From 9f84319e20e8731f80c77674cd8c70111b79af2d Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Tue, 23 Oct 2018 11:19:20 -0700 Subject: [PATCH 08/13] justin's comments --- .../Dynamic/KeyToValue_Term.cs | 8 ++++-- .../SamplesDatasetUtils.cs | 26 +++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs index b79f67f130..3aa3e0429b 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs @@ -25,8 +25,9 @@ public static void KeyToValue_Term() IEnumerable data = SamplesUtils.DatasetUtils.GetTopicsData(); var trainData = ml.CreateStreamingDataView(data); - // Preview of the data. - // Review ReviewReverse, Label + // Preview of the topics data; a dataset that contains one column with two set of keys assigned to a body of text + // Review and ReviewReverse. The dataset will be used to classify how accurately the keys are assigned to the text. + // Review, ReviewReverse, Label // "animals birds cats dogs fish horse", "radiation galaxy universe duck", 1 // "horse birds house fish duck cats", "space galaxy universe radiation", 0 // "car truck driver bus pickup", "bus pickup", 1 @@ -40,6 +41,9 @@ public static void KeyToValue_Term() .Append(new TermEstimator(ml, "ReviewReverse" , defaultColumnName)); // Another pipeline, that customizes the advanced settings of the TermEstimator. + // We can change the maxNumTerm to limit how many keys will get generated out of the set of words, + // and condition the order in which they get evaluated by changing sort from the default Occurence (order in which they get encountered) + // to value/alphabetically. 
string customizedColumnName = "CustomizedKeys"; var customized_pipeline = new WordTokenizer(ml, "ReviewReverse", "ReviewReverse") .Append(new TermEstimator(ml, "ReviewReverse", customizedColumnName, maxNumTerms: 10, sort:TermTransform.SortOrder.Value)); diff --git a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs index 725974fcd9..a8090dba8b 100644 --- a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs @@ -25,6 +25,10 @@ public static string DownloadHousingRegressionDataset() return dataFile; } + /// + /// A simple set of features that help generate the Target column, according to a function. + /// Used for the transformers/estimators working on numeric data. + /// public class SampleInput { public float Feature0 { get; set; } @@ -34,6 +38,9 @@ public class SampleInput public float Target { get; set; } } + /// + /// Returns a sample of a numeric dataset. + /// public static IEnumerable GetInputData() { var data = new List(); @@ -44,12 +51,18 @@ public static IEnumerable GetInputData() return data; } + /// + /// A dataset that contains a tweet and the sentiment assigned to that tweet: 0 - negative and 1 - positive sentiment. + /// public class SampleSentimentData { public bool Sentiment { get; set; } public string SentimentText { get; set; } } + /// + /// Returns a sample of the sentiment dataset. + /// public static IEnumerable GetSentimentData() { var data = new List(); @@ -60,6 +73,10 @@ public static IEnumerable GetSentimentData() return data; } + /// + /// A dataset that contains one column with two set of keys assigned to a body of text: Review and ReviewReverse. + /// The dataset will be used to classify how accurately the keys are assigned to the text. 
+ /// public class SampleTopicsData { public string Review { get; set; } @@ -67,6 +84,9 @@ public class SampleTopicsData public bool Label { get; set; } } + /// + /// Returns a sample of the topics dataset. + /// public static IEnumerable GetTopicsData() { var data = new List(); @@ -78,6 +98,9 @@ public static IEnumerable GetTopicsData() return data; } + /// + /// Represents the column of the infertility dataset. + /// public class SampleInfertData { public int RowNum { get; set; } @@ -92,6 +115,9 @@ public class SampleInfertData public float PooledStratum { get; set; } } + /// + /// Returns a few rows of the infertility dataset. + /// public static IEnumerable GetInfertData() { var data = new List(); From 80aa06ddbbc22c91c2dd85018227c2d8e87e9eef Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Tue, 23 Oct 2018 11:20:05 -0700 Subject: [PATCH 09/13] spaces are not cool. --- src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs index a8090dba8b..46e8a0b460 100644 --- a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs @@ -52,7 +52,7 @@ public static IEnumerable GetInputData() } /// - /// A dataset that contains a tweet and the sentiment assigned to that tweet: 0 - negative and 1 - positive sentiment. + /// A dataset that contains a tweet and the sentiment assigned to that tweet: 0 - negative and 1 - positive sentiment. /// public class SampleSentimentData { @@ -99,7 +99,7 @@ public static IEnumerable GetTopicsData() } /// - /// Represents the column of the infertility dataset. + /// Represents the column of the infertility dataset. /// public class SampleInfertData { @@ -116,7 +116,7 @@ public class SampleInfertData } /// - /// Returns a few rows of the infertility dataset. + /// Returns a few rows of the infertility dataset. 
/// public static IEnumerable GetInfertData() { From e60c1e000da4986ca7bd17a2c21e9cb68c5b9b44 Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Tue, 23 Oct 2018 14:11:59 -0700 Subject: [PATCH 10/13] updating the datasets entries. --- test/data/README.md | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/test/data/README.md b/test/data/README.md index 61cded9255..6a21ece35f 100644 --- a/test/data/README.md +++ b/test/data/README.md @@ -73,6 +73,21 @@ Redistributing the dataset "[housing.txt](housing.txt)" with attribution: More information: https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.names +### Air Quality + +This dataset is from the R documentation: [New York Air Quality Measurements]https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/airquality.html +The data were obtained from the New York State Department of Conservation (ozone data) and the National Weather Service (meteorological data). +References: Chambers, J. M., Cleveland, W. S., Kleiner, B. and Tukey, P. A. (1983) Graphical Methods for Data Analysis. Belmont, CA: Wadsworth. + +The dataset is distributed under [GPLv2](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html) + +### Infertility + +This dataset is from the R documentation: [Infertility after Spontaneous and Induced Abortion]https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/infert.html +Original source: Trichopoulos et al (1976) Br. J. of Obst. and Gynaec. 83, 645–650. 
+ +The dataset is distributed under [GPLv2](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html) + # Images ### Located in `images` folder @@ -81,4 +96,4 @@ More information: https://archive.ics.uci.edu/ml/machine-learning-databases/hous > > "[Hot dog with mustard](https://visualsonline.cancer.gov/details.cfm?imageid=2669)" by Renee Comet is in the public domain - this image was released by the [National Cancer Institute](https://visualsonline.cancer.gov/details.cfm?imageid=2669) > -> "[Bright red tomato and cross section02](https://upload.wikimedia.org/wikipedia/commons/8/88/Bright_red_tomato_and_cross_section02.jpg)" by [fir0002](https://en.wikipedia.org/wiki/User:Fir0002) is licensed under the [CC BY-NC](https://creativecommons.org/licenses/by/2.0/) +> "[Bright red tomato and cross section02](https://upload.wikimedia.org/wikipedia/commons/8/88/Bright_red_tomato_and_cross_section02.jpg)" by [fir0002](https://en.wikipedia.org/wiki/User:Fir0002) is licensed under the [CC BY-NC](https://creativecommons.org/licenses/by/2.0/) \ No newline at end of file From 1ec64892970e4e16ac889c6613735b4694bb4881 Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Wed, 24 Oct 2018 15:35:11 -0700 Subject: [PATCH 11/13] addressing comments --- .../Microsoft.ML.Samples/Microsoft.ML.Samples.csproj | 1 + docs/samples/Microsoft.ML.Samples/Program.cs | 5 ++++- docs/samples/Microsoft.ML.Samples/Static/FastTree.cs | 12 ++++++------ docs/samples/Microsoft.ML.Samples/Static/LightGBM.cs | 10 +++++----- 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj index 263060abeb..4c2f9385b9 100644 --- a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj +++ b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj @@ -13,6 +13,7 @@ + false diff --git a/docs/samples/Microsoft.ML.Samples/Program.cs b/docs/samples/Microsoft.ML.Samples/Program.cs 
index e967de29cf..ca5437a691 100644 --- a/docs/samples/Microsoft.ML.Samples/Program.cs +++ b/docs/samples/Microsoft.ML.Samples/Program.cs @@ -2,13 +2,16 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -namespace Microsoft.ML.Samples.Dynamic +using Microsoft.ML.Samples.Dynamic; + +namespace Microsoft.ML.Samples { internal static class Program { static void Main(string[] args) { TransformSamples.MinMaxNormalizer(); + // TrainersSamples.LightGbmRegression(); } } } diff --git a/docs/samples/Microsoft.ML.Samples/Static/FastTree.cs b/docs/samples/Microsoft.ML.Samples/Static/FastTree.cs index f232d9e898..2fde291d7c 100644 --- a/docs/samples/Microsoft.ML.Samples/Static/FastTree.cs +++ b/docs/samples/Microsoft.ML.Samples/Static/FastTree.cs @@ -37,7 +37,7 @@ public static void FastTreeRegression() separator: '\t', hasHeader: true); // Read the data, and leave 10% out, so we can use them for testing - var data = reader.Read(new MultiFileSource(dataFile)); + var data = reader.Read(dataFile); // The predictor that gets produced out of training FastTreeRegressionPredictor pred = null; @@ -63,11 +63,11 @@ public static void FastTreeRegression() Rms: cvResults.Select(r => r.metrics.Rms).Average(), RSquared: cvResults.Select(r => r.metrics.RSquared).Average() ); - Console.WriteLine($"L1 - {averagedMetrics.L1}"); - Console.WriteLine($"L2 - {averagedMetrics.L2}"); - Console.WriteLine($"LossFunction - {averagedMetrics.LossFn}"); - Console.WriteLine($"RMS - {averagedMetrics.Rms}"); - Console.WriteLine($"RSquared - {averagedMetrics.RSquared}"); + Console.WriteLine($"L1 - {averagedMetrics.L1}"); // 3.091095 + Console.WriteLine($"L2 - {averagedMetrics.L2}"); // 20.351073 + Console.WriteLine($"LossFunction - {averagedMetrics.LossFn}"); // 20.351074 + Console.WriteLine($"RMS - {averagedMetrics.Rms}"); // 4.478358 + Console.WriteLine($"RSquared - {averagedMetrics.RSquared}"); // 0.754977 } } diff 
--git a/docs/samples/Microsoft.ML.Samples/Static/LightGBM.cs b/docs/samples/Microsoft.ML.Samples/Static/LightGBM.cs index c6a052c38a..ccc6b4115a 100644 --- a/docs/samples/Microsoft.ML.Samples/Static/LightGBM.cs +++ b/docs/samples/Microsoft.ML.Samples/Static/LightGBM.cs @@ -68,11 +68,11 @@ public static void LightGbmRegression() var dataWithPredictions = model.Transform(testData); var metrics = regressionContext.Evaluate(dataWithPredictions, r => r.label, r => r.score); - Console.WriteLine($"L1 - {metrics.L1}"); - Console.WriteLine($"L2 - {metrics.L2}"); - Console.WriteLine($"LossFunction - {metrics.LossFn}"); - Console.WriteLine($"RMS - {metrics.Rms}"); - Console.WriteLine($"RSquared - {metrics.RSquared}"); + Console.WriteLine($"L1 - {metrics.L1}"); // 4.9669731 + Console.WriteLine($"L2 - {metrics.L2}"); // 51.37296 + Console.WriteLine($"LossFunction - {metrics.LossFn}"); // 51.37296 + Console.WriteLine($"RMS - {metrics.Rms}"); // 7.167493 + Console.WriteLine($"RSquared - {metrics.RSquared}"); // 0.079478 } } } From 01a327ac8d9435ccb9c5ed571b1271a42f6c6857 Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Thu, 25 Oct 2018 09:57:10 -0700 Subject: [PATCH 12/13] PR comments --- .../Dynamic/ConcatTransform.cs | 4 +++- .../Dynamic/KeyToValue_Term.cs | 15 ++++++++++----- .../Dynamic/MinMaxNormalizer.cs | 11 +++++++---- .../Microsoft.ML.Samples/Dynamic/TextTransform.cs | 7 +++++-- 4 files changed, 25 insertions(+), 12 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/ConcatTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/ConcatTransform.cs index 79de04ecc2..4717c808e1 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/ConcatTransform.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/ConcatTransform.cs @@ -30,6 +30,7 @@ public static void ConcatTransform() var trainData = ml.CreateStreamingDataView(data); // Preview of the data. + // // Age Case Education induced parity pooled.stratum row_num ... // 26.0 1.0 0-5yrs 1.0 6.0 3.0 1.0 ... 
// 42.0 1.0 0-5yrs 1.0 1.0 1.0 2.0 ... @@ -55,7 +56,8 @@ public static void ConcatTransform() Console.WriteLine(""); } - // Features + // Features column obtained post-transformation. + // // 26 6 1 // 42 1 1 // 39 6 2 diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs index 73b4f9d240..134966e8c0 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs @@ -25,8 +25,10 @@ public static void KeyToValue_Term() IEnumerable data = SamplesUtils.DatasetUtils.GetTopicsData(); var trainData = ml.CreateStreamingDataView(data); - // Preview of the topics data; a dataset that contains one column with two set of keys assigned to a body of text - // Review and ReviewReverse. The dataset will be used to classify how accurately the keys are assigned to the text. + // Preview of the topics data; a dataset that contains two columns containing keys independently assigned to a body of text, + // Review and ReviewReverse. The Label colum indicates whether the set of keys in the ReviewReverse match the ones in the review column. + // The dataset will be used to classify how accurately the keys are assigned to the text. + // // Review, ReviewReverse, Label // "animals birds cats dogs fish horse", "radiation galaxy universe duck", 1 // "horse birds house fish duck cats", "space galaxy universe radiation", 0 @@ -37,7 +39,7 @@ public static void KeyToValue_Term() // making use of default settings. string defaultColumnName = "DefaultKeys"; // REVIEW create through the catalog extension - var default_pipeline = new WordTokenizeEstimator(ml, "ReviewReverse", "ReviewReverse") + var default_pipeline = new WordTokenizeEstimator(ml, "ReviewReverse") .Append(new TermEstimator(ml, "ReviewReverse" , defaultColumnName)); // Another pipeline, that customizes the advanced settings of the TermEstimator. 
@@ -71,6 +73,7 @@ public static void KeyToValue_Term() printHelper(defaultColumnName, defaultColumn); // DefaultKeys column obtained post-transformation. + // // 1 2 3 4 // 5 2 3 1 // 6 7 3 1 @@ -80,7 +83,8 @@ public static void KeyToValue_Term() var customizedColumn = transformedData_customized.GetColumn>(ml, customizedColumnName); printHelper(customizedColumnName, customizedColumn); - // CustomizedKeys + // CustomizedKeys column obtained post-transformation. + // // 6 4 9 3 // 7 4 9 6 // 1 5 9 6 @@ -101,7 +105,8 @@ public static void KeyToValue_Term() Console.WriteLine(""); } - // DefaultKeys + // DefaultKeys column obtained post-transformation. + // // radiation galaxy universe duck // space galaxy universe radiation // bus pickup universe radiation diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/MinMaxNormalizer.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/MinMaxNormalizer.cs index ae4814f54c..793cf09b2d 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/MinMaxNormalizer.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/MinMaxNormalizer.cs @@ -24,6 +24,7 @@ public static void MinMaxNormalizer() var trainData = ml.CreateStreamingDataView(data); // Preview of the data. + // // Age Case Education Induced Parity PooledStratum RowNum ... // 26 1 0-5yrs 1 6 3 1 ... // 42 1 0-5yrs 1 1 1 2 ... @@ -48,8 +49,8 @@ public static void MinMaxNormalizer() printHelper("Induced", normalizedColumn); - // Preview of the data. - // Induced + // Induced column obtained post-transformation. + // // 0.5 // 0.5 // 1 @@ -68,7 +69,8 @@ public static void MinMaxNormalizer() printHelper("LogInduced", normalizedInduced); - // LogInduced + // LogInduced column obtained post-transformation. + // // 0.2071445 // 0.2071445 // 0.889631 @@ -77,7 +79,8 @@ public static void MinMaxNormalizer() printHelper("LogSpontaneous", normalizedSpont); - // LogSpontaneous + // LogSpontaneous column obtained post-transformation. 
+ // // 0.8413026 // 0 // 0 diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs index fb93c5565f..b16bb963d5 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs @@ -24,6 +24,7 @@ public static void TextTransform() var trainData = ml.CreateStreamingDataView(data); // Preview of the data. + // // Sentiment SentimentText // true Best game I've ever played. // false ==RUDE== Dude, 2. @@ -66,7 +67,8 @@ public static void TextTransform() var defaultColumn = transformedData_default.GetColumn>(ml, defaultColumnName); printHelper(defaultColumnName, defaultColumn); - // DefaultTextFeatures + // DefaultTextFeatures column obtained post-transformation. + // // 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 // 0.2357023 0.2357023 0.2357023 0.2357023 0.4714046 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.5773503 0.5773503 0.5773503 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 // 0 0.1230915 0.1230915 0.1230915 0.1230915 0.246183 0.246183 0.246183 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.1230915 0 0 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.3692745 0.246183 0.246183 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.246183 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.2886751 0 0 0 0 0 0 0 0.2886751 0.5773503 0.2886751 0.2886751 
0.2886751 0.2886751 0.2886751 0.2886751 @@ -75,7 +77,8 @@ public static void TextTransform() var customizedColumn = transformedData_customized.GetColumn>(ml, customizedColumnName); printHelper(customizedColumnName, customizedColumn); - // CustomizedTextFeatures + // CustomizedTextFeatures column obtained post-transformation. + // // 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 // 0.25 0.25 0.25 0.25 0.5 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.7071068 0.7071068 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 // 0 0.125 0.125 0.125 0.125 0.25 0.25 0.25 0.125 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.125 0.125 0.125 0.125 0.125 0.125 0.375 0.25 0.25 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.25 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.2672612 0.5345225 0 0 0 0 0 0.2672612 0.5345225 0.2672612 0.2672612 0.2672612 0.2672612 } From eff8504f7430411980f2bab6d903e5150633b533 Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Thu, 25 Oct 2018 13:35:53 -0700 Subject: [PATCH 13/13] addressingPR comments re-arranging lines. 
--- .../Dynamic/ConcatTransform.cs | 3 +- .../Dynamic/KeyToValue_Term.cs | 51 ++++++------- .../Dynamic/MinMaxNormalizer.cs | 2 +- .../Dynamic/TextTransform.cs | 2 +- docs/samples/Microsoft.ML.Samples/Program.cs | 3 +- .../Static/ConcatEstimator.cs | 76 ------------------- .../Microsoft.ML.Samples/Static/FastTree.cs | 16 ++-- .../Microsoft.ML.Samples/Static/LightGBM.cs | 16 ++-- .../Microsoft.ML.Samples/Static/SDCA.cs | 13 ++-- .../Transforms/NormalizerCatalog.cs | 4 +- src/Microsoft.ML.FastTree/FastTreeStatic.cs | 2 +- src/Microsoft.ML.LightGBM/LightGbmStatic.cs | 2 +- .../Standard/SdcaCatalog.cs | 2 +- .../Standard/SdcaStatic.cs | 2 +- .../TextTransformCatalog.cs | 4 +- 15 files changed, 56 insertions(+), 142 deletions(-) delete mode 100644 docs/samples/Microsoft.ML.Samples/Static/ConcatEstimator.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/ConcatTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/ConcatTransform.cs index 4717c808e1..63c0fd850d 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/ConcatTransform.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/ConcatTransform.cs @@ -5,7 +5,6 @@ // the alignment of the usings with the methods is intentional so they can display on the same level in the docs site. using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Api; - using Microsoft.ML.Data; using System; using System.Linq; using System.Collections.Generic; @@ -23,7 +22,7 @@ public static void ConcatTransform() { // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, // as well as the source of randomness. - var ml = new MLContext(seed: 1, conc: 1); + var ml = new MLContext(); // Get a small dataset as an IEnumerable. 
IEnumerable data = SamplesUtils.DatasetUtils.GetInfertData(); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs index 134966e8c0..85609c336f 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs @@ -19,36 +19,35 @@ public static void KeyToValue_Term() { // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, // as well as the source of randomness. - var ml = new MLContext(seed: 1, conc: 1); + var ml = new MLContext(); // Get a small dataset as an IEnumerable. IEnumerable data = SamplesUtils.DatasetUtils.GetTopicsData(); var trainData = ml.CreateStreamingDataView(data); - // Preview of the topics data; a dataset that contains two columns containing keys independently assigned to a body of text, - // Review and ReviewReverse. The Label colum indicates whether the set of keys in the ReviewReverse match the ones in the review column. - // The dataset will be used to classify how accurately the keys are assigned to the text. + // Preview of one of the columns of the the topics data. + // The Review column contains the keys associated with a particular body of text. // - // Review, ReviewReverse, Label - // "animals birds cats dogs fish horse", "radiation galaxy universe duck", 1 - // "horse birds house fish duck cats", "space galaxy universe radiation", 0 - // "car truck driver bus pickup", "bus pickup", 1 - // "car truck driver bus pickup horse", "car truck", 0 + // Review + // "animals birds cats dogs fish horse" + // "horse birds house fish duck cats" + // "car truck driver bus pickup" + // "car truck driver bus pickup horse" - // A pipeline to convert the terms of the review_reverse column in + // A pipeline to convert the terms of the 'Review' column in // making use of default settings. 
string defaultColumnName = "DefaultKeys"; // REVIEW create through the catalog extension - var default_pipeline = new WordTokenizeEstimator(ml, "ReviewReverse") - .Append(new TermEstimator(ml, "ReviewReverse" , defaultColumnName)); + var default_pipeline = new WordTokenizeEstimator(ml, "Review") + .Append(new TermEstimator(ml, "Review", defaultColumnName)); // Another pipeline, that customizes the advanced settings of the TermEstimator. // We can change the maxNumTerm to limit how many keys will get generated out of the set of words, // and condition the order in which they get evaluated by changing sort from the default Occurence (order in which they get encountered) // to value/alphabetically. string customizedColumnName = "CustomizedKeys"; - var customized_pipeline = new WordTokenizeEstimator(ml, "ReviewReverse", "ReviewReverse") - .Append(new TermEstimator(ml, "ReviewReverse", customizedColumnName, maxNumTerms: 10, sort:TermTransform.SortOrder.Value)); + var customized_pipeline = new WordTokenizeEstimator(ml, "Review") + .Append(new TermEstimator(ml, "Review", customizedColumnName, maxNumTerms: 10, sort:TermTransform.SortOrder.Value)); // The transformed data. var transformedData_default = default_pipeline.Fit(trainData).Transform(trainData); @@ -74,10 +73,10 @@ public static void KeyToValue_Term() // DefaultKeys column obtained post-transformation. // - // 1 2 3 4 - // 5 2 3 1 - // 6 7 3 1 - // 8 9 3 1 + // 1 2 3 4 5 6 + // 6 2 7 5 8 3 + // 9 10 11 12 13 3 + // 9 10 11 12 13 6 // Previewing the CustomizedKeys column obtained after processing the input. var customizedColumn = transformedData_customized.GetColumn>(ml, customizedColumnName); @@ -85,10 +84,10 @@ public static void KeyToValue_Term() // CustomizedKeys column obtained post-transformation. 
// - // 6 4 9 3 - // 7 4 9 6 - // 1 5 9 6 - // 2 8 9 6 + // 1 2 4 5 7 8 + // 8 2 9 7 6 4 + // 3 10 0 0 0 4 + // 3 10 0 0 0 8 // Retrieve the original values, by appending the KeyToValue etimator to the existing pipelines // to convert the keys back to the strings. @@ -107,10 +106,10 @@ public static void KeyToValue_Term() // DefaultKeys column obtained post-transformation. // - // radiation galaxy universe duck - // space galaxy universe radiation - // bus pickup universe radiation - // car truck universe radiation + // animals birds cats dogs fish horse + // horse birds house fish duck cats + // car truck driver bus pickup cats + // car truck driver bus pickup horse } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/MinMaxNormalizer.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/MinMaxNormalizer.cs index 793cf09b2d..23501b0731 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/MinMaxNormalizer.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/MinMaxNormalizer.cs @@ -17,7 +17,7 @@ public static void MinMaxNormalizer() { // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, // as well as the source of randomness. - var ml = new MLContext(seed: 1, conc: 1); + var ml = new MLContext(); // Get a small dataset as an IEnumerable and convert it to an IDataView. IEnumerable data = SamplesUtils.DatasetUtils.GetInfertData(); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs index b16bb963d5..528a96f981 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs @@ -17,7 +17,7 @@ public static void TextTransform() { // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, // as well as the source of randomness. 
- var ml = new MLContext(seed: 1, conc: 1); + var ml = new MLContext(); // Get a small dataset as an IEnumerable and convert to IDataView. IEnumerable data = SamplesUtils.DatasetUtils.GetSentimentData(); diff --git a/docs/samples/Microsoft.ML.Samples/Program.cs b/docs/samples/Microsoft.ML.Samples/Program.cs index ca5437a691..c2c9ef37a6 100644 --- a/docs/samples/Microsoft.ML.Samples/Program.cs +++ b/docs/samples/Microsoft.ML.Samples/Program.cs @@ -10,8 +10,7 @@ internal static class Program { static void Main(string[] args) { - TransformSamples.MinMaxNormalizer(); - // TrainersSamples.LightGbmRegression(); + TransformSamples.KeyToValue_Term(); } } } diff --git a/docs/samples/Microsoft.ML.Samples/Static/ConcatEstimator.cs b/docs/samples/Microsoft.ML.Samples/Static/ConcatEstimator.cs deleted file mode 100644 index 297bc36153..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Static/ConcatEstimator.cs +++ /dev/null @@ -1,76 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -// the alignment of the usings with the methods is intentional so they can display on the same level in the docs site. - using Microsoft.ML.Runtime.Api; - using Microsoft.ML.Runtime.Data; - using Microsoft.ML.StaticPipe; - using System; - using System.Collections.Generic; - -// NOTE: WHEN ADDING TO THE FILE, ALWAYS APPEND TO THE END OF IT. -// If you change the existinc content, check that the files referencing it in the XML documentation are still correct, as they reference -// line by line. -namespace Microsoft.ML.Samples.Static -{ - public partial class TransformSamples - { - - /// - /// The example for the statically typed concat estimator. - /// - public static void ConcatWith() - { - // Create a new environment for ML.NET operations. It can be used for exception tracking and logging, - // as well as the source of randomness. 
- var env = new LocalEnvironment(); - - IEnumerable data = SamplesUtils.DatasetUtils.GetInputData(); - - // A preview of InputData: - // feature_0; feature_1; feature_2; feature_3; target - // -2.75; 0.77; -0.61; 0.14; 140.66 - // -0.61; -0.37; -0.12; 0.55; 148.12 - // -0.85; -0.91; 1.81; 0.02; 402.20 - - // Convert to an DataView. - var trainingData = env.CreateStreamingDataView(data); - - // Convert the IDataView to statically-typed data view, so its schema can be used on the - // pipelines that will get built in top of it. - var staticData = trainingData.AssertStatic(env, c => ( - Feature0: c.R4.Scalar, - Feature1: c.R4.Scalar, - Feature2: c.R4.Scalar, - Feature3: c.R4.Scalar, - Target: c.R4.Scalar)); - - // Start creating our processing pipeline. - // Let just concatenate all the float columns together into one using ConcatWith. - var staticLearningPipeline = staticData.MakeNewEstimator() - .Append(r => ( - r.Target, - Features: r.Feature0.ConcatWith(r.Feature1, r.Feature2, r.Feature3))); - - // Transform the data through the above pipeline. - var transformedData = staticLearningPipeline.Fit(staticData).Transform(staticData); - - // The transformedData DataView is now of the type (Target:Scalar, Features:Vector). - - // Features target - // -2.75 0.77 -0.61 0.14; 140.66 - // -0.61 -0.37 -0.12 0.55; 148.12 - // -0.85 -0.91 1.81 0.02; 402.20 - - // Let's print out the new data. 
- var features = transformedData.GetColumn(r => r.Features); - - Console.WriteLine("Features column obtained post-transformation."); - foreach (var featureRow in features) - { - Console.WriteLine($"{featureRow[0]} {featureRow[1]} {featureRow[2]} {featureRow[3]}"); - } - } - } -} diff --git a/docs/samples/Microsoft.ML.Samples/Static/FastTree.cs b/docs/samples/Microsoft.ML.Samples/Static/FastTree.cs index 2fde291d7c..639fec5990 100644 --- a/docs/samples/Microsoft.ML.Samples/Static/FastTree.cs +++ b/docs/samples/Microsoft.ML.Samples/Static/FastTree.cs @@ -4,7 +4,7 @@ // the alignment of the usings with the methods is intentional so they can display on the same level in the docs site. using Microsoft.ML.Runtime.Data; - using Microsoft.ML.Runtime.FastTree; + using Microsoft.ML.Trainers.FastTree; using Microsoft.ML.StaticPipe; using System; using System.Linq; @@ -23,14 +23,12 @@ public static void FastTreeRegression() // you can open the file to see the data. string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset(); - // Creating the ML.Net IHostEnvironment object, needed for the pipeline - var env = new LocalEnvironment(seed: 0); - - // Creating the ML context, based on the task performed. - var regressionContext = new RegressionContext(env); + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. 
+ var mlContext = new MLContext(); // Creating a data reader, based on the format of the data - var reader = TextLoader.CreateReader(env, c => ( + var reader = TextLoader.CreateReader(mlContext, c => ( label: c.LoadFloat(0), features: c.LoadFloat(1, 6) ), @@ -44,7 +42,7 @@ public static void FastTreeRegression() // Create the estimator var learningPipeline = reader.MakeNewEstimator() - .Append(r => (r.label, score: regressionContext.Trainers.FastTree( + .Append(r => (r.label, score: mlContext.Regression.Trainers.FastTree( r.label, r.features, numTrees: 100, // try: (int) 20-2000 @@ -55,7 +53,7 @@ public static void FastTreeRegression() ) ); - var cvResults = regressionContext.CrossValidate(data, learningPipeline, r => r.label, numFolds: 5); + var cvResults = mlContext.Regression.CrossValidate(data, learningPipeline, r => r.label, numFolds: 5); var averagedMetrics = ( L1: cvResults.Select(r => r.metrics.L1).Average(), L2: cvResults.Select(r => r.metrics.L2).Average(), diff --git a/docs/samples/Microsoft.ML.Samples/Static/LightGBM.cs b/docs/samples/Microsoft.ML.Samples/Static/LightGBM.cs index ccc6b4115a..c9548b03c5 100644 --- a/docs/samples/Microsoft.ML.Samples/Static/LightGBM.cs +++ b/docs/samples/Microsoft.ML.Samples/Static/LightGBM.cs @@ -22,14 +22,12 @@ public static void LightGbmRegression() // you can open the file to see the data. string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset(); - // Creating the ML.Net IHostEnvironment object, needed for the pipeline - var env = new LocalEnvironment(seed: 0); - - // Creating the ML context, based on the task performed. - var regressionContext = new RegressionContext(env); + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. 
+ var mlContext = new MLContext(); // Creating a data reader, based on the format of the data - var reader = TextLoader.CreateReader(env, c => ( + var reader = TextLoader.CreateReader(mlContext, c => ( label: c.LoadFloat(0), features: c.LoadFloat(1, 6) ), @@ -37,14 +35,14 @@ public static void LightGbmRegression() // Read the data, and leave 10% out, so we can use them for testing var data = reader.Read(new MultiFileSource(dataFile)); - var (trainData, testData) = regressionContext.TrainTestSplit(data, testFraction: 0.1); + var (trainData, testData) = mlContext.Regression.TrainTestSplit(data, testFraction: 0.1); // The predictor that gets produced out of training LightGbmRegressionPredictor pred = null; // Create the estimator var learningPipeline = reader.MakeNewEstimator() - .Append(r => (r.label, score: regressionContext.Trainers.LightGbm( + .Append(r => (r.label, score: mlContext.Regression.Trainers.LightGbm( r.label, r.features, numLeaves: 4, @@ -66,7 +64,7 @@ public static void LightGbmRegression() // Evaluate how the model is doing on the test data var dataWithPredictions = model.Transform(testData); - var metrics = regressionContext.Evaluate(dataWithPredictions, r => r.label, r => r.score); + var metrics = mlContext.Regression.Evaluate(dataWithPredictions, r => r.label, r => r.score); Console.WriteLine($"L1 - {metrics.L1}"); // 4.9669731 Console.WriteLine($"L2 - {metrics.L2}"); // 51.37296 diff --git a/docs/samples/Microsoft.ML.Samples/Static/SDCA.cs b/docs/samples/Microsoft.ML.Samples/Static/SDCA.cs index c86ff28066..f4ac4e9732 100644 --- a/docs/samples/Microsoft.ML.Samples/Static/SDCA.cs +++ b/docs/samples/Microsoft.ML.Samples/Static/SDCA.cs @@ -23,13 +23,10 @@ public static void SdcaRegression() string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset(); // Creating the ML.Net IHostEnvironment object, needed for the pipeline - var env = new LocalEnvironment(seed: 0); - - // Creating the ML context, based on the task performed. 
- var regressionContext = new RegressionContext(env); + var mlContext = new MLContext(); // Creating a data reader, based on the format of the data - var reader = TextLoader.CreateReader(env, c => ( + var reader = TextLoader.CreateReader(mlContext, c => ( label: c.LoadFloat(0), features: c.LoadFloat(1, 6) ), @@ -37,14 +34,14 @@ public static void SdcaRegression() // Read the data, and leave 10% out, so we can use them for testing var data = reader.Read(dataFile); - var (trainData, testData) = regressionContext.TrainTestSplit(data, testFraction: 0.1); + var (trainData, testData) = mlContext.Regression.TrainTestSplit(data, testFraction: 0.1); // The predictor that gets produced out of training LinearRegressionPredictor pred = null; // Create the estimator var learningPipeline = reader.MakeNewEstimator() - .Append(r => (r.label, score: regressionContext.Trainers.Sdca( + .Append(r => (r.label, score: mlContext.Regression.Trainers.Sdca( r.label, r.features, l1Threshold: 0f, @@ -65,7 +62,7 @@ public static void SdcaRegression() // Evaluate how the model is doing on the test data var dataWithPredictions = model.Transform(testData); - var metrics = regressionContext.Evaluate(dataWithPredictions, r => r.label, r => r.score); + var metrics = mlContext.Regression.Evaluate(dataWithPredictions, r => r.label, r => r.score); Console.WriteLine($"L1 - {metrics.L1}"); // 3.7226085 Console.WriteLine($"L2 - {metrics.L2}"); // 24.250636 diff --git a/src/Microsoft.ML.Data/Transforms/NormalizerCatalog.cs b/src/Microsoft.ML.Data/Transforms/NormalizerCatalog.cs index 0ebad5ced4..dde4619f5b 100644 --- a/src/Microsoft.ML.Data/Transforms/NormalizerCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/NormalizerCatalog.cs @@ -24,14 +24,14 @@ public static class NormalizerCatalogExtensions /// /// /// /// /// /// /// /// /// /// diff --git a/src/Microsoft.ML.FastTree/FastTreeStatic.cs b/src/Microsoft.ML.FastTree/FastTreeStatic.cs index 979c962858..3040436ec6 100644 --- 
a/src/Microsoft.ML.FastTree/FastTreeStatic.cs +++ b/src/Microsoft.ML.FastTree/FastTreeStatic.cs @@ -38,7 +38,7 @@ public static class FastTreeRegressionExtensions /// /// /// /// public static Scalar FastTree(this RegressionContext.RegressionTrainers ctx, diff --git a/src/Microsoft.ML.LightGBM/LightGbmStatic.cs b/src/Microsoft.ML.LightGBM/LightGbmStatic.cs index 2607f006ef..3f0ad9847a 100644 --- a/src/Microsoft.ML.LightGBM/LightGbmStatic.cs +++ b/src/Microsoft.ML.LightGBM/LightGbmStatic.cs @@ -37,7 +37,7 @@ public static partial class RegressionTrainers /// /// /// /// public static Scalar LightGbm(this RegressionContext.RegressionTrainers ctx, diff --git a/src/Microsoft.ML.StandardLearners/Standard/SdcaCatalog.cs b/src/Microsoft.ML.StandardLearners/Standard/SdcaCatalog.cs index 35aff21a43..18dcbbad6d 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/SdcaCatalog.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/SdcaCatalog.cs @@ -57,7 +57,7 @@ public static class SdcaBinaryClassificationExtensions /// /// /// /// /// diff --git a/src/Microsoft.ML.StandardLearners/Standard/SdcaStatic.cs b/src/Microsoft.ML.StandardLearners/Standard/SdcaStatic.cs index ecbdc3500c..b62b96fc65 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/SdcaStatic.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/SdcaStatic.cs @@ -37,7 +37,7 @@ public static class SdcaRegressionExtensions /// /// /// /// public static Scalar Sdca(this RegressionContext.RegressionTrainers ctx, diff --git a/src/Microsoft.ML.Transforms/TextTransformCatalog.cs b/src/Microsoft.ML.Transforms/TextTransformCatalog.cs index 952f3e01c2..8a1d93fdad 100644 --- a/src/Microsoft.ML.Transforms/TextTransformCatalog.cs +++ b/src/Microsoft.ML.Transforms/TextTransformCatalog.cs @@ -21,14 +21,14 @@ public static class TextTransformCatalog /// /// /// /// /// /// /// /// /// ///