From 7557a835d19a1465c47f4f03d380c980b1d31825 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Wed, 6 Feb 2019 14:57:34 -0800 Subject: [PATCH 1/6] Updating docstrings --- .../Dynamic/Transforms/SelectColumns.cs | 41 +++++++++++-------- .../Transforms/ExtensionsCatalog.cs | 33 ++++++++++++--- 2 files changed, 50 insertions(+), 24 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/SelectColumns.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/SelectColumns.cs index 2567f036f9..1309b73739 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/SelectColumns.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/SelectColumns.cs @@ -1,5 +1,4 @@ using System; -using System.Collections.Generic; using Microsoft.ML.Data; namespace Microsoft.ML.Samples.Dynamic @@ -13,25 +12,31 @@ public static void Example() var mlContext = new MLContext(); // Get a small dataset as an IEnumerable and them read it as ML.NET's data type. - IEnumerable data = SamplesUtils.DatasetUtils.GetInfertData(); - var trainData = mlContext.Data.ReadFromEnumerable(data); + var enumerableData = SamplesUtils.DatasetUtils.GetInfertData(); + var data = mlContext.Data.ReadFromEnumerable(enumerableData); - // Preview of the data. - // - // Age Case Education induced parity pooled.stratum row_num ... - // 26.0 1.0 0-5yrs 1.0 6.0 3.0 1.0 ... - // 42.0 1.0 0-5yrs 1.0 1.0 1.0 2.0 ... - // 39.0 1.0 0-5yrs 2.0 6.0 4.0 3.0 ... - // 34.0 1.0 0-5yrs 2.0 4.0 2.0 4.0 ... - // 35.0 1.0 6-11yrs 1.0 3.0 32.0 5.0 ... 
+ // Before transformation, take a look at the dataset + Console.WriteLine($"Age\tCase\tEducation\tInduced\tParity\tPooledStratum"); + foreach (var row in enumerableData) + { + Console.WriteLine($"{row.Age}\t{row.Case}\t{row.Education}\t{row.Induced}\t{row.Parity}\t{row.PooledStratum}"); + } + Console.WriteLine(); + // Expected output: + // Age Case Education Induced Parity PooledStratum + // 26 1 0 - 5yrs 1 6 3 + // 42 1 0 - 5yrs 1 1 1 + // 39 1 12 + yrs 2 6 4 + // 34 1 0 - 5yrs 2 4 2 + // 35 1 6 - 11yrs 1 3 32 // Select a subset of columns to keep. - var pipeline = mlContext.Transforms.SelectColumns(new string[] { "Age", "Education" }); + var pipeline = mlContext.Transforms.SelectColumns("Age", "Education"); // Now we can transform the data and look at the output to confirm the behavior of CopyColumns. // Don't forget that this operation doesn't actually evaluate data until we read the data below, // as transformations are lazy in ML.NET. - var transformedData = pipeline.Fit(trainData).Transform(trainData); + var transformedData = pipeline.Fit(data).Transform(data); // Print the number of columns in the schema Console.WriteLine($"There are {transformedData.Schema.Count} columns in the dataset."); @@ -51,11 +56,11 @@ public static void Example() // Expected output: // Age and Education columns obtained post-transformation. 
- // Age: 26 Education: 0 - 5yrs - // Age: 42 Education: 0 - 5yrs - // Age: 39 Education: 0 - 5yrs - // Age: 34 Education: 0 - 5yrs - // Age: 35 Education: 6 - 11yrs + // Age: 26 Education: 0-5yrs + // Age: 42 Education: 0-5yrs + // Age: 39 Education: 12+yrs + // Age: 34 Education: 0-5yrs + // Age: 35 Education: 6-11yrs } private class SampleInfertDataTransformed diff --git a/src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs b/src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs index 80f9a67f10..618cb1c7ef 100644 --- a/src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using Microsoft.Data.DataView; using Microsoft.ML.Data; using Microsoft.ML.Transforms; @@ -67,9 +68,9 @@ public static ColumnConcatenatingEstimator Concatenate(this TransformsCatalog ca /// /// is commonly used to remove unwanted columns from the schema if the dataset is going to be serialized or /// written out to a file. It is not actually necessary to drop unused columns before training or - /// performing transforms, as IDataView's lazy evaluation won't actually materialize those columns. + /// performing transforms, as 's lazy evaluation won't actually materialize those columns. /// In the case of serialization, every column in the schema will be written out. If you have columns - /// that you don't want to save, you can use DropColumns to remove them from the schema. + /// that you don't want to save, you can use to remove them from the schema. /// /// The transform's catalog. /// The array of column names to drop. 
@@ -84,11 +85,11 @@ public static ColumnSelectingEstimator DropColumns(this TransformsCatalog catalo => ColumnSelectingEstimator.DropColumns(CatalogUtils.GetEnvironment(catalog), columnsToDrop); /// - /// ColumnSelectingEstimator is used to select a list of columns that user wants to keep from a given input. + /// Select a list of columns to keep in a given . /// /// /// - /// operates on the schema of an input IDataView, + /// operates on the schema of an input , /// either dropping unselected columns from the schema or keeping them but marking them as hidden in the schema. Keeping columns hidden /// is recommended when it is necessary to understand how the inputs of a pipeline map to outputs of the pipeline. This feature /// is useful, for example, in debugging a pipeline of transforms by allowing you to print out results from the middle of the pipeline. @@ -97,7 +98,7 @@ public static ColumnSelectingEstimator DropColumns(this TransformsCatalog catalo /// /// The transform's catalog. /// The array of column names to keep. - /// If true will keep hidden columns and false will remove hidden columns. + /// If will keep hidden columns and will remove hidden columns. /// /// /// public static ColumnSelectingEstimator SelectColumns(this TransformsCatalog catalog, string[] keepColumns, - bool keepHidden = ColumnSelectingTransformer.Defaults.KeepHidden) + bool keepHidden) => new ColumnSelectingEstimator(CatalogUtils.GetEnvironment(catalog), keepColumns, null, keepHidden, ColumnSelectingTransformer.Defaults.IgnoreMissing); + + /// + /// Select a list of columns to keep in a given . + /// + /// + /// + /// operates on the schema of an input , dropping unselected columns from the schema. + /// + /// + /// The transform's catalog. + /// The array of column names to keep. 
+ /// + /// + /// + /// + /// + public static ColumnSelectingEstimator SelectColumns(this TransformsCatalog catalog, + params string[] keepColumns) => catalog.SelectColumns(keepColumns, false); } } From 4829f7a568ae44f8167cccd6e1e7f09d6184341b Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Tue, 12 Feb 2019 15:46:11 -0800 Subject: [PATCH 2/6] Adding functional tests for DataIO --- test/Microsoft.ML.Functional.Tests/Common.cs | 33 +- test/Microsoft.ML.Functional.Tests/DataIO.cs | 353 +++++++++++++++++++ 2 files changed, 384 insertions(+), 2 deletions(-) create mode 100644 test/Microsoft.ML.Functional.Tests/DataIO.cs diff --git a/test/Microsoft.ML.Functional.Tests/Common.cs b/test/Microsoft.ML.Functional.Tests/Common.cs index 29088298d3..e459170c01 100644 --- a/test/Microsoft.ML.Functional.Tests/Common.cs +++ b/test/Microsoft.ML.Functional.Tests/Common.cs @@ -2,10 +2,10 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. 
+using System; +using System.Linq; using Microsoft.Data.DataView; using Microsoft.ML.Data; -using Microsoft.ML.SamplesUtils; -using Microsoft.ML.Trainers.HalLearners; using Xunit; namespace Microsoft.ML.Functional.Tests @@ -20,5 +20,34 @@ public static void CheckMetrics(RegressionMetrics metrics) Assert.True(metrics.L2 >= 0); Assert.True(metrics.RSquared <= 1); } + + public static void AssertEqual(float[] array1, float[] array2) + { + Assert.NotNull(array1); + Assert.NotNull(array2); + Assert.Equal(array1.Length, array2.Length); + + for (int i = 0; i < array1.Length; i++) + Assert.Equal(array1[i], array2[i]); + } + + public static void AssertEqual(Schema schema1, Schema schema2) + { + Assert.NotNull(schema1); + Assert.NotNull(schema2); + + Assert.Equal(schema1.Count(), schema2.Count()); + + foreach (var schemaPair in schema1.Zip(schema2, Tuple.Create)) + { + Assert.Equal(schemaPair.Item1.Name, schemaPair.Item2.Name); + Assert.Equal(schemaPair.Item1.Index, schemaPair.Item2.Index); + Assert.Equal(schemaPair.Item1.IsHidden, schemaPair.Item2.IsHidden); + // Can probably do a better comparison of Metadata + AssertEqual(schemaPair.Item1.Metadata.Schema, schemaPair.Item1.Metadata.Schema); + Assert.True((schemaPair.Item1.Type == schemaPair.Item2.Type) || + (schemaPair.Item1.Type.RawType == schemaPair.Item2.Type.RawType)); + } + } } } diff --git a/test/Microsoft.ML.Functional.Tests/DataIO.cs b/test/Microsoft.ML.Functional.Tests/DataIO.cs new file mode 100644 index 0000000000..95e2d4877b --- /dev/null +++ b/test/Microsoft.ML.Functional.Tests/DataIO.cs @@ -0,0 +1,353 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using Microsoft.Data.DataView; +using Microsoft.ML.Data; +using Microsoft.ML.TestFramework; +using Xunit; +using Xunit.Abstractions; + +namespace Microsoft.ML.Functional.Tests +{ + /// + /// Test data input and output formats + /// + public class DataIO : BaseTestClass + { + // Separators to test + private readonly char[] _separators; + + public DataIO(ITestOutputHelper output) : base(output) + { + // SaveAsText expects a "space, tab, comma, semicolon, or bar" + _separators = new char[] { ' ', '\t', ',', ';', '|', }; + } + + /// + /// Read from Enumerable: In-Memory objects can be read as enumerables into an IDataView. + /// + [Fact] + public void ReadFromIEnumerable() + { + var mlContext = new MLContext(seed: 1, conc: 1); + + // Read the dataset from an enumerable + var data = mlContext.Data.ReadFromEnumerable(GenerateToyDataset()); + + ValidateToyDataset(data); + } + + /// + /// Export to Enumerable: IDataViews can be exported as enumerables of a class. + /// + [Fact] + public void ExportToIEnumerable() + { + var mlContext = new MLContext(seed: 1, conc: 1); + + // Read the dataset from an enumerable + var enumerableBefore = GenerateToyDataset(); + var data = mlContext.Data.ReadFromEnumerable(enumerableBefore); + + // Export back to an enumerable + var enumerableAfter = mlContext.CreateEnumerable(data, true); + + AssertEqual(enumerableBefore, enumerableAfter); + } + + /// + /// Write to and read from a delimited file: Any DataKind can be written to and read from a delimited file. + /// + /// + /// Tests the roundtrip through a file using explicit schematization. 
+ /// + [Fact] + public void WriteToAndReadFromADelimetedFile() + { + var mlContext = new MLContext(seed: 1, conc: 1); + + var dataBefore = mlContext.Data.ReadFromEnumerable(GenerateToyDataset()); + + foreach (var separator in _separators) + { + // Serialize a dataset with a known schema to a file + var filePath = SerializeDatasetToFile(mlContext, dataBefore, separator); + var dataAfter = ToyDataset.GetTextLoader(mlContext, separator).Read(filePath); + ValidateToyDataset(dataAfter); + ToyDatasetsAreEqual(mlContext, dataBefore, dataAfter); + } + } + + /// + /// Write to and read from a delimited file: Schematized data of any DataKind can be read from a delimited file. + /// + /// + /// Tests the roundtrip through a file using schema inference. + /// + [Fact] + public void WriteToAndReadASchemaFromADelimitedFile() + { + var mlContext = new MLContext(seed: 1, conc: 1); + + var dataBefore = mlContext.Data.ReadFromEnumerable(GenerateToyDataset()); + + foreach (var separator in _separators) + { + // Serialize a dataset with a known schema to a file + var filePath = SerializeDatasetToFile(mlContext, dataBefore, separator); + var dataAfter = mlContext.Data.ReadFromTextFile(filePath, hasHeader: true, separatorChar: separator); + ValidateToyDataset(dataAfter); + ToyDatasetsAreEqual(mlContext, dataBefore, dataAfter); + } + } + + /// + /// Write to and read from a binary file: Schematized data of any DataKind can be read from a binary file. 
+ /// + [Fact] + public void WriteAndReadAFromABinaryFile() + { + var mlContext = new MLContext(seed: 1, conc: 1); + + var dataBefore = mlContext.Data.ReadFromEnumerable(GenerateToyDataset()); + + // Serialize a dataset with a known schema to a file + var filePath = SerializeDatasetToBinaryFile(mlContext, dataBefore); + var dataAfter = mlContext.Data.ReadFromBinary(filePath); + ValidateToyDataset(dataAfter); + ToyDatasetsAreEqual(mlContext, dataBefore, dataAfter); + } + + #region FileIO + private string SerializeDatasetToFile(MLContext mlContext, IDataView data, char separator) + { + var filePath = GetOutputPath(Path.GetRandomFileName()); + using (var file = File.Create(filePath)) + mlContext.Data.SaveAsText(data, file, separatorChar: separator, headerRow: true); + + return filePath; + } + + private string SerializeDatasetToBinaryFile(MLContext mlContext, IDataView data) + { + var filePath = GetOutputPath(Path.GetRandomFileName()); + using (var file = File.Create(filePath)) + mlContext.Data.SaveAsBinary(data, file); + + return filePath; + } + #endregion + + #region ToyDataset + private void ToyDatasetsAreEqual(MLContext mlContext, IDataView data1, IDataView data2) + { + // Validate that the two Schemas are the same + Common.AssertEqual(data1.Schema, data2.Schema); + + // Define how to serialize the IDataView to objects + var enumerable1 = mlContext.CreateEnumerable(data1, true); + var enumerable2 = mlContext.CreateEnumerable(data2, true); + + AssertEqual(enumerable1, enumerable2); + } + + private void AssertEqual(IEnumerable data1, IEnumerable data2) + { + Assert.NotNull(data1); + Assert.NotNull(data2); + Assert.Equal(data1.Count(), data2.Count()); + + foreach (var rowPair in data1.Zip(data2, Tuple.Create)) + { + AssertEqual(rowPair.Item1, rowPair.Item2); + } + } + + private void ValidateToyDataset(IDataView toyDataset) + { + var toyClassProperties = typeof(ToyDataset).GetProperties(); + + // Check that the schema is of the right size + Assert.Equal(17, 
toyDataset.Schema.Count); + + // Create a lookup table for the types and counts of all properties + var types = new Dictionary(); + var counts = new Dictionary(); + foreach (var property in toyClassProperties) + { + if (!property.PropertyType.IsArray) + types[property.Name] = property.PropertyType; + else + { + // Construct a VBuffer type for the array + var vBufferType = typeof(VBuffer<>); + Type[] typeArgs = { property.PropertyType.GetElementType() }; + Activator.CreateInstance(property.PropertyType.GetElementType()); + types[property.Name] = vBufferType.MakeGenericType(typeArgs); + } + + counts[property.Name] = 0; + } + + foreach (var column in toyDataset.Schema) + { + Assert.True(types.ContainsKey(column.Name)); + Assert.Equal(1, ++counts[column.Name]); + Assert.Equal(types[column.Name], column.Type.RawType); + } + + // Make sure we didn't miss any columns + foreach (var value in counts.Values) + Assert.Equal(1, value); + } + + private IEnumerable GenerateToyDataset(int numExamples = 5, int seed = 1) + { + var rng = new Random(seed); + for (int i = 0; i < numExamples; i++) + { + yield return new ToyDataset + { + Label = rng.NextDouble() > 0.5, + Features = new float[] { + (float)rng.NextDouble(), + (float)rng.NextDouble(), + (float)rng.NextDouble(), + (float)rng.NextDouble(), + (float)rng.NextDouble() + }, + I1 = (sbyte)rng.Next(), + U1 = (byte)rng.Next(), + I2 = (short)rng.Next(), + U2 = (ushort)rng.Next(), + I4 = rng.Next(), + U4 = (uint)rng.Next(), + I8 = (long)rng.Next(), + U8 = (ulong)rng.Next(), + R4 = (float)rng.NextDouble(), + R8 = (double)rng.NextDouble(), + Tx = GetRandomRomChar(rng), + Ts = TimeSpan.FromSeconds(rng.NextDouble() * (1+rng.Next())), + Dt = DateTime.FromOADate(rng.Next(657435, 2958465)), + Dz = DateTimeOffset.FromUnixTimeSeconds((long)(rng.NextDouble() * (1 + rng.Next()))), + Ug = new RowId((ulong)rng.Next(), (ulong)rng.Next()) + }; + } + } + + private ReadOnlyMemory GetRandomRomChar(Random rng, int length = 10) + { + var chars = new 
char[length]; + for (int i = 0; i < length; i++) + chars[i] = (char)(32 + rng.Next(0, 94)); // From space to ~ + return new ReadOnlyMemory(chars); + } + + private sealed class ToyDataset + { + [LoadColumn(0)] + public bool Label { get; set; } + + [LoadColumn(1, 5), VectorType(5)] + public float[] Features { get; set; } + + [LoadColumn(6)] + public sbyte I1 { get; set; } + + [LoadColumn(7)] + public byte U1 { get; set; } + + [LoadColumn(8)] + public short I2 { get; set; } + + [LoadColumn(9)] + public ushort U2 { get; set; } + + [LoadColumn(10)] + public int I4 { get; set; } + + [LoadColumn(11)] + public uint U4 { get; set; } + + [LoadColumn(12)] + public long I8 { get; set; } + + [LoadColumn(13)] + public ulong U8 { get; set; } + + [LoadColumn(14)] + public float R4 { get; set; } + + [LoadColumn(15)] + public double R8 { get; set; } + + [LoadColumn(16)] + public ReadOnlyMemory Tx { get; set; } + + [LoadColumn(17)] + public TimeSpan Ts { get; set; } + + [LoadColumn(18)] + public DateTime Dt { get; set; } + + [LoadColumn(19)] + public DateTimeOffset Dz { get; set; } + + [LoadColumn(20)] + public RowId Ug { get; set; } + + public static TextLoader GetTextLoader(MLContext mlContext, char separator) + { + return mlContext.Data.CreateTextLoader( + new[] { + new TextLoader.Column("Label", DataKind.Bool, 0), + new TextLoader.Column("Features", DataKind.R4, 1, 5), + new TextLoader.Column("I1", DataKind.I1, 6), + new TextLoader.Column("U1", DataKind.U1, 7), + new TextLoader.Column("I2", DataKind.I2, 8), + new TextLoader.Column("U2", DataKind.U2, 9), + new TextLoader.Column("I4", DataKind.I4, 10), + new TextLoader.Column("U4", DataKind.U4, 11), + new TextLoader.Column("I8", DataKind.I8, 12), + new TextLoader.Column("U8", DataKind.U8, 13), + new TextLoader.Column("R4", DataKind.R4, 14), + new TextLoader.Column("R8", DataKind.R8, 15), + new TextLoader.Column("Tx", DataKind.TX, 16), + new TextLoader.Column("Ts", DataKind.TS, 17), + new TextLoader.Column("Dt", DataKind.DT, 18), + 
new TextLoader.Column("Dz", DataKind.DZ, 19), + new TextLoader.Column("Ug", DataKind.UG, 20), + }, + hasHeader: true, + separatorChar: separator); + } + } + + private static void AssertEqual(ToyDataset toyDataset1, ToyDataset toyDataset2) + { + Assert.Equal(toyDataset1.Label, toyDataset2.Label); + Common.AssertEqual(toyDataset1.Features, toyDataset2.Features); + Assert.Equal(toyDataset1.I1, toyDataset2.I1); + Assert.Equal(toyDataset1.U1, toyDataset2.U1); + Assert.Equal(toyDataset1.I2, toyDataset2.I2); + Assert.Equal(toyDataset1.U2, toyDataset2.U2); + Assert.Equal(toyDataset1.I4, toyDataset2.I4); + Assert.Equal(toyDataset1.U4, toyDataset2.U4); + Assert.Equal(toyDataset1.I8, toyDataset2.I8); + Assert.Equal(toyDataset1.U8, toyDataset2.U8); + Assert.Equal(toyDataset1.R4, toyDataset2.R4); + Assert.Equal(toyDataset1.R8, toyDataset2.R8); + Assert.Equal(toyDataset1.Tx.ToString(), toyDataset2.Tx.ToString()); + Assert.True(toyDataset1.Ts.Equals(toyDataset2.Ts)); + Assert.True(toyDataset1.Dt.Equals(toyDataset2.Dt)); + Assert.True(toyDataset1.Dz.Equals(toyDataset2.Dz)); + Assert.True(toyDataset1.Ug.Equals(toyDataset2.Ug)); + } + #endregion + } +} From 4da79e2a811337080d8255b6cd46c5724c31d630 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Tue, 12 Feb 2019 15:48:28 -0800 Subject: [PATCH 3/6] Remove Tests FileSaver API test. --- .../Api/Estimators/FileBasedSavingOfData.cs | 49 ------------------- 1 file changed, 49 deletions(-) delete mode 100644 test/Microsoft.ML.Tests/Scenarios/Api/Estimators/FileBasedSavingOfData.cs diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/FileBasedSavingOfData.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/FileBasedSavingOfData.cs deleted file mode 100644 index f1abe56543..0000000000 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/FileBasedSavingOfData.cs +++ /dev/null @@ -1,49 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. 
-// See the LICENSE file in the project root for more information. - -using System.IO; -using Microsoft.ML.Data; -using Microsoft.ML.Data.IO; -using Microsoft.ML.RunTests; -using Microsoft.ML.Trainers; -using Xunit; - -namespace Microsoft.ML.Tests.Scenarios.Api -{ - public partial class ApiScenariosTests - { - /// - /// File-based saving of data: Come up with transform pipeline. Transform training and - /// test data, and save the featurized data to some file, using the .idv format. - /// Train and evaluate multiple models over that pre-featurized data. (Useful for - /// sweeping scenarios, where you are training many times on the same data, - /// and don't necessarily want to transform it every single time.) - /// - [Fact] - void FileBasedSavingOfData() - { - - var ml = new MLContext(seed: 1, conc: 1); - var src = new MultiFileSource(GetDataPath(TestDatasets.Sentiment.trainFilename)); - var trainData = ml.Data.CreateTextLoader(TestDatasets.Sentiment.GetLoaderColumns(), hasHeader: true) - .Append(ml.Transforms.Text.FeaturizeText("Features", "SentimentText")) - .Fit(src).Read(src); - - var path = DeleteOutputPath("i.idv"); - using (var file = File.Create(path)) - { - var saver = new BinarySaver(ml, new BinarySaver.Arguments()); - using (var ch = ((IHostEnvironment)ml).Start("SaveData")) - DataSaverUtils.SaveDataView(ch, saver, trainData, file); - } - - var trainer = ml.BinaryClassification.Trainers.StochasticDualCoordinateAscent( - new SdcaBinaryTrainer.Options { NumThreads = 1 }); - var loadedTrainData = new BinaryLoader(ml, new BinaryLoader.Arguments(), new MultiFileSource(path)); - - // Train. 
- var model = trainer.Fit(loadedTrainData); - } - } -} From aaf2a91d7b8ed7a8a24d5967f7a7eea349fe340f Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Tue, 12 Feb 2019 17:40:26 -0800 Subject: [PATCH 4/6] Addressing PR comments --- test/Microsoft.ML.Functional.Tests/Common.cs | 137 +++++++++- test/Microsoft.ML.Functional.Tests/DataIO.cs | 258 ++---------------- .../Datasets/AllTypes.cs | 148 ++++++++++ 3 files changed, 300 insertions(+), 243 deletions(-) create mode 100644 test/Microsoft.ML.Functional.Tests/Datasets/AllTypes.cs diff --git a/test/Microsoft.ML.Functional.Tests/Common.cs b/test/Microsoft.ML.Functional.Tests/Common.cs index e459170c01..c6515b3dca 100644 --- a/test/Microsoft.ML.Functional.Tests/Common.cs +++ b/test/Microsoft.ML.Functional.Tests/Common.cs @@ -3,24 +3,86 @@ // See the LICENSE file in the project root for more information. using System; +using System.Collections.Generic; using System.Linq; using Microsoft.Data.DataView; using Microsoft.ML.Data; +using Microsoft.ML.Functional.Tests.Datasets; using Xunit; namespace Microsoft.ML.Functional.Tests { internal static class Common { - public static void CheckMetrics(RegressionMetrics metrics) + /// + /// Assert that an rows are of . + /// + /// An . + public static void AssertAllTypesDataset(IDataView allTypesDataset) { - // Perform sanity checks on the metrics - Assert.True(metrics.Rms >= 0); - Assert.True(metrics.L1 >= 0); - Assert.True(metrics.L2 >= 0); - Assert.True(metrics.RSquared <= 1); + var toyClassProperties = typeof(AllTypes).GetProperties(); + + // Check that the schema is of the right size. + Assert.Equal(toyClassProperties.Length, allTypesDataset.Schema.Count); + + // Create a lookup table for the types and counts of all properties. + var types = new Dictionary(); + var counts = new Dictionary(); + foreach (var property in toyClassProperties) + { + if (!property.PropertyType.IsArray) + types[property.Name] = property.PropertyType; + else + { + // Construct a VBuffer type for the array. 
+ var vBufferType = typeof(VBuffer<>); + Type[] typeArgs = { property.PropertyType.GetElementType() }; + Activator.CreateInstance(property.PropertyType.GetElementType()); + types[property.Name] = vBufferType.MakeGenericType(typeArgs); + } + + counts[property.Name] = 0; + } + + foreach (var column in allTypesDataset.Schema) + { + Assert.True(types.ContainsKey(column.Name)); + Assert.Equal(1, ++counts[column.Name]); + Assert.Equal(types[column.Name], column.Type.RawType); + } + + // Make sure we didn't miss any columns. + foreach (var value in counts.Values) + Assert.Equal(1, value); } + /// + /// Assert that two datasets are equal. + /// + /// The ML Context. + /// A of + /// A of + public static void AssertAllTypesDatasetsAreEqual(MLContext mlContext, IDataView data1, IDataView data2) + { + // Confirm that they are both of the proper row type. + AssertAllTypesDataset(data1); + AssertAllTypesDataset(data2); + + // Validate that the two Schemas are the same. + Common.AssertEqual(data1.Schema, data2.Schema); + + // Define how to serialize the IDataView to objects. + var enumerable1 = mlContext.CreateEnumerable(data1, true); + var enumerable2 = mlContext.CreateEnumerable(data2, true); + + AssertEqual(enumerable1, enumerable2); + } + + /// + /// Assert that two float arrays are equal. + /// + /// An array of floats. + /// An array of floats. public static void AssertEqual(float[] array1, float[] array2) { Assert.NotNull(array1); @@ -31,6 +93,11 @@ public static void AssertEqual(float[] array1, float[] array2) Assert.Equal(array1[i], array2[i]); } + /// + /// Assert that two objects are equal. + /// + /// A object. + /// A object. 
public static void AssertEqual(Schema schema1, Schema schema2) { Assert.NotNull(schema1); @@ -43,11 +110,67 @@ public static void AssertEqual(Schema schema1, Schema schema2) Assert.Equal(schemaPair.Item1.Name, schemaPair.Item2.Name); Assert.Equal(schemaPair.Item1.Index, schemaPair.Item2.Index); Assert.Equal(schemaPair.Item1.IsHidden, schemaPair.Item2.IsHidden); - // Can probably do a better comparison of Metadata + // Can probably do a better comparison of Metadata. AssertEqual(schemaPair.Item1.Metadata.Schema, schemaPair.Item1.Metadata.Schema); Assert.True((schemaPair.Item1.Type == schemaPair.Item2.Type) || (schemaPair.Item1.Type.RawType == schemaPair.Item2.Type.RawType)); } } + + /// + /// Assert than two enumerables are equal. + /// + /// An enumerable of + /// An enumerable of + public static void AssertEqual(IEnumerable data1, IEnumerable data2) + { + Assert.NotNull(data1); + Assert.NotNull(data2); + Assert.Equal(data1.Count(), data2.Count()); + + foreach (var rowPair in data1.Zip(data2, Tuple.Create)) + { + AssertEqual(rowPair.Item1, rowPair.Item2); + } + } + + /// + /// Assert that two AllTypes datasets are equal. + /// + /// An . + /// An . 
+ public static void AssertEqual(AllTypes allTypes1, AllTypes allTypes2) + { + Assert.Equal(allTypes1.Label, allTypes2.Label); + Common.AssertEqual(allTypes1.Features, allTypes2.Features); + Assert.Equal(allTypes1.I1, allTypes2.I1); + Assert.Equal(allTypes1.U1, allTypes2.U1); + Assert.Equal(allTypes1.I2, allTypes2.I2); + Assert.Equal(allTypes1.U2, allTypes2.U2); + Assert.Equal(allTypes1.I4, allTypes2.I4); + Assert.Equal(allTypes1.U4, allTypes2.U4); + Assert.Equal(allTypes1.I8, allTypes2.I8); + Assert.Equal(allTypes1.U8, allTypes2.U8); + Assert.Equal(allTypes1.R4, allTypes2.R4); + Assert.Equal(allTypes1.R8, allTypes2.R8); + Assert.Equal(allTypes1.Tx.ToString(), allTypes2.Tx.ToString()); + Assert.True(allTypes1.Ts.Equals(allTypes2.Ts)); + Assert.True(allTypes1.Dt.Equals(allTypes2.Dt)); + Assert.True(allTypes1.Dz.Equals(allTypes2.Dz)); + Assert.True(allTypes1.Ug.Equals(allTypes2.Ug)); + } + + /// + /// Check that a object is valid. + /// + /// The metrics object. + public static void CheckMetrics(RegressionMetrics metrics) + { + // Perform sanity checks on the metrics. + Assert.True(metrics.Rms >= 0); + Assert.True(metrics.L1 >= 0); + Assert.True(metrics.L2 >= 0); + Assert.True(metrics.RSquared <= 1); + } } } diff --git a/test/Microsoft.ML.Functional.Tests/DataIO.cs b/test/Microsoft.ML.Functional.Tests/DataIO.cs index 95e2d4877b..8b50d50102 100644 --- a/test/Microsoft.ML.Functional.Tests/DataIO.cs +++ b/test/Microsoft.ML.Functional.Tests/DataIO.cs @@ -2,12 +2,10 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. 
-using System; -using System.Collections.Generic; using System.IO; -using System.Linq; using Microsoft.Data.DataView; using Microsoft.ML.Data; +using Microsoft.ML.Functional.Tests.Datasets; using Microsoft.ML.TestFramework; using Xunit; using Xunit.Abstractions; @@ -15,7 +13,7 @@ namespace Microsoft.ML.Functional.Tests { /// - /// Test data input and output formats + /// Test data input and output formats. /// public class DataIO : BaseTestClass { @@ -24,7 +22,7 @@ public class DataIO : BaseTestClass public DataIO(ITestOutputHelper output) : base(output) { - // SaveAsText expects a "space, tab, comma, semicolon, or bar" + // SaveAsText expects a "space, tab, comma, semicolon, or bar". _separators = new char[] { ' ', '\t', ',', ';', '|', }; } @@ -36,10 +34,10 @@ public void ReadFromIEnumerable() { var mlContext = new MLContext(seed: 1, conc: 1); - // Read the dataset from an enumerable - var data = mlContext.Data.ReadFromEnumerable(GenerateToyDataset()); + // Read the dataset from an enumerable. + var data = mlContext.Data.ReadFromEnumerable(AllTypes.GenerateDataset()); - ValidateToyDataset(data); + Common.AssertAllTypesDataset(data); } /// @@ -50,14 +48,14 @@ public void ExportToIEnumerable() { var mlContext = new MLContext(seed: 1, conc: 1); - // Read the dataset from an enumerable - var enumerableBefore = GenerateToyDataset(); + // Read the dataset from an enumerable. + var enumerableBefore = AllTypes.GenerateDataset(); var data = mlContext.Data.ReadFromEnumerable(enumerableBefore); - // Export back to an enumerable - var enumerableAfter = mlContext.CreateEnumerable(data, true); + // Export back to an enumerable. 
+ var enumerableAfter = mlContext.CreateEnumerable(data, true); - AssertEqual(enumerableBefore, enumerableAfter); + Common.AssertEqual(enumerableBefore, enumerableAfter); } /// @@ -71,15 +69,14 @@ public void WriteToAndReadFromADelimetedFile() { var mlContext = new MLContext(seed: 1, conc: 1); - var dataBefore = mlContext.Data.ReadFromEnumerable(GenerateToyDataset()); + var dataBefore = mlContext.Data.ReadFromEnumerable(AllTypes.GenerateDataset()); foreach (var separator in _separators) { - // Serialize a dataset with a known schema to a file + // Serialize a dataset with a known schema to a file. var filePath = SerializeDatasetToFile(mlContext, dataBefore, separator); - var dataAfter = ToyDataset.GetTextLoader(mlContext, separator).Read(filePath); - ValidateToyDataset(dataAfter); - ToyDatasetsAreEqual(mlContext, dataBefore, dataAfter); + var dataAfter = AllTypes.GetTextLoader(mlContext, separator).Read(filePath); + Common.AssertAllTypesDatasetsAreEqual(mlContext, dataBefore, dataAfter); } } @@ -94,15 +91,14 @@ public void WriteToAndReadASchemaFromADelimitedFile() { var mlContext = new MLContext(seed: 1, conc: 1); - var dataBefore = mlContext.Data.ReadFromEnumerable(GenerateToyDataset()); + var dataBefore = mlContext.Data.ReadFromEnumerable(AllTypes.GenerateDataset()); foreach (var separator in _separators) { - // Serialize a dataset with a known schema to a file + // Serialize a dataset with a known schema to a file. 
var filePath = SerializeDatasetToFile(mlContext, dataBefore, separator); - var dataAfter = mlContext.Data.ReadFromTextFile(filePath, hasHeader: true, separatorChar: separator); - ValidateToyDataset(dataAfter); - ToyDatasetsAreEqual(mlContext, dataBefore, dataAfter); + var dataAfter = mlContext.Data.ReadFromTextFile(filePath, hasHeader: true, separatorChar: separator); + Common.AssertAllTypesDatasetsAreEqual(mlContext, dataBefore, dataAfter); } } @@ -114,16 +110,14 @@ public void WriteAndReadAFromABinaryFile() { var mlContext = new MLContext(seed: 1, conc: 1); - var dataBefore = mlContext.Data.ReadFromEnumerable(GenerateToyDataset()); + var dataBefore = mlContext.Data.ReadFromEnumerable(AllTypes.GenerateDataset()); - // Serialize a dataset with a known schema to a file + // Serialize a dataset with a known schema to a file. var filePath = SerializeDatasetToBinaryFile(mlContext, dataBefore); var dataAfter = mlContext.Data.ReadFromBinary(filePath); - ValidateToyDataset(dataAfter); - ToyDatasetsAreEqual(mlContext, dataBefore, dataAfter); + Common.AssertAllTypesDatasetsAreEqual(mlContext, dataBefore, dataAfter); } - #region FileIO private string SerializeDatasetToFile(MLContext mlContext, IDataView data, char separator) { var filePath = GetOutputPath(Path.GetRandomFileName()); @@ -141,213 +135,5 @@ private string SerializeDatasetToBinaryFile(MLContext mlContext, IDataView data) return filePath; } - #endregion - - #region ToyDataset - private void ToyDatasetsAreEqual(MLContext mlContext, IDataView data1, IDataView data2) - { - // Validate that the two Schemas are the same - Common.AssertEqual(data1.Schema, data2.Schema); - - // Define how to serialize the IDataView to objects - var enumerable1 = mlContext.CreateEnumerable(data1, true); - var enumerable2 = mlContext.CreateEnumerable(data2, true); - - AssertEqual(enumerable1, enumerable2); - } - - private void AssertEqual(IEnumerable data1, IEnumerable data2) - { - Assert.NotNull(data1); - Assert.NotNull(data2); - 
Assert.Equal(data1.Count(), data2.Count()); - - foreach (var rowPair in data1.Zip(data2, Tuple.Create)) - { - AssertEqual(rowPair.Item1, rowPair.Item2); - } - } - - private void ValidateToyDataset(IDataView toyDataset) - { - var toyClassProperties = typeof(ToyDataset).GetProperties(); - - // Check that the schema is of the right size - Assert.Equal(17, toyDataset.Schema.Count); - - // Create a lookup table for the types and counts of all properties - var types = new Dictionary(); - var counts = new Dictionary(); - foreach (var property in toyClassProperties) - { - if (!property.PropertyType.IsArray) - types[property.Name] = property.PropertyType; - else - { - // Construct a VBuffer type for the array - var vBufferType = typeof(VBuffer<>); - Type[] typeArgs = { property.PropertyType.GetElementType() }; - Activator.CreateInstance(property.PropertyType.GetElementType()); - types[property.Name] = vBufferType.MakeGenericType(typeArgs); - } - - counts[property.Name] = 0; - } - - foreach (var column in toyDataset.Schema) - { - Assert.True(types.ContainsKey(column.Name)); - Assert.Equal(1, ++counts[column.Name]); - Assert.Equal(types[column.Name], column.Type.RawType); - } - - // Make sure we didn't miss any columns - foreach (var value in counts.Values) - Assert.Equal(1, value); - } - - private IEnumerable GenerateToyDataset(int numExamples = 5, int seed = 1) - { - var rng = new Random(seed); - for (int i = 0; i < numExamples; i++) - { - yield return new ToyDataset - { - Label = rng.NextDouble() > 0.5, - Features = new float[] { - (float)rng.NextDouble(), - (float)rng.NextDouble(), - (float)rng.NextDouble(), - (float)rng.NextDouble(), - (float)rng.NextDouble() - }, - I1 = (sbyte)rng.Next(), - U1 = (byte)rng.Next(), - I2 = (short)rng.Next(), - U2 = (ushort)rng.Next(), - I4 = rng.Next(), - U4 = (uint)rng.Next(), - I8 = (long)rng.Next(), - U8 = (ulong)rng.Next(), - R4 = (float)rng.NextDouble(), - R8 = (double)rng.NextDouble(), - Tx = GetRandomRomChar(rng), - Ts = 
TimeSpan.FromSeconds(rng.NextDouble() * (1+rng.Next())), - Dt = DateTime.FromOADate(rng.Next(657435, 2958465)), - Dz = DateTimeOffset.FromUnixTimeSeconds((long)(rng.NextDouble() * (1 + rng.Next()))), - Ug = new RowId((ulong)rng.Next(), (ulong)rng.Next()) - }; - } - } - - private ReadOnlyMemory GetRandomRomChar(Random rng, int length = 10) - { - var chars = new char[length]; - for (int i = 0; i < length; i++) - chars[i] = (char)(32 + rng.Next(0, 94)); // From space to ~ - return new ReadOnlyMemory(chars); - } - - private sealed class ToyDataset - { - [LoadColumn(0)] - public bool Label { get; set; } - - [LoadColumn(1, 5), VectorType(5)] - public float[] Features { get; set; } - - [LoadColumn(6)] - public sbyte I1 { get; set; } - - [LoadColumn(7)] - public byte U1 { get; set; } - - [LoadColumn(8)] - public short I2 { get; set; } - - [LoadColumn(9)] - public ushort U2 { get; set; } - - [LoadColumn(10)] - public int I4 { get; set; } - - [LoadColumn(11)] - public uint U4 { get; set; } - - [LoadColumn(12)] - public long I8 { get; set; } - - [LoadColumn(13)] - public ulong U8 { get; set; } - - [LoadColumn(14)] - public float R4 { get; set; } - - [LoadColumn(15)] - public double R8 { get; set; } - - [LoadColumn(16)] - public ReadOnlyMemory Tx { get; set; } - - [LoadColumn(17)] - public TimeSpan Ts { get; set; } - - [LoadColumn(18)] - public DateTime Dt { get; set; } - - [LoadColumn(19)] - public DateTimeOffset Dz { get; set; } - - [LoadColumn(20)] - public RowId Ug { get; set; } - - public static TextLoader GetTextLoader(MLContext mlContext, char separator) - { - return mlContext.Data.CreateTextLoader( - new[] { - new TextLoader.Column("Label", DataKind.Bool, 0), - new TextLoader.Column("Features", DataKind.R4, 1, 5), - new TextLoader.Column("I1", DataKind.I1, 6), - new TextLoader.Column("U1", DataKind.U1, 7), - new TextLoader.Column("I2", DataKind.I2, 8), - new TextLoader.Column("U2", DataKind.U2, 9), - new TextLoader.Column("I4", DataKind.I4, 10), - new 
TextLoader.Column("U4", DataKind.U4, 11), - new TextLoader.Column("I8", DataKind.I8, 12), - new TextLoader.Column("U8", DataKind.U8, 13), - new TextLoader.Column("R4", DataKind.R4, 14), - new TextLoader.Column("R8", DataKind.R8, 15), - new TextLoader.Column("Tx", DataKind.TX, 16), - new TextLoader.Column("Ts", DataKind.TS, 17), - new TextLoader.Column("Dt", DataKind.DT, 18), - new TextLoader.Column("Dz", DataKind.DZ, 19), - new TextLoader.Column("Ug", DataKind.UG, 20), - }, - hasHeader: true, - separatorChar: separator); - } - } - - private static void AssertEqual(ToyDataset toyDataset1, ToyDataset toyDataset2) - { - Assert.Equal(toyDataset1.Label, toyDataset2.Label); - Common.AssertEqual(toyDataset1.Features, toyDataset2.Features); - Assert.Equal(toyDataset1.I1, toyDataset2.I1); - Assert.Equal(toyDataset1.U1, toyDataset2.U1); - Assert.Equal(toyDataset1.I2, toyDataset2.I2); - Assert.Equal(toyDataset1.U2, toyDataset2.U2); - Assert.Equal(toyDataset1.I4, toyDataset2.I4); - Assert.Equal(toyDataset1.U4, toyDataset2.U4); - Assert.Equal(toyDataset1.I8, toyDataset2.I8); - Assert.Equal(toyDataset1.U8, toyDataset2.U8); - Assert.Equal(toyDataset1.R4, toyDataset2.R4); - Assert.Equal(toyDataset1.R8, toyDataset2.R8); - Assert.Equal(toyDataset1.Tx.ToString(), toyDataset2.Tx.ToString()); - Assert.True(toyDataset1.Ts.Equals(toyDataset2.Ts)); - Assert.True(toyDataset1.Dt.Equals(toyDataset2.Dt)); - Assert.True(toyDataset1.Dz.Equals(toyDataset2.Dz)); - Assert.True(toyDataset1.Ug.Equals(toyDataset2.Ug)); - } - #endregion } } diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/AllTypes.cs b/test/Microsoft.ML.Functional.Tests/Datasets/AllTypes.cs new file mode 100644 index 0000000000..2eed366a0e --- /dev/null +++ b/test/Microsoft.ML.Functional.Tests/Datasets/AllTypes.cs @@ -0,0 +1,148 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.Data.DataView; +using Microsoft.ML; +using Microsoft.ML.Data; +using Xunit; + +namespace Microsoft.ML.Functional.Tests.Datasets +{ + internal sealed class AllTypes + { + [LoadColumn(0)] + public bool Label { get; set; } + + [LoadColumn(1, 5), VectorType(5)] + public float[] Features { get; set; } + + [LoadColumn(6)] + public sbyte I1 { get; set; } + + [LoadColumn(7)] + public byte U1 { get; set; } + + [LoadColumn(8)] + public short I2 { get; set; } + + [LoadColumn(9)] + public ushort U2 { get; set; } + + [LoadColumn(10)] + public int I4 { get; set; } + + [LoadColumn(11)] + public uint U4 { get; set; } + + [LoadColumn(12)] + public long I8 { get; set; } + + [LoadColumn(13)] + public ulong U8 { get; set; } + + [LoadColumn(14)] + public float R4 { get; set; } + + [LoadColumn(15)] + public double R8 { get; set; } + + [LoadColumn(16)] + public ReadOnlyMemory Tx { get; set; } + + [LoadColumn(17)] + public TimeSpan Ts { get; set; } + + [LoadColumn(18)] + public DateTime Dt { get; set; } + + [LoadColumn(19)] + public DateTimeOffset Dz { get; set; } + + [LoadColumn(20)] + public RowId Ug { get; set; } + + /// + /// Get the text loader for the AllTypes dataset. + /// + /// The ML Context. + /// The Separator to read with. 
+ /// + public static TextLoader GetTextLoader(MLContext mlContext, char separator) + { + return mlContext.Data.CreateTextLoader( + new[] { + new TextLoader.Column("Label", DataKind.Bool, 0), + new TextLoader.Column("Features", DataKind.R4, 1, 5), + new TextLoader.Column("I1", DataKind.I1, 6), + new TextLoader.Column("U1", DataKind.U1, 7), + new TextLoader.Column("I2", DataKind.I2, 8), + new TextLoader.Column("U2", DataKind.U2, 9), + new TextLoader.Column("I4", DataKind.I4, 10), + new TextLoader.Column("U4", DataKind.U4, 11), + new TextLoader.Column("I8", DataKind.I8, 12), + new TextLoader.Column("U8", DataKind.U8, 13), + new TextLoader.Column("R4", DataKind.R4, 14), + new TextLoader.Column("R8", DataKind.R8, 15), + new TextLoader.Column("Tx", DataKind.TX, 16), + new TextLoader.Column("Ts", DataKind.TS, 17), + new TextLoader.Column("Dt", DataKind.DT, 18), + new TextLoader.Column("Dz", DataKind.DZ, 19), + new TextLoader.Column("Ug", DataKind.UG, 20), + }, + hasHeader: true, + separatorChar: separator); + } + + /// + /// Generate an IEnumerable of AllTypes. + /// + /// The number of AllTypesDataset objects to make. + /// The random seed. + /// An IEnumerable of AllTypes. 
+ public static IEnumerable GenerateDataset(int numExamples = 5, int seed = 1) + { + var rng = new Random(seed); + for (int i = 0; i < numExamples; i++) + { + yield return new AllTypes + { + Label = rng.NextDouble() > 0.5, + Features = new float[] { + (float)rng.NextDouble(), + (float)rng.NextDouble(), + (float)rng.NextDouble(), + (float)rng.NextDouble(), + (float)rng.NextDouble() + }, + I1 = (sbyte)rng.Next(), + U1 = (byte)rng.Next(), + I2 = (short)rng.Next(), + U2 = (ushort)rng.Next(), + I4 = rng.Next(), + U4 = (uint)rng.Next(), + I8 = (long)rng.Next(), + U8 = (ulong)rng.Next(), + R4 = (float)rng.NextDouble(), + R8 = (double)rng.NextDouble(), + Tx = GetRandomCharSpan(rng), + Ts = TimeSpan.FromSeconds(rng.NextDouble() * (1 + rng.Next())), + Dt = DateTime.FromOADate(rng.Next(657435, 2958465)), + Dz = DateTimeOffset.FromUnixTimeSeconds((long)(rng.NextDouble() * (1 + rng.Next()))), + Ug = new RowId((ulong)rng.Next(), (ulong)rng.Next()) + }; + } + } + + private static ReadOnlyMemory GetRandomCharSpan(Random rng, int length = 10) + { + var chars = new char[length]; + for (int i = 0; i < length; i++) + chars[i] = (char)(32 + rng.Next(0, 94)); // From space to ~. + return new ReadOnlyMemory(chars); + } + } +} From 8e070976fda373016d92a0a68e793d8b2998b912 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Wed, 13 Feb 2019 12:15:18 -0800 Subject: [PATCH 5/6] Addressing PR comments. 
--- test/Microsoft.ML.Functional.Tests/Common.cs | 78 +++++++++---------- test/Microsoft.ML.Functional.Tests/DataIO.cs | 28 +++---- .../Datasets/{AllTypes.cs => TypeTestData.cs} | 74 +++++++++++------- 3 files changed, 100 insertions(+), 80 deletions(-) rename test/Microsoft.ML.Functional.Tests/Datasets/{AllTypes.cs => TypeTestData.cs} (65%) diff --git a/test/Microsoft.ML.Functional.Tests/Common.cs b/test/Microsoft.ML.Functional.Tests/Common.cs index c6515b3dca..c60aed651a 100644 --- a/test/Microsoft.ML.Functional.Tests/Common.cs +++ b/test/Microsoft.ML.Functional.Tests/Common.cs @@ -15,15 +15,15 @@ namespace Microsoft.ML.Functional.Tests internal static class Common { /// - /// Asssert that an rows are of . /// - /// An . - public static void AssertAllTypesDataset(IDataView allTypesDataset) + /// Assert that the rows of an <see cref="IDataView"/> are of type <see cref="TypeTestData"/>. /// + /// <param name="testTypeDataset">An <see cref="IDataView"/>.</param> + public static void AssertTypeTestDataset(IDataView testTypeDataset) { - var toyClassProperties = typeof(AllTypes).GetProperties(); + var toyClassProperties = typeof(TypeTestData).GetProperties(); // Check that the schema is of the right size. - Assert.Equal(toyClassProperties.Length, allTypesDataset.Schema.Count); + Assert.Equal(toyClassProperties.Length, testTypeDataset.Schema.Count); // Create a lookup table for the types and counts of all properties. var types = new Dictionary(); @@ -44,7 +44,7 @@ public static void AssertAllTypesDataset(IDataView allTypesDataset) counts[property.Name] = 0; } - foreach (var column in allTypesDataset.Schema) + foreach (var column in testTypeDataset.Schema) { Assert.True(types.ContainsKey(column.Name)); Assert.Equal(1, ++counts[column.Name]); @@ -57,23 +57,23 @@ public static void AssertAllTypesDataset(IDataView allTypesDataset) } /// - /// Assert than two datasets are equal. + /// Assert that two <see cref="TypeTestData"/> datasets are equal. /// /// The ML Context.
- /// A of - /// A of - public static void AssertAllTypesDatasetsAreEqual(MLContext mlContext, IDataView data1, IDataView data2) + /// <param name="data1">A <see cref="IDataView"/> of <see cref="TypeTestData"/></param> + /// <param name="data2">A <see cref="IDataView"/> of <see cref="TypeTestData"/></param> + public static void AssertTestTypeDatasetsAreEqual(MLContext mlContext, IDataView data1, IDataView data2) { // Confirm that they are both of the propery row type. - AssertAllTypesDataset(data1); - AssertAllTypesDataset(data2); + AssertTypeTestDataset(data1); + AssertTypeTestDataset(data2); // Validate that the two Schemas are the same. Common.AssertEqual(data1.Schema, data2.Schema); // Define how to serialize the IDataView to objects. - var enumerable1 = mlContext.CreateEnumerable(data1, true); - var enumerable2 = mlContext.CreateEnumerable(data2, true); + var enumerable1 = mlContext.CreateEnumerable(data1, true); + var enumerable2 = mlContext.CreateEnumerable(data2, true); AssertEqual(enumerable1, enumerable2); } @@ -118,11 +118,11 @@ public static void AssertEqual(Schema schema1, Schema schema2) } /// - /// Assert than two enumerables are equal. + /// Assert that two <see cref="TypeTestData"/> enumerables are equal. /// - /// An enumerable of - /// An enumerable of - public static void AssertEqual(IEnumerable data1, IEnumerable data2) + /// <param name="data1">An enumerable of <see cref="TypeTestData"/></param> + /// <param name="data2">An enumerable of <see cref="TypeTestData"/></param> + public static void AssertEqual(IEnumerable data1, IEnumerable data2) { Assert.NotNull(data1); Assert.NotNull(data2); @@ -135,29 +135,29 @@ public static void AssertEqual(IEnumerable data1, IEnumerable - /// Assert that two AllTypes datasets are equal. + /// Assert that two TypeTest datasets are equal. /// - /// An . - /// An . - public static void AssertEqual(AllTypes allTypes1, AllTypes allTypes2) + /// <param name="testType1">An <see cref="TypeTestData"/>.</param> + /// <param name="testType2">An <see cref="TypeTestData"/>.</param>
+ public static void AssertEqual(TypeTestData testType1, TypeTestData testType2) { - Assert.Equal(allTypes1.Label, allTypes2.Label); - Common.AssertEqual(allTypes1.Features, allTypes2.Features); - Assert.Equal(allTypes1.I1, allTypes2.I1); - Assert.Equal(allTypes1.U1, allTypes2.U1); - Assert.Equal(allTypes1.I2, allTypes2.I2); - Assert.Equal(allTypes1.U2, allTypes2.U2); - Assert.Equal(allTypes1.I4, allTypes2.I4); - Assert.Equal(allTypes1.U4, allTypes2.U4); - Assert.Equal(allTypes1.I8, allTypes2.I8); - Assert.Equal(allTypes1.U8, allTypes2.U8); - Assert.Equal(allTypes1.R4, allTypes2.R4); - Assert.Equal(allTypes1.R8, allTypes2.R8); - Assert.Equal(allTypes1.Tx.ToString(), allTypes2.Tx.ToString()); - Assert.True(allTypes1.Ts.Equals(allTypes2.Ts)); - Assert.True(allTypes1.Dt.Equals(allTypes2.Dt)); - Assert.True(allTypes1.Dz.Equals(allTypes2.Dz)); - Assert.True(allTypes1.Ug.Equals(allTypes2.Ug)); + Assert.Equal(testType1.Label, testType2.Label); + Common.AssertEqual(testType1.Features, testType2.Features); + Assert.Equal(testType1.I1, testType2.I1); + Assert.Equal(testType1.U1, testType2.U1); + Assert.Equal(testType1.I2, testType2.I2); + Assert.Equal(testType1.U2, testType2.U2); + Assert.Equal(testType1.I4, testType2.I4); + Assert.Equal(testType1.U4, testType2.U4); + Assert.Equal(testType1.I8, testType2.I8); + Assert.Equal(testType1.U8, testType2.U8); + Assert.Equal(testType1.R4, testType2.R4); + Assert.Equal(testType1.R8, testType2.R8); + Assert.Equal(testType1.Tx.ToString(), testType2.Tx.ToString()); + Assert.True(testType1.Ts.Equals(testType2.Ts)); + Assert.True(testType1.Dt.Equals(testType2.Dt)); + Assert.True(testType1.Dz.Equals(testType2.Dz)); + Assert.True(testType1.Ug.Equals(testType2.Ug)); } /// diff --git a/test/Microsoft.ML.Functional.Tests/DataIO.cs b/test/Microsoft.ML.Functional.Tests/DataIO.cs index 8b50d50102..5ea12353b8 100644 --- a/test/Microsoft.ML.Functional.Tests/DataIO.cs +++ b/test/Microsoft.ML.Functional.Tests/DataIO.cs @@ -35,9 +35,9 @@ public void 
ReadFromIEnumerable() var mlContext = new MLContext(seed: 1, conc: 1); // Read the dataset from an enumerable. - var data = mlContext.Data.ReadFromEnumerable(AllTypes.GenerateDataset()); + var data = mlContext.Data.ReadFromEnumerable(TypeTestData.GenerateDataset()); - Common.AssertAllTypesDataset(data); + Common.AssertTypeTestDataset(data); } /// @@ -49,11 +49,11 @@ public void ExportToIEnumerable() var mlContext = new MLContext(seed: 1, conc: 1); // Read the dataset from an enumerable. - var enumerableBefore = AllTypes.GenerateDataset(); + var enumerableBefore = TypeTestData.GenerateDataset(); var data = mlContext.Data.ReadFromEnumerable(enumerableBefore); // Export back to an enumerable. - var enumerableAfter = mlContext.CreateEnumerable(data, true); + var enumerableAfter = mlContext.CreateEnumerable(data, true); Common.AssertEqual(enumerableBefore, enumerableAfter); } @@ -62,21 +62,21 @@ public void ExportToIEnumerable() /// Write to and read from a delimited file: Any DataKind can be written to and read from a delimited file. /// /// - /// Tests the roundtrip hrough a file using explicit schematization. + /// Tests the roundtrip through a file using explicit schematization. /// [Fact] public void WriteToAndReadFromADelimetedFile() { var mlContext = new MLContext(seed: 1, conc: 1); - var dataBefore = mlContext.Data.ReadFromEnumerable(AllTypes.GenerateDataset()); + var dataBefore = mlContext.Data.ReadFromEnumerable(TypeTestData.GenerateDataset()); foreach (var separator in _separators) { // Serialize a dataset with a known schema to a file. 
var filePath = SerializeDatasetToFile(mlContext, dataBefore, separator); - var dataAfter = AllTypes.GetTextLoader(mlContext, separator).Read(filePath); - Common.AssertAllTypesDatasetsAreEqual(mlContext, dataBefore, dataAfter); + var dataAfter = TypeTestData.GetTextLoader(mlContext, separator).Read(filePath); + Common.AssertTestTypeDatasetsAreEqual(mlContext, dataBefore, dataAfter); } } @@ -84,21 +84,21 @@ public void WriteToAndReadFromADelimetedFile() /// Write to and read from a delimited file: Schematized data of any DataKind can be read from a delimited file. /// /// - /// Tests the roundtrip hrough a file using schema inference. + /// Tests the roundtrip through a file using schema inference. /// [Fact] public void WriteToAndReadASchemaFromADelimitedFile() { var mlContext = new MLContext(seed: 1, conc: 1); - var dataBefore = mlContext.Data.ReadFromEnumerable(AllTypes.GenerateDataset()); + var dataBefore = mlContext.Data.ReadFromEnumerable(TypeTestData.GenerateDataset()); foreach (var separator in _separators) { // Serialize a dataset with a known schema to a file. var filePath = SerializeDatasetToFile(mlContext, dataBefore, separator); - var dataAfter = mlContext.Data.ReadFromTextFile(filePath, hasHeader: true, separatorChar: separator); - Common.AssertAllTypesDatasetsAreEqual(mlContext, dataBefore, dataAfter); + var dataAfter = mlContext.Data.ReadFromTextFile(filePath, hasHeader: true, separatorChar: separator); + Common.AssertTestTypeDatasetsAreEqual(mlContext, dataBefore, dataAfter); } } @@ -110,12 +110,12 @@ public void WriteAndReadAFromABinaryFile() { var mlContext = new MLContext(seed: 1, conc: 1); - var dataBefore = mlContext.Data.ReadFromEnumerable(AllTypes.GenerateDataset()); + var dataBefore = mlContext.Data.ReadFromEnumerable(TypeTestData.GenerateDataset()); // Serialize a dataset with a known schema to a file. 
var filePath = SerializeDatasetToBinaryFile(mlContext, dataBefore); var dataAfter = mlContext.Data.ReadFromBinary(filePath); - Common.AssertAllTypesDatasetsAreEqual(mlContext, dataBefore, dataAfter); + Common.AssertTestTypeDatasetsAreEqual(mlContext, dataBefore, dataAfter); } private string SerializeDatasetToFile(MLContext mlContext, IDataView data, char separator) diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/AllTypes.cs b/test/Microsoft.ML.Functional.Tests/Datasets/TypeTestData.cs similarity index 65% rename from test/Microsoft.ML.Functional.Tests/Datasets/AllTypes.cs rename to test/Microsoft.ML.Functional.Tests/Datasets/TypeTestData.cs index 2eed366a0e..0c52d4cf0b 100644 --- a/test/Microsoft.ML.Functional.Tests/Datasets/AllTypes.cs +++ b/test/Microsoft.ML.Functional.Tests/Datasets/TypeTestData.cs @@ -12,7 +12,14 @@ namespace Microsoft.ML.Functional.Tests.Datasets { - internal sealed class AllTypes + /// <summary> + /// A class containing one property per <see cref="DataKind"/>. + /// </summary> + /// <remarks> + /// This class has annotations for automatic deserialization from a file, and contains helper methods + /// for reading from a file and for generating a random dataset as an IEnumerable. + /// </remarks> + internal sealed class TypeTestData { [LoadColumn(0)] public bool Label { get; set; } @@ -66,7 +73,7 @@ internal sealed class AllTypes public RowId Ug { get; set; } /// - /// Get the text loader for the AllTypes dataset. + /// Get the text loader for the <see cref="TypeTestData"/> dataset. /// /// The ML Context. /// The Separator to read with. @@ -98,43 +105,56 @@ public static TextLoader GetTextLoader(MLContext mlContext, char separator) } /// - /// Generate an IEnumerable of AllTypes. + /// Generate an IEnumerable of <see cref="TypeTestData"/>. /// - /// The number of AllTypesDataset objects to make. + /// <param name="numExamples">The number of <see cref="TypeTestData"/> objects to make.</param> /// The random seed. - /// An IEnumerable of AllTypes. - public static IEnumerable GenerateDataset(int numExamples = 5, int seed = 1) + /// <returns>An IEnumerable of <see cref="TypeTestData"/>.</returns>
+ public static IEnumerable GenerateDataset(int numExamples = 5, int seed = 1) { var rng = new Random(seed); for (int i = 0; i < numExamples; i++) { - yield return new AllTypes - { - Label = rng.NextDouble() > 0.5, - Features = new float[] { + yield return GetRandomInstance(rng); + } + } + + /// <summary> + /// Get a random instance of <see cref="TypeTestData"/>. + /// </summary> + /// <param name="rng">A <see cref="Random"/> object.</param> + /// <returns>A randomly populated <see cref="TypeTestData"/> instance.</returns> + public static TypeTestData GetRandomInstance(Random rng) + { + if (rng == null) + throw new ArgumentNullException("rng"); + + return new TypeTestData + { + Label = rng.NextDouble() > 0.5, + Features = new float[] { (float)rng.NextDouble(), (float)rng.NextDouble(), (float)rng.NextDouble(), (float)rng.NextDouble(), (float)rng.NextDouble() }, - I1 = (sbyte)rng.Next(), - U1 = (byte)rng.Next(), - I2 = (short)rng.Next(), - U2 = (ushort)rng.Next(), - I4 = rng.Next(), - U4 = (uint)rng.Next(), - I8 = (long)rng.Next(), - U8 = (ulong)rng.Next(), - R4 = (float)rng.NextDouble(), - R8 = (double)rng.NextDouble(), - Tx = GetRandomCharSpan(rng), - Ts = TimeSpan.FromSeconds(rng.NextDouble() * (1 + rng.Next())), - Dt = DateTime.FromOADate(rng.Next(657435, 2958465)), - Dz = DateTimeOffset.FromUnixTimeSeconds((long)(rng.NextDouble() * (1 + rng.Next()))), - Ug = new RowId((ulong)rng.Next(), (ulong)rng.Next()) - }; - } + I1 = (sbyte)rng.Next(), + U1 = (byte)rng.Next(), + I2 = (short)rng.Next(), + U2 = (ushort)rng.Next(), + I4 = rng.Next(), + U4 = (uint)rng.Next(), + I8 = (long)rng.Next(), + U8 = (ulong)rng.Next(), + R4 = (float)rng.NextDouble(), + R8 = (double)rng.NextDouble(), + Tx = GetRandomCharSpan(rng), + Ts = TimeSpan.FromSeconds(rng.NextDouble() * (1 + rng.Next())), + Dt = DateTime.FromOADate(rng.Next(657435, 2958465)), + Dz = DateTimeOffset.FromUnixTimeSeconds((long)(rng.NextDouble() * (1 + rng.Next()))), + Ug = new RowId((ulong)rng.Next(), (ulong)rng.Next()) + }; } private static ReadOnlyMemory GetRandomCharSpan(Random rng, int length = 10) From ab2db38fcdb847d939f976766236bfb8d61cf450 Mon Sep 17 00:00:00 2001 From: Rogan
Carr Date: Wed, 13 Feb 2019 13:55:01 -0800 Subject: [PATCH 6/6] Updating TypeTest to have a variable floatarray --- .../Datasets/TypeTestData.cs | 92 ++++++++++--------- 1 file changed, 47 insertions(+), 45 deletions(-) diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/TypeTestData.cs b/test/Microsoft.ML.Functional.Tests/Datasets/TypeTestData.cs index 0c52d4cf0b..b46e2898a6 100644 --- a/test/Microsoft.ML.Functional.Tests/Datasets/TypeTestData.cs +++ b/test/Microsoft.ML.Functional.Tests/Datasets/TypeTestData.cs @@ -4,11 +4,8 @@ using System; using System.Collections.Generic; -using System.Linq; using Microsoft.Data.DataView; -using Microsoft.ML; using Microsoft.ML.Data; -using Xunit; namespace Microsoft.ML.Functional.Tests.Datasets { @@ -21,57 +18,60 @@ namespace Microsoft.ML.Functional.Tests.Datasets /// internal sealed class TypeTestData { + private const int _numFeatures = 10; + [LoadColumn(0)] public bool Label { get; set; } - [LoadColumn(1, 5), VectorType(5)] - public float[] Features { get; set; } - - [LoadColumn(6)] + [LoadColumn(1)] public sbyte I1 { get; set; } - [LoadColumn(7)] + [LoadColumn(2)] public byte U1 { get; set; } - [LoadColumn(8)] + [LoadColumn(3)] public short I2 { get; set; } - [LoadColumn(9)] + [LoadColumn(4)] public ushort U2 { get; set; } - [LoadColumn(10)] + [LoadColumn(5)] public int I4 { get; set; } - [LoadColumn(11)] + [LoadColumn(6)] public uint U4 { get; set; } - [LoadColumn(12)] + [LoadColumn(7)] public long I8 { get; set; } - [LoadColumn(13)] + [LoadColumn(8)] public ulong U8 { get; set; } - [LoadColumn(14)] + [LoadColumn(9)] public float R4 { get; set; } - [LoadColumn(15)] + [LoadColumn(10)] public double R8 { get; set; } - [LoadColumn(16)] + [LoadColumn(11)] public ReadOnlyMemory Tx { get; set; } - [LoadColumn(17)] + [LoadColumn(12)] public TimeSpan Ts { get; set; } - [LoadColumn(18)] + [LoadColumn(13)] public DateTime Dt { get; set; } - [LoadColumn(19)] + [LoadColumn(14)] public DateTimeOffset Dz { get; set; } - 
[LoadColumn(20)] + [LoadColumn(15)] public RowId Ug { get; set; } + [LoadColumn(16, 16 + _numFeatures - 1), VectorType(_numFeatures)] + public float[] Features { get; set; } + + /// /// Get the text loader for the dataset. /// @@ -83,22 +83,22 @@ public static TextLoader GetTextLoader(MLContext mlContext, char separator) return mlContext.Data.CreateTextLoader( new[] { new TextLoader.Column("Label", DataKind.Bool, 0), - new TextLoader.Column("Features", DataKind.R4, 1, 5), - new TextLoader.Column("I1", DataKind.I1, 6), - new TextLoader.Column("U1", DataKind.U1, 7), - new TextLoader.Column("I2", DataKind.I2, 8), - new TextLoader.Column("U2", DataKind.U2, 9), - new TextLoader.Column("I4", DataKind.I4, 10), - new TextLoader.Column("U4", DataKind.U4, 11), - new TextLoader.Column("I8", DataKind.I8, 12), - new TextLoader.Column("U8", DataKind.U8, 13), - new TextLoader.Column("R4", DataKind.R4, 14), - new TextLoader.Column("R8", DataKind.R8, 15), - new TextLoader.Column("Tx", DataKind.TX, 16), - new TextLoader.Column("Ts", DataKind.TS, 17), - new TextLoader.Column("Dt", DataKind.DT, 18), - new TextLoader.Column("Dz", DataKind.DZ, 19), - new TextLoader.Column("Ug", DataKind.UG, 20), + new TextLoader.Column("I1", DataKind.I1, 1), + new TextLoader.Column("U1", DataKind.U1, 2), + new TextLoader.Column("I2", DataKind.I2, 3), + new TextLoader.Column("U2", DataKind.U2, 4), + new TextLoader.Column("I4", DataKind.I4, 5), + new TextLoader.Column("U4", DataKind.U4, 6), + new TextLoader.Column("I8", DataKind.I8, 7), + new TextLoader.Column("U8", DataKind.U8, 8), + new TextLoader.Column("R4", DataKind.R4, 9), + new TextLoader.Column("R8", DataKind.R8, 10), + new TextLoader.Column("Tx", DataKind.TX, 11), + new TextLoader.Column("Ts", DataKind.TS, 12), + new TextLoader.Column("Dt", DataKind.DT, 13), + new TextLoader.Column("Dz", DataKind.DZ, 14), + new TextLoader.Column("Ug", DataKind.UG, 15), + new TextLoader.Column("Features", DataKind.R4, 16, 16 + _numFeatures-1), }, hasHeader: true, 
separatorChar: separator); @@ -132,13 +132,6 @@ public static TypeTestData GetRandomInstance(Random rng) return new TypeTestData { Label = rng.NextDouble() > 0.5, - Features = new float[] { - (float)rng.NextDouble(), - (float)rng.NextDouble(), - (float)rng.NextDouble(), - (float)rng.NextDouble(), - (float)rng.NextDouble() - }, I1 = (sbyte)rng.Next(), U1 = (byte)rng.Next(), I2 = (short)rng.Next(), @@ -153,7 +146,8 @@ public static TypeTestData GetRandomInstance(Random rng) Ts = TimeSpan.FromSeconds(rng.NextDouble() * (1 + rng.Next())), Dt = DateTime.FromOADate(rng.Next(657435, 2958465)), Dz = DateTimeOffset.FromUnixTimeSeconds((long)(rng.NextDouble() * (1 + rng.Next()))), - Ug = new RowId((ulong)rng.Next(), (ulong)rng.Next()) + Ug = new RowId((ulong)rng.Next(), (ulong)rng.Next()), + Features = GetRandomFloatArray(rng, _numFeatures), }; } @@ -164,5 +158,13 @@ private static ReadOnlyMemory GetRandomCharSpan(Random rng, int length = 1 chars[i] = (char)(32 + rng.Next(0, 94)); // From space to ~. return new ReadOnlyMemory(chars); } + + private static float[] GetRandomFloatArray(Random rng, int length) + { + var floatArray = new float[length]; + for (int i = 0; i < length; i++) + floatArray[i] = (float)rng.NextDouble(); + return floatArray; + } } }