diff --git a/test/Microsoft.ML.Functional.Tests/Common.cs b/test/Microsoft.ML.Functional.Tests/Common.cs
index 29088298d3..c60aed651a 100644
--- a/test/Microsoft.ML.Functional.Tests/Common.cs
+++ b/test/Microsoft.ML.Functional.Tests/Common.cs
@@ -2,19 +2,170 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.
 
+using System;
+using System.Collections.Generic;
+using System.Linq;
 using Microsoft.Data.DataView;
 using Microsoft.ML.Data;
-using Microsoft.ML.SamplesUtils;
-using Microsoft.ML.Trainers.HalLearners;
+using Microsoft.ML.Functional.Tests.Datasets;
 using Xunit;
 
 namespace Microsoft.ML.Functional.Tests
 {
     internal static class Common
     {
+        /// <summary>
+        /// Assert that the columns of an <see cref="IDataView"/> match the properties of <see cref="TypeTestData"/>.
+        /// </summary>
+        /// <param name="testTypeDataset">An <see cref="IDataView"/> whose schema is checked by column name and raw type.</param>
+        public static void AssertTypeTestDataset(IDataView testTypeDataset)
+        {
+            var toyClassProperties = typeof(TypeTestData).GetProperties();
+
+            // Check that the schema is of the right size.
+            Assert.Equal(toyClassProperties.Length, testTypeDataset.Schema.Count);
+
+            // Create a lookup table for the types and counts of all properties, keyed by property name.
+            var types = new Dictionary<string, Type>();
+            var counts = new Dictionary<string, int>();
+            foreach (var property in toyClassProperties)
+            {
+                if (!property.PropertyType.IsArray)
+                    types[property.Name] = property.PropertyType;
+                else
+                {
+                    // Array properties are expected to surface as VBuffer<element type> columns.
+                    var vBufferType = typeof(VBuffer<>);
+                    Type[] typeArgs = { property.PropertyType.GetElementType() };
+                    types[property.Name] = vBufferType.MakeGenericType(typeArgs);
+                }
+
+                counts[property.Name] = 0;
+            }
+
+            foreach (var column in testTypeDataset.Schema)
+            {
+                Assert.True(types.ContainsKey(column.Name));
+                Assert.Equal(1, ++counts[column.Name]);
+                Assert.Equal(types[column.Name], column.Type.RawType);
+            }
+
+            // Make sure we didn't miss any columns.
+            foreach (var value in counts.Values)
+                Assert.Equal(1, value);
+        }
+
+        /// <summary>
+        /// Assert that two datasets are equal.
+        /// </summary>
+        /// <param name="mlContext">The ML Context.</param>
+        /// <param name="data1">A <see cref="IDataView"/> of <see cref="TypeTestData"/></param>
+        /// <param name="data2">A <see cref="IDataView"/> of <see cref="TypeTestData"/></param>
+        public static void AssertTestTypeDatasetsAreEqual(MLContext mlContext, IDataView data1, IDataView data2)
+        {
+            // Confirm that they are both of the proper row type.
+            AssertTypeTestDataset(data1);
+            AssertTypeTestDataset(data2);
+
+            // Validate that the two Schemas are the same.
+            Common.AssertEqual(data1.Schema, data2.Schema);
+
+            // Materialize both IDataViews as TypeTestData rows so they can be compared field by field.
+            var enumerable1 = mlContext.CreateEnumerable<TypeTestData>(data1, true);
+            var enumerable2 = mlContext.CreateEnumerable<TypeTestData>(data2, true);
+
+            AssertEqual(enumerable1, enumerable2);
+        }
+
+        /// <summary>
+        /// Assert that two float arrays are equal.
+        /// </summary>
+        /// <param name="array1">An array of floats.</param>
+        /// <param name="array2">An array of floats.</param>
+        public static void AssertEqual(float[] array1, float[] array2)
+        {
+            Assert.NotNull(array1);
+            Assert.NotNull(array2);
+            Assert.Equal(array1.Length, array2.Length);
+
+            for (int i = 0; i < array1.Length; i++)
+                Assert.Equal(array1[i], array2[i]);
+        }
+
+        /// <summary>
+        /// Assert that two <see cref="Schema"/> objects are equal.
+        /// </summary>
+        /// <param name="schema1">A <see cref="Schema"/> object.</param>
+        /// <param name="schema2">A <see cref="Schema"/> object.</param>
+        public static void AssertEqual(Schema schema1, Schema schema2)
+        {
+            Assert.NotNull(schema1);
+            Assert.NotNull(schema2);
+
+            Assert.Equal(schema1.Count(), schema2.Count());
+
+            foreach (var schemaPair in schema1.Zip(schema2, Tuple.Create))
+            {
+                Assert.Equal(schemaPair.Item1.Name, schemaPair.Item2.Name);
+                Assert.Equal(schemaPair.Item1.Index, schemaPair.Item2.Index);
+                Assert.Equal(schemaPair.Item1.IsHidden, schemaPair.Item2.IsHidden);
+                // Compare the metadata schemas of both columns; only the schema (not the metadata values) is checked.
+                AssertEqual(schemaPair.Item1.Metadata.Schema, schemaPair.Item2.Metadata.Schema);
+                Assert.True((schemaPair.Item1.Type == schemaPair.Item2.Type) ||
+                    (schemaPair.Item1.Type.RawType == schemaPair.Item2.Type.RawType));
+            }
+        }
+
+        ///
+        /// Assert that two enumerables are equal.
+        ///
+        /// <param name="data1">An enumerable of <see cref="TypeTestData"/>.</param>
+        /// <param name="data2">An enumerable of <see cref="TypeTestData"/>.</param>
+        public static void AssertEqual(IEnumerable<TypeTestData> data1, IEnumerable<TypeTestData> data2)
+        {
+            Assert.NotNull(data1);
+            Assert.NotNull(data2);
+            Assert.Equal(data1.Count(), data2.Count());
+
+            foreach (var rowPair in data1.Zip(data2, Tuple.Create))
+            {
+                AssertEqual(rowPair.Item1, rowPair.Item2);
+            }
+        }
+
+        /// <summary>
+        /// Assert that two <see cref="TypeTestData"/> rows are equal, property by property.
+        /// </summary>
+        /// <param name="testType1">An <see cref="TypeTestData"/>.</param>
+        /// <param name="testType2">An <see cref="TypeTestData"/>.</param>
+        public static void AssertEqual(TypeTestData testType1, TypeTestData testType2)
+        {
+            Assert.Equal(testType1.Label, testType2.Label);
+            Common.AssertEqual(testType1.Features, testType2.Features);
+            Assert.Equal(testType1.I1, testType2.I1);
+            Assert.Equal(testType1.U1, testType2.U1);
+            Assert.Equal(testType1.I2, testType2.I2);
+            Assert.Equal(testType1.U2, testType2.U2);
+            Assert.Equal(testType1.I4, testType2.I4);
+            Assert.Equal(testType1.U4, testType2.U4);
+            Assert.Equal(testType1.I8, testType2.I8);
+            Assert.Equal(testType1.U8, testType2.U8);
+            Assert.Equal(testType1.R4, testType2.R4);
+            Assert.Equal(testType1.R8, testType2.R8);
+            Assert.Equal(testType1.Tx.ToString(), testType2.Tx.ToString());
+            Assert.Equal(testType1.Ts, testType2.Ts);
+            Assert.Equal(testType1.Dt, testType2.Dt);
+            Assert.Equal(testType1.Dz, testType2.Dz);
+            Assert.Equal(testType1.Ug, testType2.Ug);
+        }
+
+        /// <summary>
+        /// Check that a <see cref="RegressionMetrics"/> object is valid.
+        /// </summary>
+        /// <param name="metrics">The metrics object.</param>
         public static void CheckMetrics(RegressionMetrics metrics)
         {
-            // Perform sanity checks on the metrics
+            // Perform sanity checks on the metrics.
             Assert.True(metrics.Rms >= 0);
             Assert.True(metrics.L1 >= 0);
             Assert.True(metrics.L2 >= 0);
diff --git a/test/Microsoft.ML.Functional.Tests/DataIO.cs b/test/Microsoft.ML.Functional.Tests/DataIO.cs
new file mode 100644
index 0000000000..5ea12353b8
--- /dev/null
+++ b/test/Microsoft.ML.Functional.Tests/DataIO.cs
@@ -0,0 +1,139 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.IO;
+using Microsoft.Data.DataView;
+using Microsoft.ML.Data;
+using Microsoft.ML.Functional.Tests.Datasets;
+using Microsoft.ML.TestFramework;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Microsoft.ML.Functional.Tests
+{
+    /// <summary>
+    /// Test data input and output formats.
+    /// </summary>
+    public class DataIO : BaseTestClass
+    {
+        // Separators exercised by the delimited-file round-trip tests.
+        private readonly char[] _separators;
+
+        public DataIO(ITestOutputHelper output) : base(output)
+        {
+            // SaveAsText expects a "space, tab, comma, semicolon, or bar".
+            _separators = new char[] { ' ', '\t', ',', ';', '|', };
+        }
+
+        /// <summary>
+        /// Read from Enumerable: In-Memory objects can be read as enumerables into an IDataView.
+        /// </summary>
+        [Fact]
+        public void ReadFromIEnumerable()
+        {
+            var mlContext = new MLContext(seed: 1, conc: 1);
+
+            // Read the dataset from an enumerable.
+            var data = mlContext.Data.ReadFromEnumerable(TypeTestData.GenerateDataset());
+
+            Common.AssertTypeTestDataset(data);
+        }
+
+        /// <summary>
+        /// Export to Enumerable: IDataViews can be exported as enumerables of a class.
+        /// </summary>
+        [Fact]
+        public void ExportToIEnumerable()
+        {
+            var mlContext = new MLContext(seed: 1, conc: 1);
+
+            // Read the dataset from an enumerable.
+            var enumerableBefore = TypeTestData.GenerateDataset();
+            var data = mlContext.Data.ReadFromEnumerable(enumerableBefore);
+
+            // Export back to an enumerable of the same row class.
+            var enumerableAfter = mlContext.CreateEnumerable<TypeTestData>(data, true);
+
+            Common.AssertEqual(enumerableBefore, enumerableAfter);
+        }
+
+        /// <summary>
+        /// Write to and read from a delimited file: Any DataKind can be written to and read from a delimited file.
+        /// </summary>
+        ///
+        /// Tests the roundtrip through a file using explicit schematization.
+        ///
+        [Fact]
+        public void WriteToAndReadFromADelimetedFile()
+        {
+            var mlContext = new MLContext(seed: 1, conc: 1);
+
+            var dataBefore = mlContext.Data.ReadFromEnumerable(TypeTestData.GenerateDataset());
+
+            foreach (var separator in _separators)
+            {
+                // Serialize a dataset with a known schema to a file, then read it back with an explicit loader.
+                var filePath = SerializeDatasetToFile(mlContext, dataBefore, separator);
+                var dataAfter = TypeTestData.GetTextLoader(mlContext, separator).Read(filePath);
+                Common.AssertTestTypeDatasetsAreEqual(mlContext, dataBefore, dataAfter);
+            }
+        }
+
+        /// <summary>
+        /// Write to and read from a delimited file: Schematized data of any DataKind can be read from a delimited file.
+        /// </summary>
+        /// <remarks>
+        /// Tests the roundtrip through a file using schema inference.
+        /// </remarks>
+        [Fact]
+        public void WriteToAndReadASchemaFromADelimitedFile()
+        {
+            var mlContext = new MLContext(seed: 1, conc: 1);
+
+            var dataBefore = mlContext.Data.ReadFromEnumerable(TypeTestData.GenerateDataset());
+
+            foreach (var separator in _separators)
+            {
+                // Serialize a dataset with a known schema to a file, then let the loader take the columns from TypeTestData.
+                var filePath = SerializeDatasetToFile(mlContext, dataBefore, separator);
+                var dataAfter = mlContext.Data.ReadFromTextFile<TypeTestData>(filePath, hasHeader: true, separatorChar: separator);
+                Common.AssertTestTypeDatasetsAreEqual(mlContext, dataBefore, dataAfter);
+            }
+        }
+
+        /// <summary>
+        /// Write to and read from a binary file: Schematized data of any DataKind can be written to and read from a binary file.
+        /// </summary>
+        [Fact]
+        public void WriteAndReadAFromABinaryFile()
+        {
+            var mlContext = new MLContext(seed: 1, conc: 1);
+
+            var dataBefore = mlContext.Data.ReadFromEnumerable(TypeTestData.GenerateDataset());
+
+            // Serialize a dataset with a known schema to a file.
+            var filePath = SerializeDatasetToBinaryFile(mlContext, dataBefore);
+            var dataAfter = mlContext.Data.ReadFromBinary(filePath);
+            Common.AssertTestTypeDatasetsAreEqual(mlContext, dataBefore, dataAfter);
+        }
+
+        private string SerializeDatasetToFile(MLContext mlContext, IDataView data, char separator)
+        {
+            var filePath = GetOutputPath(Path.GetRandomFileName());
+            using (var file = File.Create(filePath))
+                mlContext.Data.SaveAsText(data, file, separatorChar: separator, headerRow: true);
+
+            return filePath;
+        }
+
+        private string SerializeDatasetToBinaryFile(MLContext mlContext, IDataView data)
+        {
+            var filePath = GetOutputPath(Path.GetRandomFileName());
+            using (var file = File.Create(filePath))
+                mlContext.Data.SaveAsBinary(data, file);
+
+            return filePath;
+        }
+    }
+}
diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/TypeTestData.cs b/test/Microsoft.ML.Functional.Tests/Datasets/TypeTestData.cs
new file mode 100644
index 0000000000..b46e2898a6
--- /dev/null
+++ b/test/Microsoft.ML.Functional.Tests/Datasets/TypeTestData.cs
@@ -0,0 +1,170 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using Microsoft.Data.DataView;
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.Functional.Tests.Datasets
+{
+    /// <summary>
+    /// A class containing one property per <see cref="DataKind"/>.
+    /// </summary>
+    ///
+    /// This class has annotations for automatic deserialization from a file, and contains helper methods
+    /// for reading from a file and for generating a random dataset as an IEnumerable.
+    ///
+    internal sealed class TypeTestData
+    {
+        private const int _numFeatures = 10;
+
+        [LoadColumn(0)]
+        public bool Label { get; set; }
+
+        [LoadColumn(1)]
+        public sbyte I1 { get; set; }
+
+        [LoadColumn(2)]
+        public byte U1 { get; set; }
+
+        [LoadColumn(3)]
+        public short I2 { get; set; }
+
+        [LoadColumn(4)]
+        public ushort U2 { get; set; }
+
+        [LoadColumn(5)]
+        public int I4 { get; set; }
+
+        [LoadColumn(6)]
+        public uint U4 { get; set; }
+
+        [LoadColumn(7)]
+        public long I8 { get; set; }
+
+        [LoadColumn(8)]
+        public ulong U8 { get; set; }
+
+        [LoadColumn(9)]
+        public float R4 { get; set; }
+
+        [LoadColumn(10)]
+        public double R8 { get; set; }
+
+        [LoadColumn(11)]
+        public ReadOnlyMemory<char> Tx { get; set; }
+
+        [LoadColumn(12)]
+        public TimeSpan Ts { get; set; }
+
+        [LoadColumn(13)]
+        public DateTime Dt { get; set; }
+
+        [LoadColumn(14)]
+        public DateTimeOffset Dz { get; set; }
+
+        [LoadColumn(15)]
+        public RowId Ug { get; set; }
+
+        [LoadColumn(16, 16 + _numFeatures - 1), VectorType(_numFeatures)]
+        public float[] Features { get; set; }
+
+
+        /// <summary>
+        /// Get the text loader for the dataset.
+        /// </summary>
+        /// <param name="mlContext">The ML Context.</param>
+        /// <param name="separator">The Separator to read with.</param>
+        /// <returns>A <see cref="TextLoader"/> with one column per property, expecting a header row and the given separator.</returns>
+        public static TextLoader GetTextLoader(MLContext mlContext, char separator)
+        {
+            return mlContext.Data.CreateTextLoader(
+                    new[] {
+                        new TextLoader.Column("Label", DataKind.Bool, 0),
+                        new TextLoader.Column("I1", DataKind.I1, 1),
+                        new TextLoader.Column("U1", DataKind.U1, 2),
+                        new TextLoader.Column("I2", DataKind.I2, 3),
+                        new TextLoader.Column("U2", DataKind.U2, 4),
+                        new TextLoader.Column("I4", DataKind.I4, 5),
+                        new TextLoader.Column("U4", DataKind.U4, 6),
+                        new TextLoader.Column("I8", DataKind.I8, 7),
+                        new TextLoader.Column("U8", DataKind.U8, 8),
+                        new TextLoader.Column("R4", DataKind.R4, 9),
+                        new TextLoader.Column("R8", DataKind.R8, 10),
+                        new TextLoader.Column("Tx", DataKind.TX, 11),
+                        new TextLoader.Column("Ts", DataKind.TS, 12),
+                        new TextLoader.Column("Dt", DataKind.DT, 13),
+                        new TextLoader.Column("Dz", DataKind.DZ, 14),
+                        new TextLoader.Column("Ug", DataKind.UG, 15),
+                        new TextLoader.Column("Features", DataKind.R4, 16, 16 + _numFeatures - 1),
+                    },
+                    hasHeader: true,
+                    separatorChar: separator);
+        }
+
+        /// <summary>
+        /// Generate an IEnumerable of <see cref="TypeTestData"/>.
+        /// </summary>
+        /// <param name="numExamples">The number of <see cref="TypeTestData"/> objects to make.</param>
+        /// <param name="seed">The random seed.</param>
+        /// <returns>A lazily evaluated IEnumerable of <see cref="TypeTestData"/>; each enumeration restarts from <paramref name="seed"/>.</returns>
+        public static IEnumerable<TypeTestData> GenerateDataset(int numExamples = 5, int seed = 1)
+        {
+            var rng = new Random(seed);
+            for (int i = 0; i < numExamples; i++)
+            {
+                yield return GetRandomInstance(rng);
+            }
+        }
+
+        /// <summary>
+        /// Get a random instance of <see cref="TypeTestData"/>.
+        /// </summary>
+        /// <param name="rng">A <see cref="Random"/> object.</param>
+        /// <returns>A new <see cref="TypeTestData"/> with every property filled from the random number generator.</returns>
+        public static TypeTestData GetRandomInstance(Random rng)
+        {
+            if (rng == null)
+                throw new ArgumentNullException(nameof(rng));
+
+            return new TypeTestData
+            {
+                Label = rng.NextDouble() > 0.5,
+                I1 = (sbyte)rng.Next(),
+                U1 = (byte)rng.Next(),
+                I2 = (short)rng.Next(),
+                U2 = (ushort)rng.Next(),
+                I4 = rng.Next(),
+                U4 = (uint)rng.Next(),
+                I8 = (long)rng.Next(),
+                U8 = (ulong)rng.Next(),
+                R4 = (float)rng.NextDouble(),
+                R8 = (double)rng.NextDouble(),
+                Tx = GetRandomCharSpan(rng),
+                Ts = TimeSpan.FromSeconds(rng.NextDouble() * (1 + rng.Next())),
+                Dt = DateTime.FromOADate(rng.Next(657435, 2958465)),
+                Dz = DateTimeOffset.FromUnixTimeSeconds((long)(rng.NextDouble() * (1 + rng.Next()))),
+                Ug = new RowId((ulong)rng.Next(), (ulong)rng.Next()),
+                Features = GetRandomFloatArray(rng, _numFeatures),
+            };
+        }
+
+        private static ReadOnlyMemory<char> GetRandomCharSpan(Random rng, int length = 10)
+        {
+            var chars = new char[length];
+            for (int i = 0; i < length; i++)
+                chars[i] = (char)(32 + rng.Next(0, 94)); // From space (32) to '}' (125); Next's upper bound is exclusive.
+            return new ReadOnlyMemory<char>(chars);
+        }
+
+        private static float[] GetRandomFloatArray(Random rng, int length)
+        {
+            var floatArray = new float[length];
+            for (int i = 0; i < length; i++)
+                floatArray[i] = (float)rng.NextDouble();
+            return floatArray;
+        }
+    }
+}
diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/FileBasedSavingOfData.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/FileBasedSavingOfData.cs
deleted file mode 100644
index 012b769c84..0000000000
--- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/FileBasedSavingOfData.cs
+++ /dev/null
@@ -1,49 +0,0 @@
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-// See the LICENSE file in the project root for more information.
- -using System.IO; -using Microsoft.ML.Data; -using Microsoft.ML.Data.IO; -using Microsoft.ML.RunTests; -using Microsoft.ML.Trainers; -using Xunit; - -namespace Microsoft.ML.Tests.Scenarios.Api -{ - public partial class ApiScenariosTests - { - /// - /// File-based saving of data: Come up with transform pipeline. Transform training and - /// test data, and save the featurized data to some file, using the .idv format. - /// Train and evaluate multiple models over that pre-featurized data. (Useful for - /// sweeping scenarios, where you are training many times on the same data, - /// and don't necessarily want to transform it every single time.) - /// - [Fact] - void FileBasedSavingOfData() - { - - var ml = new MLContext(seed: 1, conc: 1); - var src = new MultiFileSource(GetDataPath(TestDatasets.Sentiment.trainFilename)); - var trainData = ml.Data.CreateTextLoader(TestDatasets.Sentiment.GetLoaderColumns(), hasHeader: true) - .Append(ml.Transforms.Text.FeaturizeText("Features", "SentimentText")) - .Fit(src).Read(src); - - var path = DeleteOutputPath("i.idv"); - using (var file = File.Create(path)) - { - var saver = new BinarySaver(ml, new BinarySaver.Arguments()); - using (var ch = ((IHostEnvironment)ml).Start("SaveData")) - DataSaverUtils.SaveDataView(ch, saver, trainData, file); - } - - var trainer = ml.BinaryClassification.Trainers.StochasticDualCoordinateAscentNonCalibrated( - new SdcaNonCalibratedBinaryTrainer.Options { NumThreads = 1 }); - var loadedTrainData = new BinaryLoader(ml, new BinaryLoader.Arguments(), new MultiFileSource(path)); - - // Train. - var model = trainer.Fit(loadedTrainData); - } - } -}