diff --git a/test/Microsoft.ML.Functional.Tests/Common.cs b/test/Microsoft.ML.Functional.Tests/Common.cs
index 29088298d3..c60aed651a 100644
--- a/test/Microsoft.ML.Functional.Tests/Common.cs
+++ b/test/Microsoft.ML.Functional.Tests/Common.cs
@@ -2,19 +2,171 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
+using System;
+using System.Collections.Generic;
+using System.Linq;
using Microsoft.Data.DataView;
using Microsoft.ML.Data;
-using Microsoft.ML.SamplesUtils;
-using Microsoft.ML.Trainers.HalLearners;
+using Microsoft.ML.Functional.Tests.Datasets;
using Xunit;
namespace Microsoft.ML.Functional.Tests
{
internal static class Common
{
+        /// <summary>
+        /// Assert that the rows of an <see cref="IDataView"/> are of type <see cref="TypeTestData"/>.
+        /// </summary>
+        /// <param name="testTypeDataset">An <see cref="IDataView"/>.</param>
+        public static void AssertTypeTestDataset(IDataView testTypeDataset)
+        {
+            var toyClassProperties = typeof(TypeTestData).GetProperties();
+
+            // Check that the schema is of the right size.
+            Assert.Equal(toyClassProperties.Length, testTypeDataset.Schema.Count);
+
+            // Create a lookup table for the types and counts of all properties.
+            var types = new Dictionary<string, Type>();
+            var counts = new Dictionary<string, int>();
+            foreach (var property in toyClassProperties)
+            {
+                if (!property.PropertyType.IsArray)
+                    types[property.Name] = property.PropertyType;
+                else
+                {
+                    // Array properties are expected to map to columns whose raw type is the
+                    // closed VBuffer<T> for the array's element type. No element instance is
+                    // needed for this (creating one would throw for types such as string).
+                    var elementType = property.PropertyType.GetElementType();
+                    types[property.Name] = typeof(VBuffer<>).MakeGenericType(elementType);
+                }
+
+                counts[property.Name] = 0;
+            }
+
+            foreach (var column in testTypeDataset.Schema)
+            {
+                Assert.True(types.ContainsKey(column.Name));
+                Assert.Equal(1, ++counts[column.Name]);
+                Assert.Equal(types[column.Name], column.Type.RawType);
+            }
+
+            // Make sure we didn't miss any columns.
+            foreach (var value in counts.Values)
+                Assert.Equal(1, value);
+        }
+
+        /// <summary>
+        /// Assert that two <see cref="TypeTestData"/> datasets are equal.
+        /// </summary>
+        /// <param name="mlContext">The ML Context.</param>
+        /// <param name="data1">An <see cref="IDataView"/> of <see cref="TypeTestData"/> rows.</param>
+        /// <param name="data2">An <see cref="IDataView"/> of <see cref="TypeTestData"/> rows.</param>
+        public static void AssertTestTypeDatasetsAreEqual(MLContext mlContext, IDataView data1, IDataView data2)
+        {
+            // Confirm that they are both of the proper row type.
+            AssertTypeTestDataset(data1);
+            AssertTypeTestDataset(data2);
+
+            // Validate that the two Schemas are the same.
+            Common.AssertEqual(data1.Schema, data2.Schema);
+
+            // Define how to serialize the IDataView to objects.
+            var enumerable1 = mlContext.CreateEnumerable<TypeTestData>(data1, true);
+            var enumerable2 = mlContext.CreateEnumerable<TypeTestData>(data2, true);
+
+            AssertEqual(enumerable1, enumerable2);
+        }
+
+        /// <summary>
+        /// Verify, element by element, that two float arrays hold the same values.
+        /// </summary>
+        /// <param name="array1">The first array of floats.</param>
+        /// <param name="array2">The second array of floats.</param>
+        public static void AssertEqual(float[] array1, float[] array2)
+        {
+            Assert.NotNull(array1);
+            Assert.NotNull(array2);
+            Assert.Equal(array1.Length, array2.Length);
+
+            foreach (var index in Enumerable.Range(0, array1.Length))
+                Assert.Equal(array1[index], array2[index]);
+        }
+
+        /// <summary>
+        /// Assert that two <see cref="Schema"/> objects are equal.
+        /// </summary>
+        /// <param name="schema1">A <see cref="Schema"/> object.</param>
+        /// <param name="schema2">A <see cref="Schema"/> object.</param>
+        public static void AssertEqual(Schema schema1, Schema schema2)
+        {
+            Assert.NotNull(schema1);
+            Assert.NotNull(schema2);
+
+            Assert.Equal(schema1.Count, schema2.Count);
+
+            foreach (var schemaPair in schema1.Zip(schema2, Tuple.Create))
+            {
+                Assert.Equal(schemaPair.Item1.Name, schemaPair.Item2.Name);
+                Assert.Equal(schemaPair.Item1.Index, schemaPair.Item2.Index);
+                Assert.Equal(schemaPair.Item1.IsHidden, schemaPair.Item2.IsHidden);
+                // Compare the two columns' metadata schemas; metadata values are not compared.
+                AssertEqual(schemaPair.Item1.Metadata.Schema, schemaPair.Item2.Metadata.Schema);
+                Assert.True((schemaPair.Item1.Type == schemaPair.Item2.Type) ||
+                    (schemaPair.Item1.Type.RawType == schemaPair.Item2.Type.RawType));
+            }
+        }
+
+        /// <summary>
+        /// Assert that two enumerables of <see cref="TypeTestData"/> are equal.
+        /// </summary>
+        /// <param name="data1">An enumerable of <see cref="TypeTestData"/>.</param>
+        /// <param name="data2">An enumerable of <see cref="TypeTestData"/>.</param>
+        public static void AssertEqual(IEnumerable<TypeTestData> data1, IEnumerable<TypeTestData> data2)
+        {
+            Assert.NotNull(data1);
+            Assert.NotNull(data2);
+            Assert.Equal(data1.Count(), data2.Count());
+
+            // Compare pairs while streaming; the inputs are deliberately not buffered into lists.
+            // NOTE(review): callers pass CreateEnumerable(..., true) results - confirm before caching rows.
+            foreach (var rowPair in data1.Zip(data2, Tuple.Create))
+                AssertEqual(rowPair.Item1, rowPair.Item2);
+        }
+
+        /// <summary>
+        /// Assert that two <see cref="TypeTestData"/> rows hold the same value in every property.
+        /// </summary>
+        /// <param name="testType1">A <see cref="TypeTestData"/> object.</param>
+        /// <param name="testType2">A <see cref="TypeTestData"/> object.</param>
+        public static void AssertEqual(TypeTestData testType1, TypeTestData testType2)
+        {
+            Assert.Equal(testType1.Label, testType2.Label);
+            AssertEqual(testType1.Features, testType2.Features);
+            Assert.Equal(testType1.I1, testType2.I1);
+            Assert.Equal(testType1.U1, testType2.U1);
+            Assert.Equal(testType1.I2, testType2.I2);
+            Assert.Equal(testType1.U2, testType2.U2);
+            Assert.Equal(testType1.I4, testType2.I4);
+            Assert.Equal(testType1.U4, testType2.U4);
+            Assert.Equal(testType1.I8, testType2.I8);
+            Assert.Equal(testType1.U8, testType2.U8);
+            Assert.Equal(testType1.R4, testType2.R4);
+            Assert.Equal(testType1.R8, testType2.R8);
+            Assert.Equal(testType1.Tx.ToString(), testType2.Tx.ToString());
+            Assert.True(testType1.Ts.Equals(testType2.Ts));
+            Assert.True(testType1.Dt.Equals(testType2.Dt));
+            Assert.True(testType1.Dz.Equals(testType2.Dz));
+            Assert.True(testType1.Ug.Equals(testType2.Ug));
+        }
+
+        /// <summary>
+        /// Check that a <see cref="RegressionMetrics"/> object is valid.
+        /// </summary>
+        /// <param name="metrics">The metrics object.</param>
public static void CheckMetrics(RegressionMetrics metrics)
{
- // Perform sanity checks on the metrics
+ // Perform sanity checks on the metrics.
Assert.True(metrics.Rms >= 0);
Assert.True(metrics.L1 >= 0);
Assert.True(metrics.L2 >= 0);
diff --git a/test/Microsoft.ML.Functional.Tests/DataIO.cs b/test/Microsoft.ML.Functional.Tests/DataIO.cs
new file mode 100644
index 0000000000..5ea12353b8
--- /dev/null
+++ b/test/Microsoft.ML.Functional.Tests/DataIO.cs
@@ -0,0 +1,139 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.IO;
+using Microsoft.Data.DataView;
+using Microsoft.ML.Data;
+using Microsoft.ML.Functional.Tests.Datasets;
+using Microsoft.ML.TestFramework;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Microsoft.ML.Functional.Tests
+{
+    /// <summary>
+    /// Test data input and output formats.
+    /// </summary>
+    public class DataIO : BaseTestClass
+    {
+        // Separators to test. SaveAsText expects a "space, tab, comma, semicolon, or bar".
+        private readonly char[] _separators = { ' ', '\t', ',', ';', '|' };
+
+        public DataIO(ITestOutputHelper output) : base(output)
+        {
+        }
+
+        /// <summary>
+        /// Read from Enumerable: In-Memory objects can be read as enumerables into an IDataView.
+        /// </summary>
+        [Fact]
+        public void ReadFromIEnumerable()
+        {
+            var mlContext = new MLContext(seed: 1, conc: 1);
+
+            // Read the dataset from an enumerable.
+            var data = mlContext.Data.ReadFromEnumerable(TypeTestData.GenerateDataset());
+
+            Common.AssertTypeTestDataset(data);
+        }
+
+        /// <summary>
+        /// Export to Enumerable: IDataViews can be exported as enumerables of a class.
+        /// </summary>
+        [Fact]
+        public void ExportToIEnumerable()
+        {
+            var mlContext = new MLContext(seed: 1, conc: 1);
+
+            // Read the dataset from an enumerable.
+            var enumerableBefore = TypeTestData.GenerateDataset();
+            var data = mlContext.Data.ReadFromEnumerable(enumerableBefore);
+
+            // Export back to an enumerable.
+            var enumerableAfter = mlContext.CreateEnumerable<TypeTestData>(data, true);
+
+            Common.AssertEqual(enumerableBefore, enumerableAfter);
+        }
+
+        /// <summary>
+        /// Write to and read from a delimited file: Any DataKind can be written to and read from a delimited file.
+        /// </summary>
+        /// <remarks>
+        /// Tests the roundtrip through a file using explicit schematization.
+        /// </remarks>
+        [Fact]
+        public void WriteToAndReadFromADelimetedFile()
+        {
+            var mlContext = new MLContext(seed: 1, conc: 1);
+
+            var dataBefore = mlContext.Data.ReadFromEnumerable(TypeTestData.GenerateDataset());
+
+            foreach (var separator in _separators)
+            {
+                // Serialize a dataset with a known schema to a file.
+                var filePath = SerializeDatasetToFile(mlContext, dataBefore, separator);
+                var dataAfter = TypeTestData.GetTextLoader(mlContext, separator).Read(filePath);
+                Common.AssertTestTypeDatasetsAreEqual(mlContext, dataBefore, dataAfter);
+            }
+        }
+
+        /// <summary>
+        /// Write to and read from a delimited file: Schematized data of any DataKind can be read from a delimited file.
+        /// </summary>
+        /// <remarks>
+        /// Tests the roundtrip through a file using schema inference.
+        /// </remarks>
+        [Fact]
+        public void WriteToAndReadASchemaFromADelimitedFile()
+        {
+            var mlContext = new MLContext(seed: 1, conc: 1);
+
+            var dataBefore = mlContext.Data.ReadFromEnumerable(TypeTestData.GenerateDataset());
+
+            foreach (var separator in _separators)
+            {
+                // Serialize a dataset with a known schema to a file.
+                var filePath = SerializeDatasetToFile(mlContext, dataBefore, separator);
+                var dataAfter = mlContext.Data.ReadFromTextFile<TypeTestData>(filePath, hasHeader: true, separatorChar: separator);
+                Common.AssertTestTypeDatasetsAreEqual(mlContext, dataBefore, dataAfter);
+            }
+        }
+
+        /// <summary>
+        /// Write to and read from a binary file: Schematized data of any DataKind can be read from a binary file.
+        /// </summary>
+        [Fact]
+        public void WriteAndReadAFromABinaryFile()
+        {
+            var mlContext = new MLContext(seed: 1, conc: 1);
+
+            var dataBefore = mlContext.Data.ReadFromEnumerable(TypeTestData.GenerateDataset());
+
+            // Serialize a dataset with a known schema to a file.
+            var filePath = SerializeDatasetToBinaryFile(mlContext, dataBefore);
+            var dataAfter = mlContext.Data.ReadFromBinary(filePath);
+            Common.AssertTestTypeDatasetsAreEqual(mlContext, dataBefore, dataAfter);
+        }
+
+        /// <summary>Save the data as delimited text with a header row to a randomly named output file; return its path.</summary>
+        private string SerializeDatasetToFile(MLContext mlContext, IDataView data, char separator)
+        {
+            var filePath = GetOutputPath(Path.GetRandomFileName());
+            using (var file = File.Create(filePath))
+                mlContext.Data.SaveAsText(data, file, separatorChar: separator, headerRow: true);
+
+            return filePath;
+        }
+
+        /// <summary>Save the data in binary form to a randomly named output file; return its path.</summary>
+        private string SerializeDatasetToBinaryFile(MLContext mlContext, IDataView data)
+        {
+            var filePath = GetOutputPath(Path.GetRandomFileName());
+            using (var file = File.Create(filePath))
+                mlContext.Data.SaveAsBinary(data, file);
+
+            return filePath;
+        }
+    }
+}
diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/TypeTestData.cs b/test/Microsoft.ML.Functional.Tests/Datasets/TypeTestData.cs
new file mode 100644
index 0000000000..b46e2898a6
--- /dev/null
+++ b/test/Microsoft.ML.Functional.Tests/Datasets/TypeTestData.cs
@@ -0,0 +1,170 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using Microsoft.Data.DataView;
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.Functional.Tests.Datasets
+{
+    /// <summary>
+    /// A class containing one property per <see cref="DataKind"/>.
+    /// </summary>
+    /// <remarks>
+    /// This class has annotations for automatic deserialization from a file, and contains helper methods
+    /// for reading from a file and for generating a random dataset as an IEnumerable.
+    /// </remarks>
+    internal sealed class TypeTestData
+    {
+        private const int _numFeatures = 10;
+
+        [LoadColumn(0)]
+        public bool Label { get; set; }
+
+        [LoadColumn(1)]
+        public sbyte I1 { get; set; }
+
+        [LoadColumn(2)]
+        public byte U1 { get; set; }
+
+        [LoadColumn(3)]
+        public short I2 { get; set; }
+
+        [LoadColumn(4)]
+        public ushort U2 { get; set; }
+
+        [LoadColumn(5)]
+        public int I4 { get; set; }
+
+        [LoadColumn(6)]
+        public uint U4 { get; set; }
+
+        [LoadColumn(7)]
+        public long I8 { get; set; }
+
+        [LoadColumn(8)]
+        public ulong U8 { get; set; }
+
+        [LoadColumn(9)]
+        public float R4 { get; set; }
+
+        [LoadColumn(10)]
+        public double R8 { get; set; }
+
+        [LoadColumn(11)]
+        public ReadOnlyMemory<char> Tx { get; set; }
+
+        [LoadColumn(12)]
+        public TimeSpan Ts { get; set; }
+
+        [LoadColumn(13)]
+        public DateTime Dt { get; set; }
+
+        [LoadColumn(14)]
+        public DateTimeOffset Dz { get; set; }
+
+        [LoadColumn(15)]
+        public RowId Ug { get; set; }
+
+        [LoadColumn(16, 16 + _numFeatures - 1), VectorType(_numFeatures)]
+        public float[] Features { get; set; }
+
+        /// <summary>
+        /// Get the text loader for the dataset.
+        /// </summary>
+        /// <param name="mlContext">The ML Context.</param>
+        /// <param name="separator">The Separator to read with.</param>
+        /// <returns>A <see cref="TextLoader"/> with one column per property, a header row, and the given separator.</returns>
+        public static TextLoader GetTextLoader(MLContext mlContext, char separator)
+        {
+            return mlContext.Data.CreateTextLoader(
+                new[] {
+                    new TextLoader.Column("Label", DataKind.Bool, 0),
+                    new TextLoader.Column("I1", DataKind.I1, 1),
+                    new TextLoader.Column("U1", DataKind.U1, 2),
+                    new TextLoader.Column("I2", DataKind.I2, 3),
+                    new TextLoader.Column("U2", DataKind.U2, 4),
+                    new TextLoader.Column("I4", DataKind.I4, 5),
+                    new TextLoader.Column("U4", DataKind.U4, 6),
+                    new TextLoader.Column("I8", DataKind.I8, 7),
+                    new TextLoader.Column("U8", DataKind.U8, 8),
+                    new TextLoader.Column("R4", DataKind.R4, 9),
+                    new TextLoader.Column("R8", DataKind.R8, 10),
+                    new TextLoader.Column("Tx", DataKind.TX, 11),
+                    new TextLoader.Column("Ts", DataKind.TS, 12),
+                    new TextLoader.Column("Dt", DataKind.DT, 13),
+                    new TextLoader.Column("Dz", DataKind.DZ, 14),
+                    new TextLoader.Column("Ug", DataKind.UG, 15),
+                    new TextLoader.Column("Features", DataKind.R4, 16, 16 + _numFeatures - 1),
+                },
+                hasHeader: true,
+                separatorChar: separator);
+        }
+
+        /// <summary>
+        /// Generate an IEnumerable of <see cref="TypeTestData"/>.
+        /// </summary>
+        /// <param name="numExamples">The number of <see cref="TypeTestData"/> objects to make.</param>
+        /// <param name="seed">The random seed.</param>
+        /// <returns>An IEnumerable of <see cref="TypeTestData"/>.</returns>
+        public static IEnumerable<TypeTestData> GenerateDataset(int numExamples = 5, int seed = 1)
+        {
+            var rng = new Random(seed);
+            for (int i = 0; i < numExamples; i++)
+                yield return GetRandomInstance(rng);
+        }
+
+        /// <summary>
+        /// Get a random instance of <see cref="TypeTestData"/>.
+        /// </summary>
+        /// <param name="rng">A <see cref="Random"/> object.</param>
+        /// <returns>A <see cref="TypeTestData"/> whose properties are filled from <paramref name="rng"/>.</returns>
+        public static TypeTestData GetRandomInstance(Random rng)
+        {
+            if (rng == null)
+                throw new ArgumentNullException(nameof(rng));
+
+            // The narrowing casts are marked unchecked so they wrap even if the project compiles with overflow checks.
+            return new TypeTestData
+            {
+                Label = rng.NextDouble() > 0.5,
+                I1 = unchecked((sbyte)rng.Next()),
+                U1 = unchecked((byte)rng.Next()),
+                I2 = unchecked((short)rng.Next()),
+                U2 = unchecked((ushort)rng.Next()),
+                I4 = rng.Next(),
+                U4 = (uint)rng.Next(),
+                I8 = rng.Next(),
+                U8 = (ulong)rng.Next(),
+                R4 = (float)rng.NextDouble(),
+                R8 = rng.NextDouble(),
+                Tx = GetRandomCharSpan(rng),
+                Ts = TimeSpan.FromSeconds(rng.NextDouble() * (1 + rng.Next())),
+                Dt = DateTime.FromOADate(rng.Next(657435, 2958465)),
+                Dz = DateTimeOffset.FromUnixTimeSeconds((long)(rng.NextDouble() * (1 + rng.Next()))),
+                Ug = new RowId((ulong)rng.Next(), (ulong)rng.Next()),
+                Features = GetRandomFloatArray(rng, _numFeatures),
+            };
+        }
+
+        /// <summary>Build a block of <paramref name="length"/> random printable ASCII characters.</summary>
+        private static ReadOnlyMemory<char> GetRandomCharSpan(Random rng, int length = 10)
+        {
+            var chars = new char[length];
+            for (int i = 0; i < length; i++)
+                chars[i] = (char)(32 + rng.Next(0, 94)); // Space (32) to '}' (125); the upper bound of Next is exclusive.
+            return new ReadOnlyMemory<char>(chars);
+        }
+
+        /// <summary>Build an array of <paramref name="length"/> random floats drawn from <see cref="Random.NextDouble"/>.</summary>
+        private static float[] GetRandomFloatArray(Random rng, int length)
+        {
+            var floatArray = new float[length];
+            for (int i = 0; i < length; i++)
+                floatArray[i] = (float)rng.NextDouble();
+            return floatArray;
+        }
+    }
+}
diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/FileBasedSavingOfData.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/FileBasedSavingOfData.cs
deleted file mode 100644
index 012b769c84..0000000000
--- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/FileBasedSavingOfData.cs
+++ /dev/null
@@ -1,49 +0,0 @@
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-// See the LICENSE file in the project root for more information.
-
-using System.IO;
-using Microsoft.ML.Data;
-using Microsoft.ML.Data.IO;
-using Microsoft.ML.RunTests;
-using Microsoft.ML.Trainers;
-using Xunit;
-
-namespace Microsoft.ML.Tests.Scenarios.Api
-{
- public partial class ApiScenariosTests
- {
- ///
- /// File-based saving of data: Come up with transform pipeline. Transform training and
- /// test data, and save the featurized data to some file, using the .idv format.
- /// Train and evaluate multiple models over that pre-featurized data. (Useful for
- /// sweeping scenarios, where you are training many times on the same data,
- /// and don't necessarily want to transform it every single time.)
- ///
- [Fact]
- void FileBasedSavingOfData()
- {
-
- var ml = new MLContext(seed: 1, conc: 1);
- var src = new MultiFileSource(GetDataPath(TestDatasets.Sentiment.trainFilename));
- var trainData = ml.Data.CreateTextLoader(TestDatasets.Sentiment.GetLoaderColumns(), hasHeader: true)
- .Append(ml.Transforms.Text.FeaturizeText("Features", "SentimentText"))
- .Fit(src).Read(src);
-
- var path = DeleteOutputPath("i.idv");
- using (var file = File.Create(path))
- {
- var saver = new BinarySaver(ml, new BinarySaver.Arguments());
- using (var ch = ((IHostEnvironment)ml).Start("SaveData"))
- DataSaverUtils.SaveDataView(ch, saver, trainData, file);
- }
-
- var trainer = ml.BinaryClassification.Trainers.StochasticDualCoordinateAscentNonCalibrated(
- new SdcaNonCalibratedBinaryTrainer.Options { NumThreads = 1 });
- var loadedTrainData = new BinaryLoader(ml, new BinaryLoader.Arguments(), new MultiFileSource(path));
-
- // Train.
- var model = trainer.Fit(loadedTrainData);
- }
- }
-}