Skip to content

Add Functional Tests for Data I/O #2518

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Feb 14, 2019
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
158 changes: 155 additions & 3 deletions test/Microsoft.ML.Functional.Tests/Common.cs
Original file line number Diff line number Diff line change
@@ -2,19 +2,171 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.Data.DataView;
using Microsoft.ML.Data;
using Microsoft.ML.SamplesUtils;
using Microsoft.ML.Trainers.HalLearners;
using Microsoft.ML.Functional.Tests.Datasets;
using Xunit;

namespace Microsoft.ML.Functional.Tests
{
internal static class Common
{
/// <summary>
/// Assert that the schema of an <see cref="IDataView"/> matches the properties of <see cref="TypeTestData"/>.
/// </summary>
/// <remarks>
/// Every public property of <see cref="TypeTestData"/> must appear exactly once as a column with a matching
/// raw type. Array properties are expected to appear as <see cref="VBuffer{T}"/> of the array element type.
/// </remarks>
/// <param name="testTypeDataset">An <see cref="IDataView"/>.</param>
public static void AssertTypeTestDataset(IDataView testTypeDataset)
{
    var toyClassProperties = typeof(TypeTestData).GetProperties();

    // Check that the schema is of the right size.
    Assert.Equal(toyClassProperties.Length, testTypeDataset.Schema.Count);

    // Create a lookup table for the types and counts of all properties.
    var types = new Dictionary<string, Type>();
    var counts = new Dictionary<string, int>();
    foreach (var property in toyClassProperties)
    {
        if (!property.PropertyType.IsArray)
        {
            types[property.Name] = property.PropertyType;
        }
        else
        {
            // Construct the closed VBuffer<T> type for the array's element type.
            // Only the Type object is needed here; no element instance has to be created.
            var elementType = property.PropertyType.GetElementType();
            types[property.Name] = typeof(VBuffer<>).MakeGenericType(elementType);
        }

        counts[property.Name] = 0;
    }

    foreach (var column in testTypeDataset.Schema)
    {
        // The column must correspond to a known property...
        Type expectedType;
        Assert.True(types.TryGetValue(column.Name, out expectedType));
        // ...must not have been seen before...
        Assert.Equal(1, ++counts[column.Name]);
        // ...and must carry the expected raw type.
        Assert.Equal(expectedType, column.Type.RawType);
    }

    // Make sure we didn't miss any columns.
    foreach (var value in counts.Values)
    {
        Assert.Equal(1, value);
    }
}

/// <summary>
/// Assert that two <see cref="TypeTestData"/> datasets have the same schema and the same rows.
/// </summary>
/// <param name="mlContext">The ML Context.</param>
/// <param name="data1">An <see cref="IDataView"/> of <see cref="TypeTestData"/>.</param>
/// <param name="data2">An <see cref="IDataView"/> of <see cref="TypeTestData"/>.</param>
public static void AssertTestTypeDatasetsAreEqual(MLContext mlContext, IDataView data1, IDataView data2)
{
    // Each view must individually have the TypeTestData row shape.
    AssertTypeTestDataset(data1);
    AssertTypeTestDataset(data2);

    // The two schemas must agree before the rows are compared.
    AssertEqual(data1.Schema, data2.Schema);

    // Expose both views as TypeTestData rows and compare them row by row.
    AssertEqual(
        mlContext.CreateEnumerable<TypeTestData>(data1, true),
        mlContext.CreateEnumerable<TypeTestData>(data2, true));
}

/// <summary>
/// Assert that two float arrays are equal.
/// </summary>
/// <param name="array1">An array of floats.</param>
/// <param name="array2">An array of floats.</param>
public static void AssertEqual(float[] array1, float[] array2)
Copy link
Contributor

@glebuk glebuk Feb 13, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AssertEqual [](start = 27, length = 11)

Perhaps Enumerable.SequenceEqual(target1, target2); instead? from Linq
https://stackoverflow.com/questions/3232744/easiest-way-to-compare-arrays-in-c-sharp #Closed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to use the assert logic here so that we can capture equality to some level of precision. Default equality will fail after serialization due to floating point drift sometimes.


In reply to: 256208383 [](ancestors = 256208383)

Copy link
Contributor Author

@rogancarr rogancarr Feb 13, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Plus I want to Assert and not check for equality.


In reply to: 256209571 [](ancestors = 256209571,256208383)

{
    // Both arrays must exist and have the same number of elements.
    Assert.NotNull(array1);
    Assert.NotNull(array2);
    Assert.Equal(array1.Length, array2.Length);

    // Compare the arrays element by element, in index order.
    var index = 0;
    foreach (var expected in array1)
    {
        Assert.Equal(expected, array2[index]);
        index++;
    }
}

/// <summary>
/// Assert that two <see cref="Schema"/> objects are equal.
/// </summary>
/// <param name="schema1">A <see cref="Schema"/> object.</param>
/// <param name="schema2">A <see cref="Schema"/> object.</param>
public static void AssertEqual(Schema schema1, Schema schema2)
Copy link
Contributor

@glebuk glebuk Feb 13, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Schema [](start = 39, length = 6)

Should Schema implement IEquatable?
https://stackoverflow.com/questions/8400028/comparing-two-instances-of-a-class #Closed

Copy link
Contributor Author

@rogancarr rogancarr Feb 13, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That would be awesome, but probably not before 1.0.


In reply to: 256209052 [](ancestors = 256209052)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 on implementing IEquatable. Should have an issue about it.


In reply to: 256209052 [](ancestors = 256209052)

{
Assert.NotNull(schema1);
Assert.NotNull(schema2);
Copy link
Contributor

@glebuk glebuk Feb 13, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

technically if they are both null they are equal... #Closed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Technically yes. But internally in our library we consider nulls to not equal each other. :shruggy-emoticon:


In reply to: 256209640 [](ancestors = 256209640)


// Both schemas must expose the same number of columns before they are compared pairwise.
Assert.Equal(schema1.Count, schema2.Count);

foreach (var schemaPair in schema1.Zip(schema2, Tuple.Create))
{
    Assert.Equal(schemaPair.Item1.Name, schemaPair.Item2.Name);
    Assert.Equal(schemaPair.Item1.Index, schemaPair.Item2.Index);
    Assert.Equal(schemaPair.Item1.IsHidden, schemaPair.Item2.IsHidden);
    // Can probably do a better comparison of Metadata.
    // Compare the first column's metadata schema against the SECOND column's metadata schema;
    // comparing a column's metadata with itself would always pass.
    AssertEqual(schemaPair.Item1.Metadata.Schema, schemaPair.Item2.Metadata.Schema);
    // Types match when they are equal under == or when their raw types are equal.
    Assert.True((schemaPair.Item1.Type == schemaPair.Item2.Type) ||
        (schemaPair.Item1.Type.RawType == schemaPair.Item2.Type.RawType));
}
}

/// <summary>
/// Assert that two sequences of <see cref="TypeTestData"/> contain equal rows in the same order.
/// </summary>
/// <param name="data1">An enumerable of <see cref="TypeTestData"/>.</param>
/// <param name="data2">An enumerable of <see cref="TypeTestData"/>.</param>
public static void AssertEqual(IEnumerable<TypeTestData> data1, IEnumerable<TypeTestData> data2)
{
    Assert.NotNull(data1);
    Assert.NotNull(data2);

    // Counting enumerates each sequence once; the pairwise walk below enumerates them again.
    Assert.Equal(data1.Count(), data2.Count());

    // Walk both sequences in lock-step, comparing each pair as it is produced.
    // NOTE(review): callers in this file pass enumerables created with the reuse flag set to true,
    // so rows are presumably only valid until the next MoveNext; do not buffer them. TODO confirm.
    using (var left = data1.GetEnumerator())
    using (var right = data2.GetEnumerator())
    {
        while (left.MoveNext() && right.MoveNext())
        {
            AssertEqual(left.Current, right.Current);
        }
    }
}

/// <summary>
/// Assert that two <see cref="TypeTestData"/> rows are equal, property by property.
/// </summary>
/// <param name="testType1">A <see cref="TypeTestData"/>.</param>
/// <param name="testType2">A <see cref="TypeTestData"/>.</param>
public static void AssertEqual(TypeTestData testType1, TypeTestData testType2)
{
    // Label and feature vector.
    Assert.Equal(testType1.Label, testType2.Label);
    AssertEqual(testType1.Features, testType2.Features);

    // Signed and unsigned integers, from 1 to 8 bytes wide.
    Assert.Equal(testType1.I1, testType2.I1);
    Assert.Equal(testType1.U1, testType2.U1);
    Assert.Equal(testType1.I2, testType2.I2);
    Assert.Equal(testType1.U2, testType2.U2);
    Assert.Equal(testType1.I4, testType2.I4);
    Assert.Equal(testType1.U4, testType2.U4);
    Assert.Equal(testType1.I8, testType2.I8);
    Assert.Equal(testType1.U8, testType2.U8);

    // Floating point values.
    Assert.Equal(testType1.R4, testType2.R4);
    Assert.Equal(testType1.R8, testType2.R8);

    // Text is compared through its string form.
    var text1 = testType1.Tx.ToString();
    var text2 = testType2.Tx.ToString();
    Assert.Equal(text1, text2);

    // Time, date and row-id values are compared with their own Equals implementations.
    var sameTimeSpan = testType1.Ts.Equals(testType2.Ts);
    Assert.True(sameTimeSpan);
    var sameDateTime = testType1.Dt.Equals(testType2.Dt);
    Assert.True(sameDateTime);
    var sameDateTimeOffset = testType1.Dz.Equals(testType2.Dz);
    Assert.True(sameDateTimeOffset);
    var sameRowId = testType1.Ug.Equals(testType2.Ug);
    Assert.True(sameRowId);
}

/// <summary>
/// Check that a <see cref="RegressionMetrics"/> object is valid.
/// </summary>
/// <param name="metrics">The metrics object.</param>
public static void CheckMetrics(RegressionMetrics metrics)
{
// Perform sanity checks on the metrics
// Perform sanity checks on the metrics.
Assert.True(metrics.Rms >= 0);
Assert.True(metrics.L1 >= 0);
Assert.True(metrics.L2 >= 0);
139 changes: 139 additions & 0 deletions test/Microsoft.ML.Functional.Tests/DataIO.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.IO;
using Microsoft.Data.DataView;
using Microsoft.ML.Data;
using Microsoft.ML.Functional.Tests.Datasets;
using Microsoft.ML.TestFramework;
using Xunit;
using Xunit.Abstractions;

namespace Microsoft.ML.Functional.Tests
{
/// <summary>
/// Functional tests for the data input and output formats.
/// </summary>
public class DataIO : BaseTestClass
{
    // Separators to test. SaveAsText expects a "space, tab, comma, semicolon, or bar".
    private readonly char[] _separators = { ' ', '\t', ',', ';', '|' };

    public DataIO(ITestOutputHelper output)
        : base(output)
    {
    }

    /// <summary>
    /// Read from Enumerable: In-memory objects can be read as enumerables into an IDataView.
    /// </summary>
    [Fact]
    public void ReadFromIEnumerable()
    {
        var context = new MLContext(seed: 1, conc: 1);

        // Wrap the generated rows in an IDataView.
        var view = context.Data.ReadFromEnumerable(TypeTestData.GenerateDataset());

        Common.AssertTypeTestDataset(view);
    }

    /// <summary>
    /// Export to Enumerable: IDataViews can be exported as enumerables of a class.
    /// </summary>
    [Fact]
    public void ExportToIEnumerable()
    {
        var context = new MLContext(seed: 1, conc: 1);

        // Wrap the generated rows in an IDataView.
        var original = TypeTestData.GenerateDataset();
        var view = context.Data.ReadFromEnumerable(original);

        // Turn the IDataView back into an enumerable of rows.
        var roundTripped = context.CreateEnumerable<TypeTestData>(view, true);

        Common.AssertEqual(original, roundTripped);
    }

    /// <summary>
    /// Write to and read from a delimited file: Any DataKind can be written to and read from a delimited file.
    /// </summary>
    /// <remarks>
    /// Tests the roundtrip through a file using explicit schematization.
    /// </remarks>
    [Fact]
    public void WriteToAndReadFromADelimetedFile()
    {
        var context = new MLContext(seed: 1, conc: 1);

        var dataBefore = context.Data.ReadFromEnumerable(TypeTestData.GenerateDataset());

        for (var i = 0; i < _separators.Length; i++)
        {
            var separator = _separators[i];

            // Write the dataset to a file, then read it back with an explicitly declared loader.
            var path = SerializeDatasetToFile(context, dataBefore, separator);
            var loader = TypeTestData.GetTextLoader(context, separator);
            var dataAfter = loader.Read(path);

            Common.AssertTestTypeDatasetsAreEqual(context, dataBefore, dataAfter);
        }
    }

    /// <summary>
    /// Write to and read from a delimited file: Schematized data of any DataKind can be read from a delimited file.
    /// </summary>
    /// <remarks>
    /// Tests the roundtrip through a file using schema inference.
    /// </remarks>
    [Fact]
    public void WriteToAndReadASchemaFromADelimitedFile()
    {
        var context = new MLContext(seed: 1, conc: 1);

        var dataBefore = context.Data.ReadFromEnumerable(TypeTestData.GenerateDataset());

        for (var i = 0; i < _separators.Length; i++)
        {
            var separator = _separators[i];

            // Write the dataset to a file, then read it back using the TypeTestData class as the schema.
            var path = SerializeDatasetToFile(context, dataBefore, separator);
            var dataAfter = context.Data.ReadFromTextFile<TypeTestData>(path, hasHeader: true, separatorChar: separator);

            Common.AssertTestTypeDatasetsAreEqual(context, dataBefore, dataAfter);
        }
    }

    /// <summary>
    /// Write to and read from a binary file: a dataset saved in binary form can be read back unchanged.
    /// </summary>
    [Fact]
    public void WriteAndReadAFromABinaryFile()
    {
        var context = new MLContext(seed: 1, conc: 1);

        var dataBefore = context.Data.ReadFromEnumerable(TypeTestData.GenerateDataset());

        // Write the dataset to a binary file, then read it back.
        var path = SerializeDatasetToBinaryFile(context, dataBefore);
        var dataAfter = context.Data.ReadFromBinary(path);

        Common.AssertTestTypeDatasetsAreEqual(context, dataBefore, dataAfter);
    }

    // Save the data as delimited text, with a header row, to a randomly named file and return its path.
    private string SerializeDatasetToFile(MLContext mlContext, IDataView data, char separator)
    {
        var randomName = Path.GetRandomFileName();
        var destination = GetOutputPath(randomName);
        using (var stream = File.Create(destination))
        {
            mlContext.Data.SaveAsText(data, stream, separatorChar: separator, headerRow: true);
        }

        return destination;
    }

    // Save the data in binary form to a randomly named file and return its path.
    private string SerializeDatasetToBinaryFile(MLContext mlContext, IDataView data)
    {
        var randomName = Path.GetRandomFileName();
        var destination = GetOutputPath(randomName);
        using (var stream = File.Create(destination))
        {
            mlContext.Data.SaveAsBinary(data, stream);
        }

        return destination;
    }
}
}
170 changes: 170 additions & 0 deletions test/Microsoft.ML.Functional.Tests/Datasets/TypeTestData.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using Microsoft.Data.DataView;
using Microsoft.ML.Data;

namespace Microsoft.ML.Functional.Tests.Datasets
{
/// <summary>
/// A class containing one property per <see cref="DataKind"/>.
/// </summary>
/// <remarks>
/// This class has annotations for automatic deserialization from a file, and contains helper methods
/// for reading from a file and for generating a random dataset as an IEnumerable.
/// </remarks>
internal sealed class TypeTestData
{
// Length of the Features vector; also determines how many consecutive file columns Features spans.
private const int _numFeatures = 10;

// Each property below carries a LoadColumn attribute giving the file column index (or index range)
// it is loaded from. The indices match the ones declared explicitly in GetTextLoader.

// Boolean column.
[LoadColumn(0)]
public bool Label { get; set; }

// Signed and unsigned integer columns, from 1 to 8 bytes wide.
[LoadColumn(1)]
public sbyte I1 { get; set; }

[LoadColumn(2)]
public byte U1 { get; set; }

[LoadColumn(3)]
public short I2 { get; set; }

[LoadColumn(4)]
public ushort U2 { get; set; }

[LoadColumn(5)]
public int I4 { get; set; }

[LoadColumn(6)]
public uint U4 { get; set; }

[LoadColumn(7)]
public long I8 { get; set; }

[LoadColumn(8)]
public ulong U8 { get; set; }

// Single and double precision floating point columns.
[LoadColumn(9)]
public float R4 { get; set; }

[LoadColumn(10)]
public double R8 { get; set; }

// Text column.
[LoadColumn(11)]
public ReadOnlyMemory<char> Tx { get; set; }

// Time span, date/time and date/time-with-offset columns.
[LoadColumn(12)]
public TimeSpan Ts { get; set; }

[LoadColumn(13)]
public DateTime Dt { get; set; }

[LoadColumn(14)]
public DateTimeOffset Dz { get; set; }

// Row identifier column.
[LoadColumn(15)]
public RowId Ug { get; set; }

// Fixed-size float vector loaded from columns 16 through 16 + _numFeatures - 1.
[LoadColumn(16, 16 + _numFeatures - 1), VectorType(_numFeatures)]
public float[] Features { get; set; }


/// <summary>
/// Build a <see cref="TextLoader"/> whose columns are declared explicitly for the <see cref="TypeTestData"/> dataset.
/// </summary>
/// <param name="mlContext">The ML Context.</param>
/// <param name="separator">The separator character to read with.</param>
/// <returns>A <see cref="TextLoader"/> created with a header row and the given separator.</returns>
public static TextLoader GetTextLoader(MLContext mlContext, char separator)
{
    // The feature vector occupies _numFeatures consecutive columns starting at index 16.
    const int firstFeatureColumn = 16;
    const int lastFeatureColumn = firstFeatureColumn + _numFeatures - 1;

    var columns = new[]
    {
        new TextLoader.Column("Label", DataKind.Bool, 0),
        new TextLoader.Column("I1", DataKind.I1, 1),
        new TextLoader.Column("U1", DataKind.U1, 2),
        new TextLoader.Column("I2", DataKind.I2, 3),
        new TextLoader.Column("U2", DataKind.U2, 4),
        new TextLoader.Column("I4", DataKind.I4, 5),
        new TextLoader.Column("U4", DataKind.U4, 6),
        new TextLoader.Column("I8", DataKind.I8, 7),
        new TextLoader.Column("U8", DataKind.U8, 8),
        new TextLoader.Column("R4", DataKind.R4, 9),
        new TextLoader.Column("R8", DataKind.R8, 10),
        new TextLoader.Column("Tx", DataKind.TX, 11),
        new TextLoader.Column("Ts", DataKind.TS, 12),
        new TextLoader.Column("Dt", DataKind.DT, 13),
        new TextLoader.Column("Dz", DataKind.DZ, 14),
        new TextLoader.Column("Ug", DataKind.UG, 15),
        new TextLoader.Column("Features", DataKind.R4, firstFeatureColumn, lastFeatureColumn),
    };

    return mlContext.Data.CreateTextLoader(columns, hasHeader: true, separatorChar: separator);
}

/// <summary>
/// Lazily generate a sequence of random <see cref="TypeTestData"/> rows.
/// </summary>
/// <remarks>
/// The random number generator is created from <paramref name="seed"/> each time the sequence is enumerated.
/// </remarks>
/// <param name="numExamples">The number of <see cref="TypeTestData"/> objects to make.</param>
/// <param name="seed">The random seed.</param>
/// <returns>An IEnumerable of <see cref="TypeTestData"/>.</returns>
public static IEnumerable<TypeTestData> GenerateDataset(int numExamples = 5, int seed = 1)
{
    var random = new Random(seed);
    var produced = 0;
    while (produced < numExamples)
    {
        yield return GetRandomInstance(random);
        produced++;
    }
}

/// <summary>
/// Get a random instance of <see cref="TypeTestData"/>.
/// </summary>
/// <param name="rng">A <see cref="Random"/> object.</param>
/// <returns></returns>
public static TypeTestData GetRandomInstance(Random rng)
{
if (rng == null)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if (rng == null) [](start = 12, length = 16)

We usually use Check()?

throw new ArgumentNullException(nameof(rng));

// The initializers below run in textual order and each one draws from rng, so reordering them
// changes the values produced for a given seed.
return new TypeTestData
{
    Label = rng.NextDouble() > 0.5,
    // Narrowing casts of rng.Next() keep only the low-order bits (assuming the default unchecked context).
    I1 = (sbyte)rng.Next(),
    U1 = (byte)rng.Next(),
    I2 = (short)rng.Next(),
    U2 = (ushort)rng.Next(),
    I4 = rng.Next(),
    U4 = (uint)rng.Next(),
    I8 = (long)rng.Next(),
    U8 = (ulong)rng.Next(),
    R4 = (float)rng.NextDouble(),
    R8 = rng.NextDouble(),
    Tx = GetRandomCharSpan(rng),
    Ts = TimeSpan.FromSeconds(rng.NextDouble() * (1 + rng.Next())),
    // NOTE(review): the lower bound 657435 is positive; the smallest valid OLE Automation date is
    // -657435, so a minus sign may be missing here. Confirm the intended range before changing it.
    Dt = DateTime.FromOADate(rng.Next(657435, 2958465)),
    Dz = DateTimeOffset.FromUnixTimeSeconds((long)(rng.NextDouble() * (1 + rng.Next()))),
    Ug = new RowId((ulong)rng.Next(), (ulong)rng.Next()),
    Features = GetRandomFloatArray(rng, _numFeatures),
};
}

// Build a block of `length` random characters drawn from code points 32 (space) through 125 ('}').
private static ReadOnlyMemory<char> GetRandomCharSpan(Random rng, int length = 10)
{
    const int firstCodePoint = 32; // The space character.
    const int codePointCount = 94; // Exclusive upper bound for the random offset.

    var buffer = new char[length];
    var position = 0;
    while (position < length)
    {
        buffer[position] = (char)(firstCodePoint + rng.Next(0, codePointCount));
        position++;
    }

    return new ReadOnlyMemory<char>(buffer);
}

// Build an array of `length` floats, each obtained by narrowing rng.NextDouble() to float.
private static float[] GetRandomFloatArray(Random rng, int length)
{
    var values = new float[length];
    for (var slot = 0; slot != values.Length; ++slot)
    {
        var sample = rng.NextDouble();
        values[slot] = (float)sample;
    }

    return values;
}
}
}

This file was deleted.