Skip to content

Internalize IDataLoader #2309

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jan 31, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/Microsoft.ML.Data/Commands/DataCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,8 @@ private IDataLoader LoadTransformChain(IDataLoader srcData)
}
}

public static class LoaderUtils
[BestFriend]
internal static class LoaderUtils
{
/// <summary>
/// Saves <paramref name="loader"/> to the specified <paramref name="file"/>.
Expand Down
14 changes: 9 additions & 5 deletions src/Microsoft.ML.Data/Data/IDataLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@ namespace Microsoft.ML.Data
{
/// <summary>
/// An interface for exposing some number of items that can be opened for reading.
/// </summary>
/// REVIEW: Reconcile this with the functionality exposed by IHostEnvironment. For example,
/// we could simply replace this with an array of IFileHandle.
/// </summary>

public interface IMultiStreamSource
{
/// <summary>
Expand All @@ -32,26 +33,29 @@ public interface IMultiStreamSource

/// <summary>
/// Opens the indicated item and returns a text stream reader on it.
/// REVIEW: Consider making this an extension method.
/// </summary>
/// REVIEW: Consider making this an extension method.
TextReader OpenTextReader(int index);
}

/// <summary>
/// Signature for creating an <see cref="IDataLoader"/>.
/// </summary>
public delegate void SignatureDataLoader(IMultiStreamSource data);
[BestFriend]
internal delegate void SignatureDataLoader(IMultiStreamSource data);

/// <summary>
/// Signature for loading an <see cref="IDataLoader"/>.
/// </summary>
public delegate void SignatureLoadDataLoader(ModelLoadContext ctx, IMultiStreamSource data);
[BestFriend]
internal delegate void SignatureLoadDataLoader(ModelLoadContext ctx, IMultiStreamSource data);

/// <summary>
/// Interface for a data loader. An <see cref="IDataLoader"/> can save its model information
/// and is instantiatable from arguments and an <see cref="IMultiStreamSource"/> .
/// </summary>
public interface IDataLoader : IDataView, ICanSaveModel
[BestFriend]
internal interface IDataLoader : IDataView, ICanSaveModel
{
}

Expand Down
3 changes: 2 additions & 1 deletion src/Microsoft.ML.Data/DataLoadSave/Binary/BinaryLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@

namespace Microsoft.ML.Data.IO
{
public sealed class BinaryLoader : IDataLoader, IDisposable
[BestFriend]
internal sealed class BinaryLoader : IDataLoader, IDisposable
{
public sealed class Arguments
{
Expand Down
3 changes: 2 additions & 1 deletion src/Microsoft.ML.Data/DataLoadSave/CompositeDataLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ namespace Microsoft.ML.Data
/// The family of <c>Create</c> methods only instantiate <see cref="CompositeDataLoader"/>'s
/// when there are transforms to keep, otherwise they just return underlying loaders.
/// </summary>
public sealed class CompositeDataLoader : IDataLoader, ITransposeDataView
[BestFriend]
internal sealed class CompositeDataLoader : IDataLoader, ITransposeDataView
{
public sealed class Arguments
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ namespace Microsoft.ML.Data.IO
/// the master sub-IDV stores the overall schema, and optionally the data in row-wise format.
/// </summary>
/// <seealso cref="TransposeSaver"/>
public sealed class TransposeLoader : IDataLoader, ITransposeDataView
[BestFriend]
internal sealed class TransposeLoader : IDataLoader, ITransposeDataView
{
public sealed class Arguments
{
Expand Down
1 change: 1 addition & 0 deletions src/Microsoft.ML.Data/DataView/ZipBinding.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ namespace Microsoft.ML.Data
/// A convenience class for concatenating several schemas together.
/// This would be necessary when combining IDataViews through any type of combining operation, for example, zip.
/// </summary>
[BestFriend]
internal sealed class ZipBinding
{
private readonly Schema[] _sources;
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Data/Transforms/ValueMapping.cs
Original file line number Diff line number Diff line change
Expand Up @@ -362,7 +362,7 @@ internal bool TryUnparse(StringBuilder sb)
}
}

public sealed class Arguments
internal sealed class Arguments
Copy link
Contributor

@artidoro artidoro Jan 30, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

internal [](start = 8, length = 8)

For consistency, I would either make this public and only make specific field internal, or make the Arguments internal in the other transforms too. #Resolved

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In other places we use Arguments classes as EntryPoint arguments. I don't want to get into business of cleaning entrypoints in this PR.
For this transform we don't have entrypoint, so I prefer to just hide it completely.


In reply to: 252398379 [](ancestors = 252398379)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok sounds good!


In reply to: 252399795 [](ancestors = 252399795,252398379)

{
[Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "New column definition(s) (optional form: name:src)", ShortName = "col", SortOrder = 1)]
public Column[] Column;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,8 @@ public abstract class ArgumentsBase : TransformInputBase
public string DataFile;

[Argument(ArgumentType.Multiple, HelpText = "Data loader", NullName = "<Auto>", SortOrder = 111, Visibility = ArgumentAttribute.VisibilityType.CmdLineOnly, SignatureType = typeof(SignatureDataLoader))]
public IComponentFactory<IMultiStreamSource, IDataLoader> Loader;
[BestFriend]
internal IComponentFactory<IMultiStreamSource, IDataLoader> Loader;

[Argument(ArgumentType.AtMostOnce, HelpText = "Name of the text column containing the terms", ShortName = "termCol", SortOrder = 112, Visibility = ArgumentAttribute.VisibilityType.CmdLineOnly)]
public string TermsColumn;
Expand Down
6 changes: 4 additions & 2 deletions src/Microsoft.ML.Data/Utilities/ComponentCreation.cs
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,8 @@ public static IDataView LoadTransforms(this IHostEnvironment env, Stream modelSt
/// <summary>
/// Creates a data loader from the arguments object.
/// </summary>
public static IDataLoader CreateLoader<TArgs>(this IHostEnvironment env, TArgs arguments, IMultiStreamSource files)
[BestFriend]
internal static IDataLoader CreateLoader<TArgs>(this IHostEnvironment env, TArgs arguments, IMultiStreamSource files)
Copy link
Contributor

@TomFinley TomFinley Jan 30, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CreateLoader [](start = 36, length = 12)

Thanks Ivan. I wouldn't necessarily do this now, but this entire assembly seems like a lot of stuff we do not want public. But not as part of this PR. #Resolved

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's hard to have internal interface IDataLoader and keep this method public.
Compiler complains on return type is less accessible than method.


In reply to: 252375310 [](ancestors = 252375310)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh what I meant was the whole class. But like I said not part of this PR.

where TArgs : class, new()
{
Contracts.CheckValue(env, nameof(env));
Expand All @@ -137,7 +138,8 @@ public static IDataLoader CreateLoader<TArgs>(this IHostEnvironment env, TArgs a
/// <summary>
/// Creates a data loader from the 'LoadName{settings}' string.
/// </summary>
public static IDataLoader CreateLoader(this IHostEnvironment env, string settings, IMultiStreamSource files)
[BestFriend]
internal static IDataLoader CreateLoader(this IHostEnvironment env, string settings, IMultiStreamSource files)
{
Contracts.CheckValue(env, nameof(env));
Contracts.CheckValue(files, nameof(files));
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Parquet/Microsoft.ML.Parquet.csproj
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<Project Sdk="Microsoft.NET.Sdk">
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>netstandard2.0</TargetFramework>
Expand Down
3 changes: 2 additions & 1 deletion src/Microsoft.ML.Parquet/ParquetLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ namespace Microsoft.ML.Data
/// <summary>
/// Loads a parquet file into an IDataView. Supports basic mapping from Parquet input column data types to framework data types.
/// </summary>
public sealed class ParquetLoader : IDataLoader, IDisposable
[BestFriend]
internal sealed class ParquetLoader : IDataLoader, IDisposable
{
/// <summary>
/// A Column is a singular representation that consolidates all the related column chunks in the
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ namespace Microsoft.ML.Data
/// data1.parquet
/// data1.parquet
/// </example>
public sealed class PartitionedFileLoader : IDataLoader
[BestFriend]
internal sealed class PartitionedFileLoader : IDataLoader
{
internal const string Summary = "Loads a horizontally partitioned file set.";
internal const string UserName = "Partitioned Loader";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,14 @@ namespace Microsoft.ML.Data
/// <summary>
/// Delegate signature for a partitioned path parser.
/// </summary>
public delegate void PartitionedPathParser();
[BestFriend]
internal delegate void PartitionedPathParser();

/// <summary>
/// Supports extracting column names and values from a path string.
/// </summary>
public interface IPartitionedPathParser
[BestFriend]
internal interface IPartitionedPathParser
{
/// <summary>
/// Extract the column definitions from a file path.
Expand All @@ -58,12 +60,13 @@ public interface IPartitionedPathParser
}

[TlcModule.ComponentKind("PartitionedPathParser")]
public interface IPartitionedPathParserFactory : IComponentFactory<IPartitionedPathParser>
[BestFriend]
internal interface IPartitionedPathParserFactory : IComponentFactory<IPartitionedPathParser>
{
new IPartitionedPathParser CreateComponent(IHostEnvironment env);
}

public sealed class SimplePartitionedPathParser : IPartitionedPathParser, ICanSaveModel
internal sealed class SimplePartitionedPathParser : IPartitionedPathParser, ICanSaveModel
{
internal const string Summary = "A simple parser that extracts directory names as column values. Column names are defined as arguments.";
internal const string UserName = "Simple Partitioned Path Parser";
Expand Down Expand Up @@ -193,12 +196,13 @@ public IEnumerable<string> ParseValues(string path)

[TlcModule.Component(Name = ParquetPartitionedPathParser.LoadName, FriendlyName = ParquetPartitionedPathParser.UserName,
Desc = ParquetPartitionedPathParser.Summary, Alias = ParquetPartitionedPathParser.ShortName)]
public class ParquetPartitionedPathParserFactory : IPartitionedPathParserFactory
internal class ParquetPartitionedPathParserFactory : IPartitionedPathParserFactory
{
public IPartitionedPathParser CreateComponent(IHostEnvironment env) => new ParquetPartitionedPathParser();
}

public sealed class ParquetPartitionedPathParser : IPartitionedPathParser, ICanSaveModel
[BestFriend]
internal sealed class ParquetPartitionedPathParser : IPartitionedPathParser, ICanSaveModel
{
internal const string Summary = "Extract name/value pairs from Parquet formatted directory names. Example path: Year=2018/Month=12/data1.parquet";
internal const string UserName = "Parquet Partitioned Path Parser";
Expand Down Expand Up @@ -276,7 +280,6 @@ public void Save(ModelSaveContext ctx)
ctx.SaveString(sb.ToString());
};
}

public IEnumerable<PartitionedFileLoader.Column> ParseColumns(string path)
{
if (!TryParseNames(path, out List<string> names))
Expand Down
10 changes: 10 additions & 0 deletions src/Microsoft.ML.Parquet/Properties/AssemblyInfo.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Runtime.CompilerServices;
using Microsoft.ML;

[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.TestFramework" + PublicKey.TestValue)]
[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Core.Tests" + PublicKey.TestValue)]
[assembly: WantsToBeBestFriends]
Original file line number Diff line number Diff line change
Expand Up @@ -636,7 +636,7 @@ public abstract class ArgumentsBase
public string DataFile;

[Argument(ArgumentType.Multiple, HelpText = "Data loader", NullName = "<Auto>", SortOrder = 3, Visibility = ArgumentAttribute.VisibilityType.CmdLineOnly, SignatureType = typeof(SignatureDataLoader))]
public IComponentFactory<IMultiStreamSource, IDataLoader> Loader;
internal IComponentFactory<IMultiStreamSource, IDataLoader> Loader;

[Argument(ArgumentType.AtMostOnce, HelpText = "Name of the text column containing the stopwords", ShortName = "stopwordsCol", SortOrder = 4, Visibility = ArgumentAttribute.VisibilityType.CmdLineOnly)]
public string StopwordsColumn;
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Transforms/Text/WordBagTransform.cs
Original file line number Diff line number Diff line change
Expand Up @@ -425,7 +425,7 @@ public sealed class TermLoaderArguments
public string DataFile;

[Argument(ArgumentType.Multiple, HelpText = "Data loader", NullName = "<Auto>", SortOrder = 3, Visibility = ArgumentAttribute.VisibilityType.CmdLineOnly, SignatureType = typeof(SignatureDataLoader))]
public IComponentFactory<IMultiStreamSource, IDataLoader> Loader;
internal IComponentFactory<IMultiStreamSource, IDataLoader> Loader;

[Argument(ArgumentType.AtMostOnce, HelpText = "Name of the text column containing the terms", ShortName = "termCol", SortOrder = 4, Visibility = ArgumentAttribute.VisibilityType.CmdLineOnly)]
public string TermsColumn;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
<ProjectReference Include="..\..\src\Microsoft.ML.KMeansClustering\Microsoft.ML.KMeansClustering.csproj" />
<ProjectReference Include="..\..\src\Microsoft.ML.LightGBM\Microsoft.ML.LightGBM.csproj" />
<ProjectReference Include="..\..\src\Microsoft.ML.Onnx\Microsoft.ML.Onnx.csproj" />
<ProjectReference Include="..\..\src\Microsoft.ML.Parquet\Microsoft.ML.Parquet.csproj" />
<ProjectReference Include="..\..\src\Microsoft.ML.PCA\Microsoft.ML.PCA.csproj" />
<ProjectReference Include="..\..\src\Microsoft.ML.StandardLearners\Microsoft.ML.StandardLearners.csproj" />
<ProjectReference Include="..\..\src\Microsoft.ML.TensorFlow\Microsoft.ML.TensorFlow.csproj" />
Expand Down
2 changes: 1 addition & 1 deletion test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
using Microsoft.ML.Learners;
using Microsoft.ML.LightGBM;
using Microsoft.ML.Model.Onnx;
using Microsoft.ML.TimeSeries;
using Microsoft.ML.TimeSeriesProcessing;
using Microsoft.ML.Trainers;
using Microsoft.ML.Trainers.FastTree;
Expand Down Expand Up @@ -354,6 +353,7 @@ public void EntryPointCatalogCheckDuplicateParams()
Env.ComponentCatalog.RegisterAssembly(typeof(SymSgdClassificationTrainer).Assembly);
Env.ComponentCatalog.RegisterAssembly(typeof(SaveOnnxCommand).Assembly);
Env.ComponentCatalog.RegisterAssembly(typeof(TimeSeriesProcessingEntryPoints).Assembly);
Env.ComponentCatalog.RegisterAssembly(typeof(ParquetLoader).Assembly);

var catalog = Env.ComponentCatalog;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,20 @@
// See the LICENSE file in the project root for more information.

using System.IO;
using Microsoft.ML.RunTests;
using Microsoft.ML.Data;
using Xunit;
using Xunit.Abstractions;

namespace Microsoft.ML.Tests
namespace Microsoft.ML.RunTests
{
public class PartitionedFileLoaderTests : TestDataPipeBase
{
protected override void Initialize()
{
base.Initialize();
Env.ComponentCatalog.RegisterAssembly(typeof(ParquetLoader).Assembly);
}

public PartitionedFileLoaderTests(ITestOutputHelper output)
: base(output)
{
Expand Down
32 changes: 4 additions & 28 deletions test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ private void CheckSameSchemaShape(SchemaShape promised, SchemaShape delivered)
/// * pathData defaults to breast-cancer.txt.
/// * actLoader is invoked for extra validation (if non-null).
/// </summary>
protected IDataLoader TestCore(string pathData, bool keepHidden, string[] argsPipe,
internal IDataLoader TestCore(string pathData, bool keepHidden, string[] argsPipe,
Action<IDataLoader> actLoader = null, string suffix = "", string suffixBase = null, bool checkBaseline = true,
bool forceDense = false, bool logCurs = false, bool roundTripText = true,
bool checkTranspose = false, bool checkId = true, bool baselineSchema = true, int digitsOfPrecision = DigitsOfPrecision)
Expand Down Expand Up @@ -301,7 +301,7 @@ protected IDataLoader TestCore(string pathData, bool keepHidden, string[] argsPi
return pipe1;
}

protected IDataLoader CreatePipeDataLoader(IHostEnvironment env, string pathData, string[] argsPipe, out MultiFileSource files)
private IDataLoader CreatePipeDataLoader(IHostEnvironment env, string pathData, string[] argsPipe, out MultiFileSource files)
{
VerifyArgParsing(env, argsPipe);

Expand All @@ -318,30 +318,6 @@ protected IDataLoader CreatePipeDataLoader(IHostEnvironment env, string pathData
return pipe;
}

/// <summary>
/// Apply pipe's transforms and optionally ChooseColumns transform to newView,
/// and test if pipe and newPipe have the same schema and values.
/// </summary>
protected void TestApplyTransformsToData(IHostEnvironment env, IDataLoader pipe, IDataView newView, string chooseArgs = null)
{
Contracts.AssertValue(pipe);
Contracts.AssertValue(newView);

IDataView view = pipe;
newView = ApplyTransformUtils.ApplyAllTransformsToData(env, view, newView);
if (!string.IsNullOrWhiteSpace(chooseArgs))
{
var component = new SubComponent<IDataTransform, SignatureDataTransform>("Choose", chooseArgs);
view = component.CreateInstance(env, view);
newView = component.CreateInstance(env, newView);
}

if (!CheckSameSchemas(view.Schema, newView.Schema))
Failed();
else if (!CheckSameValues(view, newView))
Failed();
}

protected void VerifyArgParsing(IHostEnvironment env, string[] strs)
{
string str = CmdParser.CombineSettings(strs);
Expand Down Expand Up @@ -448,7 +424,7 @@ protected bool SaveLoadText(IDataView view, IHostEnvironment env,
return true;
}

protected string SavePipe(IDataLoader pipe, string suffix = "", string dir = "Pipeline")
protected private string SavePipe(IDataLoader pipe, string suffix = "", string dir = "Pipeline")
{
string name = TestName + suffix + ".zip";
string pathModel = DeleteOutputPath("SavePipe", name);
Expand All @@ -471,7 +447,7 @@ protected string[] Concat(string[] args1, string[] args2)
return res;
}

protected IDataLoader LoadPipe(string pathModel, IHostEnvironment env, IMultiStreamSource files)
private IDataLoader LoadPipe(string pathModel, IHostEnvironment env, IMultiStreamSource files)
{
using (var file = Env.OpenInputFile(pathModel))
using (var strm = file.OpenReadStream())
Expand Down
1 change: 1 addition & 0 deletions test/Microsoft.ML.TestFramework/Properties/AssemblyInfo.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@

[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Predictor.Tests" + PublicKey.TestValue)]
[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.TimeSeries.Tests" + PublicKey.TestValue)]
[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Tests" + PublicKey.TestValue)]
4 changes: 1 addition & 3 deletions test/Microsoft.ML.Tests/Transformers/PcaTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,14 @@ namespace Microsoft.ML.Tests.Transformers
{
public sealed class PcaTests : TestDataPipeBase
{
private readonly IHostEnvironment _env;
private readonly string _dataSource;
private readonly TextSaver _saver;

public PcaTests(ITestOutputHelper helper)
: base(helper)
{
_env = new MLContext(seed: 1);
_dataSource = GetDataPath("generated_regression_dataset.csv");
_saver = new TextSaver(_env, new TextSaver.Arguments { Silent = true, OutputHeader = false });
_saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true, OutputHeader = false });
}

[Fact]
Expand Down