Skip to content

Change Default Settings in TextLoader #2630

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Feb 22, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions docs/code/MlNetCookBook.md
Original file line number Diff line number Diff line change
Expand Up @@ -219,10 +219,10 @@ private class AdultData

// Read the data into a data view.
var trainData = mlContext.Data.ReadFromTextFile<AdultData>(trainDataPath,
// First line of the file is a header, not a data row.
hasHeader: true,
// Default separator is tab, but we need a semicolon.
separatorChar: ';'
separatorChar: ';',
// First line of the file is a header, not a data row.
hasHeader: true
);

```
Expand Down Expand Up @@ -328,7 +328,7 @@ In the file above, the last column (12th) is label that we predict, and all the
// First, we define the reader: specify the data columns and where to find them in the text file.
// Read the data into a data view. Remember though, readers are lazy, so the actual reading will happen when the data is accessed.
var trainData = mlContext.Data.ReadFromTextFile<AdultData>(dataPath,
// First line of the file is a header, not a data row.
// Default separator is tab, but the dataset has comma.
separatorChar: ','
);

Expand Down Expand Up @@ -372,7 +372,7 @@ Assuming the example above was used to train the model, here's how you calculate
```csharp
// Read the test dataset.
var testData = mlContext.Data.ReadFromTextFile<AdultData>(testDataPath,
// First line of the file is a header, not a data row.
// Default separator is tab, but the dataset has comma.
separatorChar: ','
);
// Calculate metrics of the model on the test data.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,12 @@ public static void Example()

// This is the dictionary to convert words into the integer indexes.
var lookupMap = mlContext.Data.ReadFromTextFile(Path.Combine(modelLocation, "imdb_word_index.csv"),
columns: new[]
columns: new[]
{
new TextLoader.Column("Words", DataKind.TX, 0),
new TextLoader.Column("Ids", DataKind.I4, 1),
},
separatorChar: ','
separatorChar: ','
);

// Load the TensorFlow model once.
Expand Down
23 changes: 13 additions & 10 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
using Microsoft.ML.Data;
using Microsoft.ML.Internal.Utilities;
using Microsoft.ML.Model;
using Float = System.Single;

[assembly: LoadableClass(TextLoader.Summary, typeof(IDataLoader), typeof(TextLoader), typeof(TextLoader.Options), typeof(SignatureDataLoader),
"Text Loader", "TextLoader", "Text", DocName = "loader/TextLoader.md")]
Expand Down Expand Up @@ -487,8 +486,8 @@ internal bool IsValid()

internal static class Defaults
{
internal const bool AllowQuoting = true;
internal const bool AllowSparse = true;
internal const bool AllowQuoting = false;
internal const bool AllowSparse = false;
internal const char Separator = '\t';
internal const bool HasHeader = false;
internal const bool TrimWhitespace = false;
Expand Down Expand Up @@ -1065,18 +1064,22 @@ private bool HasHeader
/// </summary>
/// <param name="env">The environment to use.</param>
/// <param name="columns">Defines a mapping between input columns in the file and IDataView columns.</param>
/// <param name="hasHeader">Whether the file has a header.</param>
/// <param name="separatorChar"> The character used as separator between data points in a row. By default the tab character is used as separator.</param>
/// <param name="hasHeader">Whether the file has a header.</param>
/// <param name="allowSparse">Whether the file can contain numerical vectors in sparse format.</param>
/// <param name="allowQuoting">Whether the content of a column can be parsed from a string starting and ending with quote.</param>
/// <param name="dataSample">Allows to expose items that can be used for reading.</param>
internal TextLoader(IHostEnvironment env, Column[] columns, bool hasHeader = false, char separatorChar = '\t', IMultiStreamSource dataSample = null)
: this(env, MakeArgs(columns, hasHeader, new[] { separatorChar }), dataSample)
internal TextLoader(IHostEnvironment env, Column[] columns, char separatorChar = Defaults.Separator,
bool hasHeader = Defaults.HasHeader, bool allowSparse = Defaults.AllowSparse,
bool allowQuoting = Defaults.AllowQuoting, IMultiStreamSource dataSample = null)
: this(env, MakeArgs(columns, hasHeader, new[] { separatorChar }, allowSparse, allowQuoting), dataSample)
{
}

private static Options MakeArgs(Column[] columns, bool hasHeader, char[] separatorChars)
private static Options MakeArgs(Column[] columns, bool hasHeader, char[] separatorChars, bool allowSparse, bool allowQuoting)
{
Contracts.AssertValue(separatorChars);
var result = new Options { Columns = columns, HasHeader = hasHeader, Separators = separatorChars};
var result = new Options { Columns = columns, HasHeader = hasHeader, Separators = separatorChars, AllowSparse = allowSparse, AllowQuoting = allowQuoting };
return result;
}

Expand Down Expand Up @@ -1345,7 +1348,7 @@ private TextLoader(IHost host, ModelLoadContext ctx)
// char[]: separators
// bindings
int cbFloat = ctx.Reader.ReadInt32();
host.CheckDecode(cbFloat == sizeof(Float));
host.CheckDecode(cbFloat == sizeof(float));
_maxRows = ctx.Reader.ReadInt64();
host.CheckDecode(_maxRows > 0);
_flags = (OptionFlags)ctx.Reader.ReadUInt32();
Expand Down Expand Up @@ -1408,7 +1411,7 @@ void ICanSaveModel.Save(ModelSaveContext ctx)
// int: number of separators
// char[]: separators
// bindings
ctx.Writer.Write(sizeof(Float));
ctx.Writer.Write(sizeof(float));
ctx.Writer.Write(_maxRows);
_host.Assert((_flags & ~OptionFlags.All) == 0);
ctx.Writer.Write((uint)_flags);
Expand Down
54 changes: 30 additions & 24 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,19 @@ public static class TextLoaderSaverCatalog
/// </summary>
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
/// <param name="columns">Array of columns <see cref="TextLoader.Column"/> defining the schema.</param>
/// <param name="hasHeader">Whether the file has a header.</param>
/// <param name="separatorChar">The character used as separator between data points in a row. By default the tab character is used as separator.</param>
/// <param name="hasHeader">Whether the file has a header.</param>
/// <param name="allowSparse">Whether the file can contain numerical vectors in sparse format.</param>
/// <param name="allowQuoting">Whether the file can contain column defined by a quoted string.</param>
Copy link
Member

@eerhardt eerhardt Feb 22, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ReadFromTextFile calls these options:

            bool allowQuotedStrings = TextLoader.Defaults.AllowQuoting,
            bool supportSparse = TextLoader.Defaults.AllowSparse,

We should be consistent in the names everywhere. #Resolved

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok. I also checked the other quoting- and sparse-related parameter names in this file.


In reply to: 259422639 [](ancestors = 259422639)

/// <param name="dataSample">The optional location of a data sample. The sample can be used to infer column names and number of slots in each column.</param>
public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog,
TextLoader.Column[] columns,
bool hasHeader = TextLoader.Defaults.HasHeader,
char separatorChar = TextLoader.Defaults.Separator,
bool hasHeader = TextLoader.Defaults.HasHeader,
bool allowSparse = TextLoader.Defaults.AllowSparse,
Copy link
Member

@eerhardt eerhardt Feb 22, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should these go before dataSample? #Resolved

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I reordered those arguments based on how frequently I expect them to be used. In general, the more ML.NET-specific an argument is, the later it appears.


In reply to: 259418984 [](ancestors = 259418984)

bool allowQuoting = TextLoader.Defaults.AllowQuoting,
IMultiStreamSource dataSample = null)
=> new TextLoader(CatalogUtils.GetEnvironment(catalog), columns, hasHeader, separatorChar, dataSample);
=> new TextLoader(CatalogUtils.GetEnvironment(catalog), columns, separatorChar, hasHeader, allowSparse, allowQuoting, dataSample);

/// <summary>
/// Create a text loader <see cref="TextLoader"/>.
Expand All @@ -41,24 +45,24 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog,
/// Create a text loader <see cref="TextLoader"/> by inferencing the dataset schema from a data model type.
/// </summary>
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
/// <param name="hasHeader">Does the file contains header?</param>
/// <param name="separatorChar">Column separator character. Default is '\t'</param>
/// <param name="allowQuotedStrings">Whether the input may include quoted values,
/// <param name="hasHeader">Does the file contains header?</param>
/// <param name="allowQuoting">Whether the input may include quoted values,
/// which can contain separator characters, colons,
/// and distinguish empty values from missing values. When true, consecutive separators
/// denote a missing value and an empty value is denoted by \"\".
/// When false, consecutive separators denote an empty value.</param>
/// <param name="supportSparse">Whether the input may include sparse representations for example,
/// <param name="allowSparse">Whether the input may include sparse representations for example,
/// if one of the row contains "5 2:6 4:3" that's mean there are 5 columns all zero
/// except for 3rd and 5th columns which have values 6 and 3</param>
/// <param name="trimWhitespace">Remove trailing whitespace from lines</param>
public static TextLoader CreateTextLoader<TInput>(this DataOperationsCatalog catalog,
bool hasHeader = TextLoader.Defaults.HasHeader,
char separatorChar = TextLoader.Defaults.Separator,
bool allowQuotedStrings = TextLoader.Defaults.AllowQuoting,
bool supportSparse = TextLoader.Defaults.AllowSparse,
bool hasHeader = TextLoader.Defaults.HasHeader,
bool allowQuoting = TextLoader.Defaults.AllowQuoting,
bool allowSparse = TextLoader.Defaults.AllowSparse,
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace)
=> TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuotedStrings, supportSparse, trimWhitespace);
=> TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting, allowSparse, trimWhitespace);

/// <summary>
/// Read a data view from a text file using <see cref="TextLoader"/>.
Expand All @@ -72,16 +76,16 @@ public static TextLoader CreateTextLoader<TInput>(this DataOperationsCatalog cat
public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog,
string path,
TextLoader.Column[] columns,
bool hasHeader = TextLoader.Defaults.HasHeader,
char separatorChar = TextLoader.Defaults.Separator)
char separatorChar = TextLoader.Defaults.Separator,
bool hasHeader = TextLoader.Defaults.HasHeader)
{
Contracts.CheckNonEmpty(path, nameof(path));

var env = catalog.GetEnvironment();

// REVIEW: it is almost always a mistake to have a 'trainable' text loader here.
// Therefore, we are going to disallow data sample.
var reader = new TextLoader(env, columns, hasHeader, separatorChar, dataSample: null);
var reader = new TextLoader(env, columns, separatorChar, hasHeader, dataSample: null);
return reader.Read(new MultiFileSource(path));
}

Expand All @@ -91,30 +95,30 @@ public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog,
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
/// <param name="hasHeader">Does the file contains header?</param>
/// <param name="separatorChar">Column separator character. Default is '\t'</param>
/// <param name="allowQuotedStrings">Whether the input may include quoted values,
/// <param name="allowQuoting">Whether the input may include quoted values,
/// which can contain separator characters, colons,
/// and distinguish empty values from missing values. When true, consecutive separators
/// denote a missing value and an empty value is denoted by \"\".
/// When false, consecutive separators denote an empty value.</param>
/// <param name="supportSparse">Whether the input may include sparse representations for example,
/// <param name="allowSparse">Whether the input may include sparse representations for example,
/// if one of the row contains "5 2:6 4:3" that's mean there are 5 columns all zero
/// except for 3rd and 5th columns which have values 6 and 3</param>
/// <param name="trimWhitespace">Remove trailing whitespace from lines</param>
/// <param name="path">The path to the file.</param>
/// <returns>The data view.</returns>
public static IDataView ReadFromTextFile<TInput>(this DataOperationsCatalog catalog,
string path,
bool hasHeader = TextLoader.Defaults.HasHeader,
char separatorChar = TextLoader.Defaults.Separator,
bool allowQuotedStrings = TextLoader.Defaults.AllowQuoting,
bool supportSparse = TextLoader.Defaults.AllowSparse,
bool hasHeader = TextLoader.Defaults.HasHeader,
bool allowQuoting = TextLoader.Defaults.AllowQuoting,
bool allowSparse = TextLoader.Defaults.AllowSparse,
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace)
{
Contracts.CheckNonEmpty(path, nameof(path));

// REVIEW: it is almost always a mistake to have a 'trainable' text loader here.
// Therefore, we are going to disallow data sample.
return TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuotedStrings, supportSparse, trimWhitespace)
return TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting, allowSparse, trimWhitespace)
.Read(new MultiFileSource(path));
}

Expand Down Expand Up @@ -144,20 +148,22 @@ public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog, str
/// <param name="headerRow">Whether to write the header row.</param>
/// <param name="schema">Whether to write the header comment with the schema.</param>
/// <param name="keepHidden">Whether to keep hidden columns in the dataset.</param>
/// <param name="forceDense">Whether to save columns in dense format even if they are sparse vectors.</param>
public static void SaveAsText(this DataOperationsCatalog catalog,
IDataView data,
Stream stream,
char separatorChar = TextLoader.Defaults.Separator,
bool headerRow = TextLoader.Defaults.HasHeader,
bool schema = true,
bool keepHidden = false)
char separatorChar = TextSaver.Defaults.Separator,
bool headerRow = TextSaver.Defaults.OutputHeader,
bool schema = TextSaver.Defaults.OutputSchema,
bool keepHidden = false,
bool forceDense = TextSaver.Defaults.ForceDense)
{
Contracts.CheckValue(catalog, nameof(catalog));
Contracts.CheckValue(data, nameof(data));
Contracts.CheckValue(stream, nameof(stream));

var env = catalog.GetEnvironment();
var saver = new TextSaver(env, new TextSaver.Arguments { Separator = separatorChar.ToString(), OutputHeader = headerRow, OutputSchema = schema });
var saver = new TextSaver(env, new TextSaver.Arguments { Dense = forceDense, Separator = separatorChar.ToString(), OutputHeader = headerRow, OutputSchema = schema });

using (var ch = env.Start("Saving data"))
DataSaverUtils.SaveDataView(ch, saver, data, stream, keepHidden);
Expand Down
16 changes: 12 additions & 4 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextSaver.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,25 +22,33 @@ namespace Microsoft.ML.Data.IO
[BestFriend]
internal sealed class TextSaver : IDataSaver
{
/// <summary>
/// Default values for the text saver's options. These constants initialize the
/// corresponding fields of <see cref="Arguments"/> and serve as the default
/// parameter values of the SaveAsText catalog extension.
/// </summary>
internal static class Defaults
{
// Column separator character: a tab.
internal const char Separator = '\t';
// Default for the "Force dense format" option (Arguments.Dense): off.
internal const bool ForceDense = false;
// Default for writing the comment that contains the loader settings (Arguments.OutputSchema): on.
internal const bool OutputSchema = true;
// Default for writing the header (Arguments.OutputHeader): on.
internal const bool OutputHeader = true;
}

// REVIEW: consider saving a command line in a separate file.
public sealed class Arguments
{
[Argument(ArgumentType.AtMostOnce, HelpText = "Separator", ShortName = "sep")]
public string Separator = "tab";
public string Separator = Defaults.Separator.ToString();

Copy link
Contributor

@artidoro artidoro Feb 20, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think what we had before was correct: '\t'.ToString() simply yields a tab character as a string, whereas the value here was meant to be a human-readable description of the separator. We have since moved to separator chars in TextLoader and similar APIs, where the character itself is used to define the separator. Here, I believe we have to keep it as it was: "tab". #Resolved

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok.


In reply to: 258714604 [](ancestors = 258714604)

[Argument(ArgumentType.AtMostOnce, HelpText = "Force dense format", ShortName = "dense")]
public bool Dense;
public bool Dense = Defaults.ForceDense;

// REVIEW: This and the corresponding BinarySaver option should be removed,
// with the silence being handled, somehow, at the environment level. (Task 6158846.)
[Argument(ArgumentType.LastOccurenceWins, HelpText = "Suppress any info output (not warnings or errors)", Hide = true)]
public bool Silent;

[Argument(ArgumentType.AtMostOnce, HelpText = "Output the comment containing the loader settings", ShortName = "schema")]
public bool OutputSchema = true;
public bool OutputSchema = Defaults.OutputSchema;

[Argument(ArgumentType.AtMostOnce, HelpText = "Output the header", ShortName = "header")]
public bool OutputHeader = true;
public bool OutputHeader = Defaults.OutputHeader;
}

internal const string Summary = "Writes data into a text file.";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,6 @@ public static DataReader<IMultiStreamSource, TShape> CreateTextReader<[IsShape]
this DataOperationsCatalog catalog, Func<Context, TShape> func, IMultiStreamSource files = null,
bool hasHeader = false, char separator = '\t', bool allowQuoting = true, bool allowSparse = true,
bool trimWhitspace = false)
=> CreateReader(catalog.Environment, func, files, hasHeader, separator, allowQuoting, allowSparse, trimWhitspace);
=> CreateReader(catalog.Environment, func, files, separator, hasHeader, allowQuoting, allowSparse, trimWhitspace);
}
}
Loading