-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Change Default Settings in TextLoader #2630
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
7034a09
e954b86
39beedd
1cabfff
a537d53
fdd08cf
efe8019
e880f3b
7ce8a5d
ddf3a10
36829ab
dd29269
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,15 +16,19 @@ public static class TextLoaderSaverCatalog | |
/// </summary> | ||
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param> | ||
/// <param name="columns">Array of columns <see cref="TextLoader.Column"/> defining the schema.</param> | ||
/// <param name="hasHeader">Whether the file has a header.</param> | ||
/// <param name="separatorChar">The character used as separator between data points in a row. By default the tab character is used as separator.</param> | ||
/// <param name="hasHeader">Whether the file has a header.</param> | ||
/// <param name="allowSparse">Whether the file can contain numerical vectors in sparse format.</param> | ||
/// <param name="allowQuoting">Whether the file can contain column defined by a quoted string.</param> | ||
/// <param name="dataSample">The optional location of a data sample. The sample can be used to infer column names and number of slots in each column.</param> | ||
public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, | ||
TextLoader.Column[] columns, | ||
bool hasHeader = TextLoader.Defaults.HasHeader, | ||
char separatorChar = TextLoader.Defaults.Separator, | ||
bool hasHeader = TextLoader.Defaults.HasHeader, | ||
bool allowSparse = TextLoader.Defaults.AllowSparse, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should these go before There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I reordered those arguments based on their usage frequencies (in my mind). In general, the more ML.NET-specific an argument is, the later it appears. In reply to: 259418984 [](ancestors = 259418984) |
||
bool allowQuoting = TextLoader.Defaults.AllowQuoting, | ||
IMultiStreamSource dataSample = null) | ||
=> new TextLoader(CatalogUtils.GetEnvironment(catalog), columns, hasHeader, separatorChar, dataSample); | ||
=> new TextLoader(CatalogUtils.GetEnvironment(catalog), columns, separatorChar, hasHeader, allowSparse, allowQuoting, dataSample); | ||
|
||
/// <summary> | ||
/// Create a text loader <see cref="TextLoader"/>. | ||
|
@@ -41,24 +45,24 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, | |
/// Create a text loader <see cref="TextLoader"/> by inferencing the dataset schema from a data model type. | ||
/// </summary> | ||
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param> | ||
/// <param name="hasHeader">Does the file contains header?</param> | ||
/// <param name="separatorChar">Column separator character. Default is '\t'</param> | ||
/// <param name="allowQuotedStrings">Whether the input may include quoted values, | ||
/// <param name="hasHeader">Does the file contains header?</param> | ||
/// <param name="allowQuoting">Whether the input may include quoted values, | ||
/// which can contain separator characters, colons, | ||
/// and distinguish empty values from missing values. When true, consecutive separators | ||
/// denote a missing value and an empty value is denoted by \"\". | ||
/// When false, consecutive separators denote an empty value.</param> | ||
/// <param name="supportSparse">Whether the input may include sparse representations for example, | ||
/// <param name="allowSparse">Whether the input may include sparse representations for example, | ||
/// if one of the row contains "5 2:6 4:3" that's mean there are 5 columns all zero | ||
/// except for 3rd and 5th columns which have values 6 and 3</param> | ||
/// <param name="trimWhitespace">Remove trailing whitespace from lines</param> | ||
public static TextLoader CreateTextLoader<TInput>(this DataOperationsCatalog catalog, | ||
bool hasHeader = TextLoader.Defaults.HasHeader, | ||
char separatorChar = TextLoader.Defaults.Separator, | ||
bool allowQuotedStrings = TextLoader.Defaults.AllowQuoting, | ||
bool supportSparse = TextLoader.Defaults.AllowSparse, | ||
bool hasHeader = TextLoader.Defaults.HasHeader, | ||
bool allowQuoting = TextLoader.Defaults.AllowQuoting, | ||
bool allowSparse = TextLoader.Defaults.AllowSparse, | ||
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace) | ||
=> TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuotedStrings, supportSparse, trimWhitespace); | ||
=> TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting, allowSparse, trimWhitespace); | ||
|
||
/// <summary> | ||
/// Read a data view from a text file using <see cref="TextLoader"/>. | ||
|
@@ -72,16 +76,16 @@ public static TextLoader CreateTextLoader<TInput>(this DataOperationsCatalog cat | |
public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog, | ||
string path, | ||
TextLoader.Column[] columns, | ||
bool hasHeader = TextLoader.Defaults.HasHeader, | ||
char separatorChar = TextLoader.Defaults.Separator) | ||
char separatorChar = TextLoader.Defaults.Separator, | ||
bool hasHeader = TextLoader.Defaults.HasHeader) | ||
{ | ||
Contracts.CheckNonEmpty(path, nameof(path)); | ||
|
||
var env = catalog.GetEnvironment(); | ||
|
||
// REVIEW: it is almost always a mistake to have a 'trainable' text loader here. | ||
// Therefore, we are going to disallow data sample. | ||
var reader = new TextLoader(env, columns, hasHeader, separatorChar, dataSample: null); | ||
var reader = new TextLoader(env, columns, separatorChar, hasHeader, dataSample: null); | ||
return reader.Read(new MultiFileSource(path)); | ||
} | ||
|
||
|
@@ -91,30 +95,30 @@ public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog, | |
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param> | ||
/// <param name="hasHeader">Does the file contains header?</param> | ||
/// <param name="separatorChar">Column separator character. Default is '\t'</param> | ||
/// <param name="allowQuotedStrings">Whether the input may include quoted values, | ||
/// <param name="allowQuoting">Whether the input may include quoted values, | ||
/// which can contain separator characters, colons, | ||
/// and distinguish empty values from missing values. When true, consecutive separators | ||
/// denote a missing value and an empty value is denoted by \"\". | ||
/// When false, consecutive separators denote an empty value.</param> | ||
/// <param name="supportSparse">Whether the input may include sparse representations for example, | ||
/// <param name="allowSparse">Whether the input may include sparse representations for example, | ||
/// if one of the row contains "5 2:6 4:3" that's mean there are 5 columns all zero | ||
/// except for 3rd and 5th columns which have values 6 and 3</param> | ||
/// <param name="trimWhitespace">Remove trailing whitespace from lines</param> | ||
/// <param name="path">The path to the file.</param> | ||
/// <returns>The data view.</returns> | ||
public static IDataView ReadFromTextFile<TInput>(this DataOperationsCatalog catalog, | ||
string path, | ||
bool hasHeader = TextLoader.Defaults.HasHeader, | ||
char separatorChar = TextLoader.Defaults.Separator, | ||
bool allowQuotedStrings = TextLoader.Defaults.AllowQuoting, | ||
bool supportSparse = TextLoader.Defaults.AllowSparse, | ||
bool hasHeader = TextLoader.Defaults.HasHeader, | ||
bool allowQuoting = TextLoader.Defaults.AllowQuoting, | ||
bool allowSparse = TextLoader.Defaults.AllowSparse, | ||
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace) | ||
{ | ||
Contracts.CheckNonEmpty(path, nameof(path)); | ||
|
||
// REVIEW: it is almost always a mistake to have a 'trainable' text loader here. | ||
// Therefore, we are going to disallow data sample. | ||
return TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuotedStrings, supportSparse, trimWhitespace) | ||
return TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting, allowSparse, trimWhitespace) | ||
.Read(new MultiFileSource(path)); | ||
} | ||
|
||
|
@@ -144,20 +148,22 @@ public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog, str | |
/// <param name="headerRow">Whether to write the header row.</param> | ||
/// <param name="schema">Whether to write the header comment with the schema.</param> | ||
/// <param name="keepHidden">Whether to keep hidden columns in the dataset.</param> | ||
/// <param name="forceDense">Whether to save columns in dense format even if they are sparse vectors.</param> | ||
public static void SaveAsText(this DataOperationsCatalog catalog, | ||
IDataView data, | ||
Stream stream, | ||
char separatorChar = TextLoader.Defaults.Separator, | ||
bool headerRow = TextLoader.Defaults.HasHeader, | ||
bool schema = true, | ||
bool keepHidden = false) | ||
char separatorChar = TextSaver.Defaults.Separator, | ||
bool headerRow = TextSaver.Defaults.OutputHeader, | ||
bool schema = TextSaver.Defaults.OutputSchema, | ||
bool keepHidden = false, | ||
bool forceDense = TextSaver.Defaults.ForceDense) | ||
{ | ||
Contracts.CheckValue(catalog, nameof(catalog)); | ||
Contracts.CheckValue(data, nameof(data)); | ||
Contracts.CheckValue(stream, nameof(stream)); | ||
|
||
var env = catalog.GetEnvironment(); | ||
var saver = new TextSaver(env, new TextSaver.Arguments { Separator = separatorChar.ToString(), OutputHeader = headerRow, OutputSchema = schema }); | ||
var saver = new TextSaver(env, new TextSaver.Arguments { Dense = forceDense, Separator = separatorChar.ToString(), OutputHeader = headerRow, OutputSchema = schema }); | ||
|
||
using (var ch = env.Start("Saving data")) | ||
DataSaverUtils.SaveDataView(ch, saver, data, stream, keepHidden); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,25 +22,33 @@ namespace Microsoft.ML.Data.IO | |
[BestFriend] | ||
internal sealed class TextSaver : IDataSaver | ||
{ | ||
internal static class Defaults | ||
{ | ||
internal const char Separator = '\t'; | ||
internal const bool ForceDense = false; | ||
internal const bool OutputSchema = true; | ||
internal const bool OutputHeader = true; | ||
} | ||
|
||
// REVIEW: consider saving a command line in a separate file. | ||
public sealed class Arguments | ||
{ | ||
[Argument(ArgumentType.AtMostOnce, HelpText = "Separator", ShortName = "sep")] | ||
public string Separator = "tab"; | ||
public string Separator = Defaults.Separator.ToString(); | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that what we had before was correct, I think that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
[Argument(ArgumentType.AtMostOnce, HelpText = "Force dense format", ShortName = "dense")] | ||
public bool Dense; | ||
public bool Dense = Defaults.ForceDense; | ||
|
||
// REVIEW: This and the corresponding BinarySaver option should be removed, | ||
// with the silence being handled, somehow, at the environment level. (Task 6158846.) | ||
[Argument(ArgumentType.LastOccurenceWins, HelpText = "Suppress any info output (not warnings or errors)", Hide = true)] | ||
public bool Silent; | ||
|
||
[Argument(ArgumentType.AtMostOnce, HelpText = "Output the comment containing the loader settings", ShortName = "schema")] | ||
public bool OutputSchema = true; | ||
public bool OutputSchema = Defaults.OutputSchema; | ||
|
||
[Argument(ArgumentType.AtMostOnce, HelpText = "Output the header", ShortName = "header")] | ||
public bool OutputHeader = true; | ||
public bool OutputHeader = Defaults.OutputHeader; | ||
} | ||
|
||
internal const string Summary = "Writes data into a text file."; | ||
|
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`ReadFromTextFile` calls these options `allowQuotedStrings` and `supportSparse`. We should be consistent in the names everywhere. #Resolved
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok. I also checked other `quote` and `sparse` in this file.
In reply to: 259422639 [](ancestors = 259422639)