Skip to content

Commit ec418e4

Browse files
authored
Change Default Settings in TextLoader (#2630)
* Use AllowSparse=false as default in TextLoader * Update entry point catalog * Make quote- default * TextLoader uses TextLoader's default settings * Address comments * tab to \t * Revert a weird change * Address comments * Reorder arguments * Polish cookbook * Reorder arguments in static TextLoader * Also change argument order in F#
1 parent 412e1f9 commit ec418e4

File tree

34 files changed

+169
-142
lines changed

34 files changed

+169
-142
lines changed

docs/code/MlNetCookBook.md

+5-5
Original file line numberDiff line numberDiff line change
@@ -219,10 +219,10 @@ private class AdultData
219219

220220
// Read the data into a data view.
221221
var trainData = mlContext.Data.ReadFromTextFile<AdultData>(trainDataPath,
222-
// First line of the file is a header, not a data row.
223-
hasHeader: true,
224222
// Default separator is tab, but we need a semicolon.
225-
separatorChar: ';'
223+
separatorChar: ';',
224+
// First line of the file is a header, not a data row.
225+
hasHeader: true
226226
);
227227

228228
```
@@ -328,7 +328,7 @@ In the file above, the last column (12th) is label that we predict, and all the
328328
// First, we define the reader: specify the data columns and where to find them in the text file.
329329
// Read the data into a data view. Remember though, readers are lazy, so the actual reading will happen when the data is accessed.
330330
var trainData = mlContext.Data.ReadFromTextFile<AdultData>(dataPath,
331-
// First line of the file is a header, not a data row.
331+
// Default separator is tab, but the dataset uses a comma.
332332
separatorChar: ','
333333
);
334334

@@ -372,7 +372,7 @@ Assuming the example above was used to train the model, here's how you calculate
372372
```csharp
373373
// Read the test dataset.
374374
var testData = mlContext.Data.ReadFromTextFile<AdultData>(testDataPath,
375-
// First line of the file is a header, not a data row.
375+
// Default separator is tab, but the dataset uses a comma.
376376
separatorChar: ','
377377
);
378378
// Calculate metrics of the model on the test data.

docs/samples/Microsoft.ML.Samples/Dynamic/TensorFlow/TextClassification.cs

+2-2
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,12 @@ public static void Example()
3434

3535
// This is the dictionary to convert words into the integer indexes.
3636
var lookupMap = mlContext.Data.ReadFromTextFile(Path.Combine(modelLocation, "imdb_word_index.csv"),
37-
columns: new[]
37+
columns: new[]
3838
{
3939
new TextLoader.Column("Words", DataKind.TX, 0),
4040
new TextLoader.Column("Ids", DataKind.I4, 1),
4141
},
42-
separatorChar: ','
42+
separatorChar: ','
4343
);
4444

4545
// Load the TensorFlow model once.

src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs

+13-10
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
using Microsoft.ML.Data;
1414
using Microsoft.ML.Internal.Utilities;
1515
using Microsoft.ML.Model;
16-
using Float = System.Single;
1716

1817
[assembly: LoadableClass(TextLoader.Summary, typeof(IDataLoader), typeof(TextLoader), typeof(TextLoader.Options), typeof(SignatureDataLoader),
1918
"Text Loader", "TextLoader", "Text", DocName = "loader/TextLoader.md")]
@@ -487,8 +486,8 @@ internal bool IsValid()
487486

488487
internal static class Defaults
489488
{
490-
internal const bool AllowQuoting = true;
491-
internal const bool AllowSparse = true;
489+
internal const bool AllowQuoting = false;
490+
internal const bool AllowSparse = false;
492491
internal const char Separator = '\t';
493492
internal const bool HasHeader = false;
494493
internal const bool TrimWhitespace = false;
@@ -1065,18 +1064,22 @@ private bool HasHeader
10651064
/// </summary>
10661065
/// <param name="env">The environment to use.</param>
10671066
/// <param name="columns">Defines a mapping between input columns in the file and IDataView columns.</param>
1068-
/// <param name="hasHeader">Whether the file has a header.</param>
10691067
/// <param name="separatorChar"> The character used as separator between data points in a row. By default the tab character is used as separator.</param>
1068+
/// <param name="hasHeader">Whether the file has a header.</param>
1069+
/// <param name="allowSparse">Whether the file can contain numerical vectors in sparse format.</param>
1070+
/// <param name="allowQuoting">Whether the content of a column can be parsed from a string starting and ending with a quote.</param>
10701071
/// <param name="dataSample">Allows to expose items that can be used for reading.</param>
1071-
internal TextLoader(IHostEnvironment env, Column[] columns, bool hasHeader = false, char separatorChar = '\t', IMultiStreamSource dataSample = null)
1072-
: this(env, MakeArgs(columns, hasHeader, new[] { separatorChar }), dataSample)
1072+
internal TextLoader(IHostEnvironment env, Column[] columns, char separatorChar = Defaults.Separator,
1073+
bool hasHeader = Defaults.HasHeader, bool allowSparse = Defaults.AllowSparse,
1074+
bool allowQuoting = Defaults.AllowQuoting, IMultiStreamSource dataSample = null)
1075+
: this(env, MakeArgs(columns, hasHeader, new[] { separatorChar }, allowSparse, allowQuoting), dataSample)
10731076
{
10741077
}
10751078

1076-
private static Options MakeArgs(Column[] columns, bool hasHeader, char[] separatorChars)
1079+
private static Options MakeArgs(Column[] columns, bool hasHeader, char[] separatorChars, bool allowSparse, bool allowQuoting)
10771080
{
10781081
Contracts.AssertValue(separatorChars);
1079-
var result = new Options { Columns = columns, HasHeader = hasHeader, Separators = separatorChars};
1082+
var result = new Options { Columns = columns, HasHeader = hasHeader, Separators = separatorChars, AllowSparse = allowSparse, AllowQuoting = allowQuoting };
10801083
return result;
10811084
}
10821085

@@ -1345,7 +1348,7 @@ private TextLoader(IHost host, ModelLoadContext ctx)
13451348
// char[]: separators
13461349
// bindings
13471350
int cbFloat = ctx.Reader.ReadInt32();
1348-
host.CheckDecode(cbFloat == sizeof(Float));
1351+
host.CheckDecode(cbFloat == sizeof(float));
13491352
_maxRows = ctx.Reader.ReadInt64();
13501353
host.CheckDecode(_maxRows > 0);
13511354
_flags = (OptionFlags)ctx.Reader.ReadUInt32();
@@ -1408,7 +1411,7 @@ void ICanSaveModel.Save(ModelSaveContext ctx)
14081411
// int: number of separators
14091412
// char[]: separators
14101413
// bindings
1411-
ctx.Writer.Write(sizeof(Float));
1414+
ctx.Writer.Write(sizeof(float));
14121415
ctx.Writer.Write(_maxRows);
14131416
_host.Assert((_flags & ~OptionFlags.All) == 0);
14141417
ctx.Writer.Write((uint)_flags);

src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs

+30-24
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,19 @@ public static class TextLoaderSaverCatalog
1616
/// </summary>
1717
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
1818
/// <param name="columns">Array of columns <see cref="TextLoader.Column"/> defining the schema.</param>
19-
/// <param name="hasHeader">Whether the file has a header.</param>
2019
/// <param name="separatorChar">The character used as separator between data points in a row. By default the tab character is used as separator.</param>
20+
/// <param name="hasHeader">Whether the file has a header.</param>
21+
/// <param name="allowSparse">Whether the file can contain numerical vectors in sparse format.</param>
22+
/// <param name="allowQuoting">Whether the file can contain columns defined by a quoted string.</param>
2123
/// <param name="dataSample">The optional location of a data sample. The sample can be used to infer column names and number of slots in each column.</param>
2224
public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog,
2325
TextLoader.Column[] columns,
24-
bool hasHeader = TextLoader.Defaults.HasHeader,
2526
char separatorChar = TextLoader.Defaults.Separator,
27+
bool hasHeader = TextLoader.Defaults.HasHeader,
28+
bool allowSparse = TextLoader.Defaults.AllowSparse,
29+
bool allowQuoting = TextLoader.Defaults.AllowQuoting,
2630
IMultiStreamSource dataSample = null)
27-
=> new TextLoader(CatalogUtils.GetEnvironment(catalog), columns, hasHeader, separatorChar, dataSample);
31+
=> new TextLoader(CatalogUtils.GetEnvironment(catalog), columns, separatorChar, hasHeader, allowSparse, allowQuoting, dataSample);
2832

2933
/// <summary>
3034
/// Create a text loader <see cref="TextLoader"/>.
@@ -41,24 +45,24 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog,
4145
/// Create a text loader <see cref="TextLoader"/> by inferencing the dataset schema from a data model type.
4246
/// </summary>
4347
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
44-
/// <param name="hasHeader">Does the file contains header?</param>
4548
/// <param name="separatorChar">Column separator character. Default is '\t'</param>
46-
/// <param name="allowQuotedStrings">Whether the input may include quoted values,
49+
/// <param name="hasHeader">Does the file contain a header?</param>
50+
/// <param name="allowQuoting">Whether the input may include quoted values,
4751
/// which can contain separator characters, colons,
4852
/// and distinguish empty values from missing values. When true, consecutive separators
4953
/// denote a missing value and an empty value is denoted by \"\".
5054
/// When false, consecutive separators denote an empty value.</param>
51-
/// <param name="supportSparse">Whether the input may include sparse representations for example,
55+
/// <param name="allowSparse">Whether the input may include sparse representations for example,
5256
/// if one of the rows contains "5 2:6 4:3" that means there are 5 columns all zero
5357
/// except for 3rd and 5th columns which have values 6 and 3</param>
5458
/// <param name="trimWhitespace">Remove trailing whitespace from lines</param>
5559
public static TextLoader CreateTextLoader<TInput>(this DataOperationsCatalog catalog,
56-
bool hasHeader = TextLoader.Defaults.HasHeader,
5760
char separatorChar = TextLoader.Defaults.Separator,
58-
bool allowQuotedStrings = TextLoader.Defaults.AllowQuoting,
59-
bool supportSparse = TextLoader.Defaults.AllowSparse,
61+
bool hasHeader = TextLoader.Defaults.HasHeader,
62+
bool allowQuoting = TextLoader.Defaults.AllowQuoting,
63+
bool allowSparse = TextLoader.Defaults.AllowSparse,
6064
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace)
61-
=> TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuotedStrings, supportSparse, trimWhitespace);
65+
=> TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting, allowSparse, trimWhitespace);
6266

6367
/// <summary>
6468
/// Read a data view from a text file using <see cref="TextLoader"/>.
@@ -72,16 +76,16 @@ public static TextLoader CreateTextLoader<TInput>(this DataOperationsCatalog cat
7276
public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog,
7377
string path,
7478
TextLoader.Column[] columns,
75-
bool hasHeader = TextLoader.Defaults.HasHeader,
76-
char separatorChar = TextLoader.Defaults.Separator)
79+
char separatorChar = TextLoader.Defaults.Separator,
80+
bool hasHeader = TextLoader.Defaults.HasHeader)
7781
{
7882
Contracts.CheckNonEmpty(path, nameof(path));
7983

8084
var env = catalog.GetEnvironment();
8185

8286
// REVIEW: it is almost always a mistake to have a 'trainable' text loader here.
8387
// Therefore, we are going to disallow data sample.
84-
var reader = new TextLoader(env, columns, hasHeader, separatorChar, dataSample: null);
88+
var reader = new TextLoader(env, columns, separatorChar, hasHeader, dataSample: null);
8589
return reader.Read(new MultiFileSource(path));
8690
}
8791

@@ -91,30 +95,30 @@ public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog,
9195
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
9296
/// <param name="hasHeader">Does the file contain a header?</param>
9397
/// <param name="separatorChar">Column separator character. Default is '\t'</param>
94-
/// <param name="allowQuotedStrings">Whether the input may include quoted values,
98+
/// <param name="allowQuoting">Whether the input may include quoted values,
9599
/// which can contain separator characters, colons,
96100
/// and distinguish empty values from missing values. When true, consecutive separators
97101
/// denote a missing value and an empty value is denoted by \"\".
98102
/// When false, consecutive separators denote an empty value.</param>
99-
/// <param name="supportSparse">Whether the input may include sparse representations for example,
103+
/// <param name="allowSparse">Whether the input may include sparse representations for example,
100104
/// if one of the rows contains "5 2:6 4:3" that means there are 5 columns all zero
101105
/// except for 3rd and 5th columns which have values 6 and 3</param>
102106
/// <param name="trimWhitespace">Remove trailing whitespace from lines</param>
103107
/// <param name="path">The path to the file.</param>
104108
/// <returns>The data view.</returns>
105109
public static IDataView ReadFromTextFile<TInput>(this DataOperationsCatalog catalog,
106110
string path,
107-
bool hasHeader = TextLoader.Defaults.HasHeader,
108111
char separatorChar = TextLoader.Defaults.Separator,
109-
bool allowQuotedStrings = TextLoader.Defaults.AllowQuoting,
110-
bool supportSparse = TextLoader.Defaults.AllowSparse,
112+
bool hasHeader = TextLoader.Defaults.HasHeader,
113+
bool allowQuoting = TextLoader.Defaults.AllowQuoting,
114+
bool allowSparse = TextLoader.Defaults.AllowSparse,
111115
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace)
112116
{
113117
Contracts.CheckNonEmpty(path, nameof(path));
114118

115119
// REVIEW: it is almost always a mistake to have a 'trainable' text loader here.
116120
// Therefore, we are going to disallow data sample.
117-
return TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuotedStrings, supportSparse, trimWhitespace)
121+
return TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting, allowSparse, trimWhitespace)
118122
.Read(new MultiFileSource(path));
119123
}
120124

@@ -144,20 +148,22 @@ public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog, str
144148
/// <param name="headerRow">Whether to write the header row.</param>
145149
/// <param name="schema">Whether to write the header comment with the schema.</param>
146150
/// <param name="keepHidden">Whether to keep hidden columns in the dataset.</param>
151+
/// <param name="forceDense">Whether to save columns in dense format even if they are sparse vectors.</param>
147152
public static void SaveAsText(this DataOperationsCatalog catalog,
148153
IDataView data,
149154
Stream stream,
150-
char separatorChar = TextLoader.Defaults.Separator,
151-
bool headerRow = TextLoader.Defaults.HasHeader,
152-
bool schema = true,
153-
bool keepHidden = false)
155+
char separatorChar = TextSaver.Defaults.Separator,
156+
bool headerRow = TextSaver.Defaults.OutputHeader,
157+
bool schema = TextSaver.Defaults.OutputSchema,
158+
bool keepHidden = false,
159+
bool forceDense = TextSaver.Defaults.ForceDense)
154160
{
155161
Contracts.CheckValue(catalog, nameof(catalog));
156162
Contracts.CheckValue(data, nameof(data));
157163
Contracts.CheckValue(stream, nameof(stream));
158164

159165
var env = catalog.GetEnvironment();
160-
var saver = new TextSaver(env, new TextSaver.Arguments { Separator = separatorChar.ToString(), OutputHeader = headerRow, OutputSchema = schema });
166+
var saver = new TextSaver(env, new TextSaver.Arguments { Dense = forceDense, Separator = separatorChar.ToString(), OutputHeader = headerRow, OutputSchema = schema });
161167

162168
using (var ch = env.Start("Saving data"))
163169
DataSaverUtils.SaveDataView(ch, saver, data, stream, keepHidden);

src/Microsoft.ML.Data/DataLoadSave/Text/TextSaver.cs

+12-4
Original file line numberDiff line numberDiff line change
@@ -22,25 +22,33 @@ namespace Microsoft.ML.Data.IO
2222
[BestFriend]
2323
internal sealed class TextSaver : IDataSaver
2424
{
25+
internal static class Defaults
26+
{
27+
internal const char Separator = '\t';
28+
internal const bool ForceDense = false;
29+
internal const bool OutputSchema = true;
30+
internal const bool OutputHeader = true;
31+
}
32+
2533
// REVIEW: consider saving a command line in a separate file.
2634
public sealed class Arguments
2735
{
2836
[Argument(ArgumentType.AtMostOnce, HelpText = "Separator", ShortName = "sep")]
29-
public string Separator = "tab";
37+
public string Separator = Defaults.Separator.ToString();
3038

3139
[Argument(ArgumentType.AtMostOnce, HelpText = "Force dense format", ShortName = "dense")]
32-
public bool Dense;
40+
public bool Dense = Defaults.ForceDense;
3341

3442
// REVIEW: This and the corresponding BinarySaver option should be removed,
3543
// with the silence being handled, somehow, at the environment level. (Task 6158846.)
3644
[Argument(ArgumentType.LastOccurenceWins, HelpText = "Suppress any info output (not warnings or errors)", Hide = true)]
3745
public bool Silent;
3846

3947
[Argument(ArgumentType.AtMostOnce, HelpText = "Output the comment containing the loader settings", ShortName = "schema")]
40-
public bool OutputSchema = true;
48+
public bool OutputSchema = Defaults.OutputSchema;
4149

4250
[Argument(ArgumentType.AtMostOnce, HelpText = "Output the header", ShortName = "header")]
43-
public bool OutputHeader = true;
51+
public bool OutputHeader = Defaults.OutputHeader;
4452
}
4553

4654
internal const string Summary = "Writes data into a text file.";

src/Microsoft.ML.StaticPipe/DataLoadSaveOperationsExtensions.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,6 @@ public static DataReader<IMultiStreamSource, TShape> CreateTextReader<[IsShape]
3636
this DataOperationsCatalog catalog, Func<Context, TShape> func, IMultiStreamSource files = null,
3737
bool hasHeader = false, char separator = '\t', bool allowQuoting = true, bool allowSparse = true,
3838
bool trimWhitspace = false)
39-
=> CreateReader(catalog.Environment, func, files, hasHeader, separator, allowQuoting, allowSparse, trimWhitspace);
39+
=> CreateReader(catalog.Environment, func, files, separator, hasHeader, allowQuoting, allowSparse, trimWhitspace);
4040
}
4141
}

0 commit comments

Comments
 (0)