Skip to content

Commit c90fa51

Browse files
authored
Make text loaders consistent (#2710)
* Make text loaders consistent * Address comments * Remove a redundant ctor of TextLoader * Fix tests
1 parent 559c292 commit c90fa51

File tree

13 files changed

+178
-121
lines changed

13 files changed

+178
-121
lines changed

src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs

Lines changed: 5 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1085,30 +1085,6 @@ private bool HasHeader
10851085
private readonly IHost _host;
10861086
private const string RegistrationName = "TextLoader";
10871087

1088-
/// <summary>
1089-
/// Loads a text file into an <see cref="IDataView"/>. Supports basic mapping from input columns to IDataView columns.
1090-
/// </summary>
1091-
/// <param name="env">The environment to use.</param>
1092-
/// <param name="columns">Defines a mapping between input columns in the file and IDataView columns.</param>
1093-
/// <param name="separatorChar"> The character used as separator between data points in a row. By default the tab character is used as separator.</param>
1094-
/// <param name="hasHeader">Whether the file has a header.</param>
1095-
/// <param name="allowSparse">Whether the file can contain numerical vectors in sparse format.</param>
1096-
/// <param name="allowQuoting">Whether the content of a column can be parsed from a string starting and ending with quote.</param>
1097-
/// <param name="dataSample">Allows to expose items that can be used for reading.</param>
1098-
internal TextLoader(IHostEnvironment env, Column[] columns, char separatorChar = Defaults.Separator,
1099-
bool hasHeader = Defaults.HasHeader, bool allowSparse = Defaults.AllowSparse,
1100-
bool allowQuoting = Defaults.AllowQuoting, IMultiStreamSource dataSample = null)
1101-
: this(env, MakeArgs(columns, hasHeader, new[] { separatorChar }, allowSparse, allowQuoting), dataSample)
1102-
{
1103-
}
1104-
1105-
private static Options MakeArgs(Column[] columns, bool hasHeader, char[] separatorChars, bool allowSparse, bool allowQuoting)
1106-
{
1107-
Contracts.AssertValue(separatorChars);
1108-
var result = new Options { Columns = columns, HasHeader = hasHeader, Separators = separatorChars, AllowSparse = allowSparse, AllowQuoting = allowQuoting };
1109-
return result;
1110-
}
1111-
11121088
/// <summary>
11131089
/// Loads a text file into an <see cref="IDataView"/>. Supports basic mapping from input columns to IDataView columns.
11141090
/// </summary>
@@ -1462,9 +1438,10 @@ void ICanSaveModel.Save(ModelSaveContext ctx)
14621438
internal static TextLoader CreateTextReader<TInput>(IHostEnvironment host,
14631439
bool hasHeader = Defaults.HasHeader,
14641440
char separator = Defaults.Separator,
1465-
bool allowQuotedStrings = Defaults.AllowQuoting,
1441+
bool allowQuoting = Defaults.AllowQuoting,
14661442
bool supportSparse = Defaults.AllowSparse,
1467-
bool trimWhitespace = Defaults.TrimWhitespace)
1443+
bool trimWhitespace = Defaults.TrimWhitespace,
1444+
IMultiStreamSource dataSample = null)
14681445
{
14691446
var userType = typeof(TInput);
14701447

@@ -1519,13 +1496,13 @@ internal static TextLoader CreateTextReader<TInput>(IHostEnvironment host,
15191496
{
15201497
HasHeader = hasHeader,
15211498
Separators = new[] { separator },
1522-
AllowQuoting = allowQuotedStrings,
1499+
AllowQuoting = allowQuoting,
15231500
AllowSparse = supportSparse,
15241501
TrimWhitespace = trimWhitespace,
15251502
Columns = columns.ToArray()
15261503
};
15271504

1528-
return new TextLoader(host, options);
1505+
return new TextLoader(host, options, dataSample: dataSample);
15291506
}
15301507

15311508
private sealed class BoundLoader : IDataLoader

src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs

Lines changed: 63 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,31 @@ public static class TextLoaderSaverCatalog
1818
/// <param name="columns">Array of columns <see cref="TextLoader.Column"/> defining the schema.</param>
1919
/// <param name="separatorChar">The character used as separator between data points in a row. By default the tab character is used as separator.</param>
2020
/// <param name="hasHeader">Whether the file has a header.</param>
21-
/// <param name="allowSparse">Whether the file can contain numerical vectors in sparse format.</param>
22-
/// <param name="allowQuoting">Whether the file can contain column defined by a quoted string.</param>
2321
/// <param name="dataSample">The optional location of a data sample. The sample can be used to infer column names and number of slots in each column.</param>
22+
/// <param name="allowQuoting">Whether the file can contain columns defined by a quoted string.</param>
23+
/// <param name="trimWhitespace">Remove trailing whitespace from lines</param>
24+
/// <param name="allowSparse">Whether the file can contain numerical vectors in sparse format.</param>
2425
public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog,
2526
TextLoader.Column[] columns,
2627
char separatorChar = TextLoader.Defaults.Separator,
2728
bool hasHeader = TextLoader.Defaults.HasHeader,
28-
bool allowSparse = TextLoader.Defaults.AllowSparse,
29+
IMultiStreamSource dataSample = null,
2930
bool allowQuoting = TextLoader.Defaults.AllowQuoting,
30-
IMultiStreamSource dataSample = null)
31-
=> new TextLoader(CatalogUtils.GetEnvironment(catalog), columns, separatorChar, hasHeader, allowSparse, allowQuoting, dataSample);
31+
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace,
32+
bool allowSparse = TextLoader.Defaults.AllowSparse)
33+
{
34+
var options = new TextLoader.Options
35+
{
36+
Columns = columns,
37+
Separators = new[] { separatorChar },
38+
HasHeader = hasHeader,
39+
AllowQuoting = allowQuoting,
40+
TrimWhitespace = trimWhitespace,
41+
AllowSparse = allowSparse
42+
};
43+
44+
return new TextLoader(CatalogUtils.GetEnvironment(catalog), options: options, dataSample: dataSample);
45+
}
3246

3347
/// <summary>
3448
/// Create a text loader <see cref="TextLoader"/>.
@@ -47,79 +61,98 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog,
4761
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
4862
/// <param name="separatorChar">Column separator character. Default is '\t'</param>
4963
/// <param name="hasHeader">Whether the file contains a header.</param>
64+
/// <param name="dataSample">The optional location of a data sample. The sample can be used to infer column names and number of slots in each column.</param>
5065
/// <param name="allowQuoting">Whether the input may include quoted values,
5166
/// which can contain separator characters, colons,
5267
/// and distinguish empty values from missing values. When true, consecutive separators
5368
/// denote a missing value and an empty value is denoted by \"\".
5469
/// When false, consecutive separators denote an empty value.</param>
70+
/// <param name="trimWhitespace">Remove trailing whitespace from lines</param>
5571
/// <param name="allowSparse">Whether the input may include sparse representations for example,
5672
/// if one of the rows contains "5 2:6 4:3", that means there are 5 columns, all zero
5773
/// except for the 3rd and 5th columns, which have values 6 and 3.</param>
58-
/// <param name="trimWhitespace">Remove trailing whitespace from lines</param>
5974
public static TextLoader CreateTextLoader<TInput>(this DataOperationsCatalog catalog,
6075
char separatorChar = TextLoader.Defaults.Separator,
6176
bool hasHeader = TextLoader.Defaults.HasHeader,
77+
IMultiStreamSource dataSample = null,
6278
bool allowQuoting = TextLoader.Defaults.AllowQuoting,
63-
bool allowSparse = TextLoader.Defaults.AllowSparse,
64-
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace)
65-
=> TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting, allowSparse, trimWhitespace);
79+
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace,
80+
bool allowSparse = TextLoader.Defaults.AllowSparse)
81+
=> TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting,
82+
allowSparse, trimWhitespace, dataSample: dataSample);
6683

6784
/// <summary>
6885
/// Read a data view from a text file using <see cref="TextLoader"/>.
6986
/// </summary>
7087
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
88+
/// <param name="path">The path to the file.</param>
7189
/// <param name="columns">The columns of the schema.</param>
72-
/// <param name="hasHeader">Whether the file has a header.</param>
7390
/// <param name="separatorChar">The character used as separator between data points in a row. By default the tab character is used as separator.</param>
74-
/// <param name="path">The path to the file.</param>
91+
/// <param name="hasHeader">Whether the file has a header.</param>
92+
/// <param name="dataSample">The optional location of a data sample. The sample can be used to infer column names and number of slots in each column.</param>
93+
/// <param name="allowQuoting">Whether the file can contain columns defined by a quoted string.</param>
94+
/// <param name="trimWhitespace">Remove trailing whitespace from lines</param>
95+
/// <param name="allowSparse">Whether the file can contain numerical vectors in sparse format.</param>
7596
/// <returns>The data view.</returns>
7697
public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog,
7798
string path,
7899
TextLoader.Column[] columns,
79100
char separatorChar = TextLoader.Defaults.Separator,
80-
bool hasHeader = TextLoader.Defaults.HasHeader)
101+
bool hasHeader = TextLoader.Defaults.HasHeader,
102+
IMultiStreamSource dataSample = null,
103+
bool allowQuoting = TextLoader.Defaults.AllowQuoting,
104+
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace,
105+
bool allowSparse = TextLoader.Defaults.AllowSparse)
81106
{
82107
Contracts.CheckNonEmpty(path, nameof(path));
83108

84-
var env = catalog.GetEnvironment();
109+
var options = new TextLoader.Options
110+
{
111+
Columns = columns,
112+
Separators = new[] { separatorChar },
113+
HasHeader = hasHeader,
114+
AllowQuoting = allowQuoting,
115+
TrimWhitespace = trimWhitespace,
116+
AllowSparse = allowSparse
117+
};
85118

86-
// REVIEW: it is almost always a mistake to have a 'trainable' text loader here.
87-
// Therefore, we are going to disallow data sample.
88-
var reader = new TextLoader(env, columns, separatorChar, hasHeader, dataSample: null);
119+
var reader = new TextLoader(CatalogUtils.GetEnvironment(catalog), options: options, dataSample: dataSample);
89120
return reader.Read(new MultiFileSource(path));
90121
}
91122

92123
/// <summary>
93124
/// Read a data view from a text file using <see cref="TextLoader"/>.
94125
/// </summary>
95126
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
96-
/// <param name="hasHeader">Does the file contains header?</param>
127+
/// <param name="path">The path to the file.</param>
97128
/// <param name="separatorChar">Column separator character. Default is '\t'</param>
129+
/// <param name="hasHeader">Whether the file contains a header.</param>
130+
/// <param name="dataSample">The optional location of a data sample. The sample can be used to infer column names and number of slots in each column.</param>
98131
/// <param name="allowQuoting">Whether the input may include quoted values,
99132
/// which can contain separator characters, colons,
100133
/// and distinguish empty values from missing values. When true, consecutive separators
101134
/// denote a missing value and an empty value is denoted by \"\".
102135
/// When false, consecutive separators denote an empty value.</param>
136+
/// <param name="trimWhitespace">Remove trailing whitespace from lines</param>
103137
/// <param name="allowSparse">Whether the input may include sparse representations for example,
104138
/// if one of the rows contains "5 2:6 4:3", that means there are 5 columns, all zero
105139
/// except for the 3rd and 5th columns, which have values 6 and 3.</param>
106-
/// <param name="trimWhitespace">Remove trailing whitespace from lines</param>
107-
/// <param name="path">The path to the file.</param>
108140
/// <returns>The data view.</returns>
109141
public static IDataView ReadFromTextFile<TInput>(this DataOperationsCatalog catalog,
110142
string path,
111143
char separatorChar = TextLoader.Defaults.Separator,
112144
bool hasHeader = TextLoader.Defaults.HasHeader,
145+
IMultiStreamSource dataSample = null,
113146
bool allowQuoting = TextLoader.Defaults.AllowQuoting,
114-
bool allowSparse = TextLoader.Defaults.AllowSparse,
115-
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace)
147+
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace,
148+
bool allowSparse = TextLoader.Defaults.AllowSparse)
116149
{
117150
Contracts.CheckNonEmpty(path, nameof(path));
118151

119152
// REVIEW: it is almost always a mistake to have a 'trainable' text loader here.
120153
// Therefore the data sample is optional and defaults to null; it is only used when the caller explicitly supplies one.
121-
return TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting, allowSparse, trimWhitespace)
122-
.Read(new MultiFileSource(path));
154+
return TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar,
155+
allowQuoting, allowSparse, trimWhitespace, dataSample: dataSample).Read(new MultiFileSource(path));
123156
}
124157

125158
/// <summary>
@@ -128,14 +161,19 @@ public static IDataView ReadFromTextFile<TInput>(this DataOperationsCatalog cata
128161
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
129162
/// <param name="path">Specifies a file from which to read.</param>
130163
/// <param name="options">Defines the settings of the load operation.</param>
131-
public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog, string path, TextLoader.Options options = null)
164+
/// <param name="dataSample">The optional location of a data sample. The sample can be used to infer column names and number of slots in each column.</param>
165+
public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog, string path,
166+
TextLoader.Options options = null, IMultiStreamSource dataSample = null)
132167
{
133168
Contracts.CheckNonEmpty(path, nameof(path));
134169

135170
var env = catalog.GetEnvironment();
136171
var source = new MultiFileSource(path);
137172

138-
return new TextLoader(env, options, source).Read(source);
173+
if (dataSample == null)
174+
return new TextLoader(env, options, source).Read(source);
175+
else
176+
return new TextLoader(env, options, dataSample).Read(source);
139177
}
140178

141179
/// <summary>

src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -439,10 +439,19 @@ internal static IDataView GetKeyDataViewOrNull(IHostEnvironment env, IChannel ch
439439
"{0} should not be specified when default loader is " + nameof(TextLoader) + ". Ignoring {0}={1}",
440440
nameof(Options.TermsColumn), src);
441441
}
442-
keyData = new TextLoader(env,
443-
columns: new[] { new TextLoader.Column("Term", DataKind.String, 0) },
444-
dataSample: fileSource)
445-
.Read(fileSource);
442+
443+
// Create text loader.
444+
var options = new TextLoader.Options()
445+
{
446+
Columns = new[]
447+
{
448+
new TextLoader.Column("Term", DataKind.String, 0)
449+
}
450+
};
451+
var reader = new TextLoader(env, options: options, dataSample: fileSource);
452+
453+
keyData = reader.Read(fileSource);
454+
446455
src = "Term";
447456
// In this case they are relying on heuristics, so auto-loading in this case is most appropriate.
448457
autoConvert = true;

src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -737,13 +737,19 @@ private IDataLoader GetLoaderForStopwords(IChannel ch, string dataFile,
737737
{
738738
if (stopwordsCol == null)
739739
stopwordsCol = "Stopwords";
740-
dataLoader = new TextLoader(
741-
Host,
742-
columns: new[]
740+
741+
// Create text loader.
742+
var options = new TextLoader.Options()
743+
{
744+
Columns = new[]
743745
{
744746
new TextLoader.Column(stopwordsCol, DataKind.String, 0)
745747
},
746-
dataSample: fileSource).Read(fileSource) as IDataLoader;
748+
Separators = new[] { ',' },
749+
};
750+
var reader = new TextLoader(Host, options: options, dataSample: fileSource);
751+
752+
dataLoader = reader.Read(fileSource) as IDataLoader;
747753
}
748754
ch.AssertNonEmpty(stopwordsCol);
749755
}

0 commit comments

Comments
 (0)