Skip to content

Commit 091bddf

Browse files
Changed default NGram length from 1 to 2. (#5248)
* Made FastTree work better with sparse data: it is much more performant with sparse data than before, and slightly less performant if it's dense data. * Reverted FastTree changes, fixed incorrect test. * Added categorical column to wiki detox data.
1 parent bb13d62 commit 091bddf

File tree

9 files changed

+280
-271
lines changed

9 files changed

+280
-271
lines changed

src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ public class Options
7979

8080
public Options()
8181
{
82-
NgramLength = 1;
82+
NgramLength = 2;
8383
SkipLength = NgramExtractingEstimator.Defaults.SkipLength;
8484
UseAllLengths = NgramExtractingEstimator.Defaults.UseAllLengths;
8585
MaximumNgramsCount = new int[] { NgramExtractingEstimator.Defaults.MaximumNgramsCount };

test/BaselineOutput/Common/EntryPoints/core_manifest.json

+1
Original file line numberDiff line numberDiff line change
@@ -23696,6 +23696,7 @@
2369623696
"Default": {
2369723697
"Name": "NGram",
2369823698
"Settings": {
23699+
"NgramLength": 2,
2369923700
"MaxNumTerms": [
2370023701
10000000
2370123702
]

test/BaselineOutput/Common/Text/featurized.tsv

+7-7
Large diffs are not rendered by default.

test/Microsoft.ML.Tests/TrainerEstimators/CalibratorEstimators.cs

+3-3
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ CalibratorTestData GetCalibratorTestData()
103103
var transformer = pipeline.Fit(data);
104104
var scoredData = transformer.Transform(data);
105105
var scoredDataPreview = scoredData.Preview();
106-
Assert.True(scoredDataPreview.ColumnView.Length == 5);
106+
Assert.True(scoredDataPreview.ColumnView.Length == 6);
107107

108108
return new CalibratorTestData
109109
{
@@ -128,11 +128,11 @@ private void CheckValidCalibratedData(IDataView scoredData, ITransformer transfo
128128

129129
var calibratedData = transformer.Transform(scoredData).Preview();
130130

131-
Assert.True(calibratedData.ColumnView.Length == 6);
131+
Assert.True(calibratedData.ColumnView.Length == 7);
132132

133133
for (int i = 0; i < 10; i++)
134134
{
135-
var probability = calibratedData.RowView[i].Values[5];
135+
var probability = calibratedData.RowView[i].Values[6];
136136
Assert.InRange((float)probability.Value, 0, 1);
137137
}
138138
}

test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs

+3-3
Original file line numberDiff line numberDiff line change
@@ -101,14 +101,14 @@ public void TestLRWithStats()
101101

102102
Assert.NotNull(biasStats);
103103

104-
CompareNumbersWithTolerance(biasStats.StandardError, 0.25, digitsOfPrecision: 2);
105-
CompareNumbersWithTolerance(biasStats.ZScore, 7.97, digitsOfPrecision: 2);
104+
CompareNumbersWithTolerance(biasStats.StandardError, 0.24, digitsOfPrecision: 2);
105+
CompareNumbersWithTolerance(biasStats.ZScore, 8.32, digitsOfPrecision: 2);
106106

107107
var scoredData = transformer.Transform(dataView);
108108

109109
var coefficients = stats.GetWeightsCoefficientStatistics(100);
110110

111-
Assert.Equal(18, coefficients.Length);
111+
Assert.Equal(17, coefficients.Length);
112112

113113
foreach (var coefficient in coefficients)
114114
Assert.True(coefficient.StandardError < 1.0);

test/Microsoft.ML.Tests/TrainerEstimators/TrainerEstimators.cs

+4-2
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,8 @@ public void TestEstimatorLdSvmTrainer()
197197
Columns = new[]
198198
{
199199
new TextLoader.Column("Label", DataKind.Boolean, 0),
200-
new TextLoader.Column("SentimentText", DataKind.String, 1)
200+
new TextLoader.Column("SentimentText", DataKind.String, 1),
201+
new TextLoader.Column("LoggedIn", DataKind.Boolean, 2)
201202
}
202203
}).Load(GetDataPath(TestDatasets.Sentiment.trainFilename));
203204

@@ -214,7 +215,8 @@ public void TestEstimatorLdSvmTrainer()
214215
private (IEstimator<ITransformer>, IDataView) GetOneHotBinaryClassificationPipeline()
215216
{
216217
var (pipeline, data) = GetBinaryClassificationPipeline();
217-
var oneHotPipeline = pipeline.Append(ML.Transforms.Categorical.OneHotEncoding("Features"));
218+
var oneHotPipeline = pipeline.Append(ML.Transforms.Categorical.OneHotEncoding("LoggedIn"));
219+
oneHotPipeline.Append(ML.Transforms.Concatenate("Features", "Features", "LoggedIn"));
218220

219221
return (oneHotPipeline, data);
220222
}

test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs

+3-3
Original file line numberDiff line numberDiff line change
@@ -221,14 +221,14 @@ public void TextFeaturizerWithL2NormTest()
221221
var prediction = engine.Predict(data[0]);
222222
Assert.Equal(data[0].A, string.Join(" ", prediction.OutputTokens));
223223
var exp1 = 0.333333343f;
224-
var exp2 = 0.707106769f;
225-
var expected = new float[] { exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp2, exp2 };
224+
var exp2 = 0.577350259f;
225+
var expected = new float[] { exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp2, exp2, exp2 };
226226
Assert.Equal(expected, prediction.Features);
227227

228228
prediction = engine.Predict(data[1]);
229229
exp1 = 0.4472136f;
230230
Assert.Equal(data[1].A, string.Join(" ", prediction.OutputTokens));
231-
expected = new float[] { exp1, 0.0f, 0.0f, 0.0f, 0.0f, exp1, exp1, exp1, exp1, 0.0f, 1.0f };
231+
expected = new float[] { exp1, 0.0f, 0.0f, 0.0f, 0.0f, exp1, exp1, exp1, exp1, 0.0f, 0.0f, 1.0f };
232232
Assert.Equal(expected, prediction.Features);
233233
}
234234

test/data/wikipedia-detox-250-line-data-schema.txt

+7-1
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,11 @@
1010
"ItemKind": "TX",
1111
"Purpose": "TextFeature",
1212
"ColumnRangeSelector": "1"
13+
},
14+
{
15+
"SuggestedName": "LoggedIn",
16+
"ItemKind": "Bool",
17+
"Purpose": "Categorical",
18+
"ColumnRangeSelector": "2"
1319
}
14-
]
20+
]

0 commit comments

Comments
 (0)